From 4143b864678dbd3f44179c41970b532ba0f10500 Mon Sep 17 00:00:00 2001
From: Serene <33189705+Serene-Arc@users.noreply.github.com>
Date: Sat, 6 Feb 2021 18:35:50 +1000
Subject: [PATCH 001/276] Pep8 format (#184)

* Format file to be PEP8 compliant
* Remove unused imports
* Format file to PEP8
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Format file to PEP8
* Format file to PEP8
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Format file to PEP8
* Remove unused imports
* Format file to PEP8
* Remove unused imports
* Condense spacing
---
 script.py                             | 183 +++++++---------
 src/arguments.py                      | 113 ++++------
 src/config.py                         |  43 +---
 src/downloaders/Direct.py             |  12 +-
 src/downloaders/Erome.py              |  96 ++++----
 src/downloaders/Gallery.py            | 115 ----------
 src/downloaders/Gfycat.py             |  28 +--
 src/downloaders/Imgur.py              | 118 +++++-----
 src/downloaders/downloaderUtils.py    |  78 +++----
 src/downloaders/gallery.py            | 110 ++++++++++
 src/downloaders/gifDeliveryNetwork.py |  27 +--
 src/downloaders/redgifs.py            |  25 +--
 src/downloaders/selfPost.py           |  17 +-
 src/downloaders/vreddit.py            |  47 ++--
 src/downloaders/youtube.py            |  19 +-
 src/errors.py                         |   3 -
 src/jsonHelper.py                     |  30 ++-
 src/parser.py                         | 251 +++++++++++----------
 src/programMode.py                    | 189 +++++++---------
 src/reddit.py                         |  36 +--
 src/searcher.py                       | 303 ++++++++++++--------------
 src/utils.py                          |  67 +++---
 22 files changed, 836 insertions(+), 1074 deletions(-)
 delete mode 100644 src/downloaders/Gallery.py
 create mode 100644 src/downloaders/gallery.py

diff --git a/script.py b/script.py
index 6ace7f8..87800b9 100644
--- a/script.py
+++ b/script.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-This program downloads imgur, gfycat and direct image and video links of 
+This program downloads imgur, gfycat and direct image and video links of
 saved posts from a reddit account. It is written in Python 3.
""" import logging @@ -12,26 +12,28 @@ from io import StringIO from pathlib import Path from prawcore.exceptions import InsufficientScope +from src.arguments import Arguments +from src.config import Config from src.downloaders.Direct import Direct from src.downloaders.Erome import Erome +from src.downloaders.gallery import Gallery from src.downloaders.Gfycat import Gfycat +from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork from src.downloaders.Imgur import Imgur from src.downloaders.Gallery import Gallery from src.downloaders.redgifs import Redgifs from src.downloaders.selfPost import SelfPost from src.downloaders.vreddit import VReddit from src.downloaders.youtube import Youtube -from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork -from src.errors import ImgurLimitError, FileAlreadyExistsError, ImgurLoginError, NotADownloadableLinkError, NoSuitablePost, InvalidJSONFile, FailedToDownload, TypeInSkip, DomainInSkip, AlbumNotDownloadedCompletely, full_exc_info -from src.searcher import getPosts -from src.utils import (GLOBAL, createLogFile, nameCorrector, - printToFile) +from src.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError, + ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError, + TypeInSkip, full_exc_info) from src.jsonHelper import JsonFile -from src.config import Config -from src.arguments import Arguments from src.programMode import ProgramMode from src.reddit import Reddit +from src.searcher import getPosts from src.store import Store +from src.utils import GLOBAL, createLogFile, nameCorrector, printToFile from time import sleep @@ -42,12 +44,12 @@ __maintainer__ = "Ali Parlakci" __email__ = "parlakciali@gmail.com" -def postFromLog(fileName): +def postFromLog(filename): """Analyze a log file and return a list of dictionaries containing submissions """ - if Path.is_file(Path(fileName)): - content = JsonFile(fileName).read() + if Path.is_file(Path(filename)): + content = JsonFile(filename).read() else: print("File not found") sys.exit() @@ -60,23 +62,22 @@ def postFromLog(fileName): posts = [] for post in content: - if content[post][-1]['TYPE'] is not None: + if not content[post][-1]['TYPE'] is None: posts.append(content[post][-1]) return posts -def isPostExists(POST, directory): +def isPostExists(post, directory): """Figure out a file's name and checks if the file already exists""" - filename = GLOBAL.config['filename'].format(**POST) + filename = GLOBAL.config['filename'].format(**post) - possibleExtensions = [".jpg", ".png", ".mp4", - ".gif", ".webm", ".md", ".mkv", ".flv"] + possible_extensions = [".jpg", ".png", ".mp4", ".gif", ".webm", ".md", ".mkv", ".flv"] - for extension in possibleExtensions: + for extension in possible_extensions: - path = directory / Path(filename+extension) + path = directory / Path(filename + extension) if path.exists(): return True @@ -84,58 +85,57 @@ def isPostExists(POST, directory): return False -def downloadPost(SUBMISSION, directory): + +def downloadPost(submission, directory): downloaders = { - "imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":SelfPost, - "redgifs":Redgifs, "gifdeliverynetwork": GifDeliveryNetwork, + "imgur": Imgur, "gfycat": Gfycat, "erome": Erome, "direct": Direct, "self": SelfPost, + "redgifs": Redgifs, "gifdeliverynetwork": GifDeliveryNetwork, "v.redd.it": VReddit, "youtube": Youtube, "gallery": Gallery } print() - if SUBMISSION['TYPE'] in downloaders: - 
downloaders[SUBMISSION['TYPE']](directory, SUBMISSION) + if submission['TYPE'] in downloaders: + downloaders[submission['TYPE']](directory, submission) else: raise NoSuitablePost + def download(submissions): """Analyze list of submissions and call the right function to download each one, catch errors, update the log files """ - downloadedCount = 0 + downloaded_count = 0 duplicates = 0 - FAILED_FILE = createLogFile("FAILED") + failed_file = createLogFile("FAILED") if GLOBAL.arguments.unsave: reddit = Reddit(GLOBAL.config['credentials']['reddit']).begin() - subsLenght = len(submissions) + subs_length = len(submissions) for i in range(len(submissions)): - print(f"\n({i+1}/{subsLenght})", end=" — ") + print(f"\n({i+1}/{subs_length})", end=" — ") print(submissions[i]['POSTID'], f"r/{submissions[i]['SUBREDDIT']}", f"u/{submissions[i]['REDDITOR']}", submissions[i]['FLAIR'] if submissions[i]['FLAIR'] else "", sep=" — ", end="") - print(f" – {submissions[i]['TYPE'].upper()}", end="", noPrint=True) + print(f" – {submissions[i]['TYPE'].upper()}", end="", no_print=True) directory = GLOBAL.directory / \ GLOBAL.config["folderpath"].format(**submissions[i]) details = { **submissions[i], - **{ - "TITLE": nameCorrector( - submissions[i]['TITLE'], - reference=str(directory) - + GLOBAL.config['filename'].format(**submissions[i]) - + ".ext" - ) - } + **{"TITLE": nameCorrector( + submissions[i]['TITLE'], + reference=str(directory) + + GLOBAL.config['filename'].format(**submissions[i]) + + ".ext")} } filename = GLOBAL.config['filename'].format(**details) @@ -164,11 +164,7 @@ def download(submissions): reddit = Reddit().begin() reddit.submission(id=details['POSTID']).unsave() - if GLOBAL.arguments.download_delay: - print(f"Delaying next download for {GLOBAL.arguments.download_delay} seconds...") - sleep(GLOBAL.arguments.download_delay) - - downloadedCount += 1 + downloaded_count += 1 except FileAlreadyExistsError: print("It already exists") @@ -176,33 +172,18 @@ def download(submissions): duplicates += 1 except ImgurLoginError: - print( - "Imgur login failed. \nQuitting the program " - "as unexpected errors might occur." - ) + print("Imgur login failed. 
\nQuitting the program as unexpected errors might occur.") sys.exit() except ImgurLimitError as exception: - FAILED_FILE.add({int(i+1): [ - "{class_name}: {info}".format( - class_name=exception.__class__.__name__, info=str( - exception) - ), - details + failed_file.add({int(i + 1): [ + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), details ]}) except NotADownloadableLinkError as exception: - print( - "{class_name}: {info}".format( - class_name=exception.__class__.__name__, info=str( - exception) - ) - ) - FAILED_FILE.add({int(i+1): [ - "{class_name}: {info}".format( - class_name=exception.__class__.__name__, info=str( - exception) - ), + print("{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))) + failed_file.add({int(i + 1): [ + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), submissions[i] ]}) @@ -223,60 +204,48 @@ def download(submissions): print("Failed to download the posts, skipping...") except AlbumNotDownloadedCompletely: print("Album did not downloaded completely.") - FAILED_FILE.add({int(i+1): [ - "{class_name}: {info}".format( - class_name=exc.__class__.__name__, info=str(exc) - ), + failed_file.add({int(i + 1): [ + "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), submissions[i] ]}) except Exception as exc: - print( - "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exc.__class__.__name__, info=str(exc) - ) + print("{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( + class_name=exc.__class__.__name__, info=str(exc)) ) - logging.error(sys.exc_info()[0].__name__, - exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue(), noPrint=True) + logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) + print(GLOBAL.log_stream.getvalue(), no_print=True) - FAILED_FILE.add({int(i+1): [ - "{class_name}: {info}".format( - class_name=exc.__class__.__name__, info=str(exc) - ), + failed_file.add({int(i + 1): [ + "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), submissions[i] ]}) if duplicates: - print(f"\nThere {'were' if duplicates > 1 else 'was'} " - f"{duplicates} duplicate{'s' if duplicates > 1 else ''}") + print(f"\nThere {'were' if duplicates > 1 else 'was'} {duplicates} duplicate{'s' if duplicates > 1 else ''}") - if downloadedCount: - print(f"Total of {downloadedCount} " - f"link{'s' if downloadedCount > 1 else ''} downloaded!") + if downloaded_count == 0: + print("Nothing is downloaded :(") else: - print("Nothing is downloaded :(") + print(f"Total of {downloaded_count} link{'s' if downloaded_count > 1 else ''} downloaded!") def printLogo(): + VanillaPrint(f"\nBulk Downloader for Reddit v{__version__}\n" + f"Written by Ali PARLAKCI – parlakciali@gmail.com\n\n" + f"https://github.com/aliparlakci/bulk-downloader-for-reddit/\n" + ) - VanillaPrint( - f"\nBulk Downloader for Reddit v{__version__}\n" - f"Written by Ali PARLAKCI – parlakciali@gmail.com\n\n" - f"https://github.com/aliparlakci/bulk-downloader-for-reddit/\n" - ) def main(): - - if not Path(GLOBAL.defaultConfigDirectory).is_dir(): - os.makedirs(GLOBAL.defaultConfigDirectory) - if Path("config.json").exists(): GLOBAL.configDirectory = Path("config.json") else: - GLOBAL.configDirectory = GLOBAL.defaultConfigDirectory / "config.json" + if not Path(GLOBAL.defaultConfigDirectory).is_dir(): + os.makedirs(GLOBAL.defaultConfigDirectory) + 
GLOBAL.configDirectory = GLOBAL.defaultConfigDirectory / "config.json"
     try:
         GLOBAL.config = Config(GLOBAL.configDirectory).generate()
     except InvalidJSONFile as exception:
@@ -307,6 +276,7 @@ def main():
 
     if arguments.use_local_config:
         JsonFile("config.json").add(GLOBAL.config)
+        sys.exit()
 
     if arguments.directory:
         GLOBAL.directory = Path(arguments.directory.strip())
@@ -322,21 +292,20 @@ def main():
     GLOBAL.downloadedPosts = Store()
     printLogo()
 
-    print("\n", " ".join(sys.argv), "\n", noPrint=True)
+    print("\n", " ".join(sys.argv), "\n", no_print=True)
 
     if arguments.log is not None:
-        logDir = Path(arguments.log)
-        download(postFromLog(logDir))
+        log_dir = Path(arguments.log)
+        download(postFromLog(log_dir))
         sys.exit()
 
-    programMode = ProgramMode(arguments).generate()
+    program_mode = ProgramMode(arguments).generate()
 
     try:
-        posts = getPosts(programMode)
+        posts = getPosts(program_mode)
     except Exception as exc:
-        logging.error(sys.exc_info()[0].__name__,
-                      exc_info=full_exc_info(sys.exc_info()))
-        print(GLOBAL.log_stream.getvalue(), noPrint=True)
+        logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info()))
+        print(GLOBAL.log_stream.getvalue(), no_print=True)
         print(exc)
         sys.exit()
 
@@ -358,10 +327,7 @@ if __name__ == "__main__":
     try:
         VanillaPrint = print
        print = printToFile
-        GLOBAL.RUN_TIME = str(time.strftime(
-            "%d-%m-%Y_%H-%M-%S",
-            time.localtime(time.time())
-        ))
+        GLOBAL.RUN_TIME = str(time.strftime("%d-%m-%Y_%H-%M-%S", time.localtime(time.time())))
         main()
 
     except KeyboardInterrupt:
@@ -371,9 +337,8 @@ if __name__ == "__main__":
     except Exception as exception:
         if GLOBAL.directory is None:
             GLOBAL.directory = Path("..\\")
-        logging.error(sys.exc_info()[0].__name__,
-                      exc_info=full_exc_info(sys.exc_info()))
+        logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info()))
         print(GLOBAL.log_stream.getvalue())
 
-        if not GLOBAL.arguments.quit: input("\nPress enter to quit\n")
-
+        if not GLOBAL.arguments.quit:
+            input("\nPress enter to quit\n")
diff --git a/src/arguments.py b/src/arguments.py
index 8bdcf9d..cbf72c7 100644
--- a/src/arguments.py
+++ b/src/arguments.py
@@ -10,12 +10,9 @@ class Arguments:
         arguments = []
 
         parser = argparse.ArgumentParser(allow_abbrev=False,
-                                         description="This program downloads "
-                                                     "media from reddit "
-                                                     "posts")
+                                         description="This program downloads media from reddit posts")
 
         parser.add_argument("--directory", "-d",
-                            help="Specifies the directory where posts will be "
-                                 "downloaded to",
+                            help="Specifies the directory where posts will be downloaded to",
                             metavar="DIRECTORY")
 
         parser.add_argument("--verbose", "-v",
@@ -50,31 +47,26 @@ class Arguments:
                             help="Gets upvoted posts of --user")
 
         parser.add_argument("--log",
-                            help="Takes a log file which created by itself "
-                                 "(json files), reads posts and tries downloadin"
-                                 "g them again.",
+                            help="Takes a log file which was created by the program (json files), reads posts and "
+                                 "tries downloading them again.",
                             # type=argparse.FileType('r'),
                             metavar="LOG FILE")
 
-        parser.add_argument(
-            "--subreddit",
-            nargs="+",
-            help="Triggers subreddit mode and takes subreddit's "
-                 "name without r/. use \"frontpage\" for frontpage",
-            metavar="SUBREDDIT",
-            type=str)
+        parser.add_argument("--subreddit",
+                            nargs="+",
+                            help="Triggers subreddit mode and takes subreddit's name without r/. 
use \"frontpage\" " + "for frontpage", + metavar="SUBREDDIT", + type=str) parser.add_argument("--multireddit", - help="Triggers multireddit mode and takes " - "multireddit's name without m/", + help="Triggers multireddit mode and takes multireddit's name without m", metavar="MULTIREDDIT", type=str) parser.add_argument("--user", - help="reddit username if needed. use \"me\" for " - "current user", - required="--multireddit" in sys.argv or - "--submitted" in sys.argv, + help="reddit username if needed. use \"me\" for current user", + required="--multireddit" in sys.argv or "--submitted" in sys.argv, metavar="redditor", type=str) @@ -85,12 +77,8 @@ class Arguments: type=str) parser.add_argument("--sort", - help="Either hot, top, new, controversial, rising " - "or relevance default: hot", - choices=[ - "hot", "top", "new", "controversial", "rising", - "relevance" - ], + help="Either hot, top, new, controversial, rising or relevance default: hot", + choices=["hot", "top", "new", "controversial", "rising", "relevance"], metavar="SORT TYPE", type=str) @@ -100,10 +88,8 @@ class Arguments: type=int) parser.add_argument("--time", - help="Either hour, day, week, month, year or all." - " default: all", - choices=["all", "hour", "day", - "week", "month", "year"], + help="Either hour, day, week, month, year or all. default: all", + choices=["all", "hour", "day", "week", "month", "year"], metavar="TIME_LIMIT", type=str) @@ -130,47 +116,38 @@ class Arguments: help="Set custom filename", ) - parser.add_argument( - "--set-default-directory", - action="store_true", - help="Set a default directory to be used in case no directory is given", - ) + parser.add_argument("--set-default-directory", + action="store_true", + help="Set a default directory to be used in case no directory is given", + ) - parser.add_argument( - "--set-default-options", - action="store_true", - help="Set default options to use everytime program runs", - ) + parser.add_argument("--set-default-options", + action="store_true", + help="Set default options to use everytime program runs", + ) - parser.add_argument( - "--use-local-config", - action="store_true", - help="Creates a config file in the program's directory and uses it. Useful for having multiple configs", - ) + parser.add_argument("--use-local-config", + action="store_true", + help="Creates a config file in the program's directory" + " and uses it. 
Useful for having multiple configs", + ) - parser.add_argument( - "--no-dupes", - action="store_true", - help="Do not download duplicate posts on different subreddits", - ) + parser.add_argument("--no-dupes", + action="store_true", + help="Do not download duplicate posts on different subreddits", + ) - parser.add_argument( - "--downloaded-posts", - help="Use a hash file to keep track of downloaded files", - type=str) + parser.add_argument("--downloaded-posts", + help="Use a hash file to keep track of downloaded files", + type=str + ) - parser.add_argument( - "--no-download", - action="store_true", - help="Just saved posts into a the POSTS.json file without downloading") + parser.add_argument("--no-download", + action="store_true", + help="Just saved posts into a the POSTS.json file without downloading" + ) - parser.add_argument( - "--download-delay", - metavar="DELAY", - type=int, - help="Amount, in seconds, to delay before beginning the next item in the download queue") - - - if arguments == []: + if not arguments: return parser.parse_args() - return parser.parse_args(arguments) + else: + return parser.parse_args(arguments) diff --git a/src/config.py b/src/config.py index 662f91a..3f9f17a 100644 --- a/src/config.py +++ b/src/config.py @@ -1,10 +1,9 @@ - from src.reddit import Reddit from src.jsonHelper import JsonFile from src.utils import nameCorrector -class Config(): +class Config: def __init__(self, filename): self.filename = filename @@ -35,23 +34,17 @@ For example: {FLAIR}_{SUBREDDIT}_{REDDITOR} Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"]) filename = nameCorrector(input(">> ").upper()) - self.file.add({ - "filename": filename - }) + self.file.add({"filename": filename}) def _readCustomFileName(self): content = self.file.read() if "filename" not in content: - self.file.add({ - "filename": "{REDDITOR}_{TITLE}_{POSTID}" - }) + self.file.add({"filename": "{REDDITOR}_{TITLE}_{POSTID}"}) content = self.file.read() - if not "{POSTID}" in content["filename"]: - self.file.add({ - "filename": content["filename"] + "_{POSTID}" - }) + if "{POSTID}" not in content["filename"]: + self.file.add({"filename": content["filename"] + "_{POSTID}"}) def setCustomFolderPath(self): print(""" @@ -68,16 +61,12 @@ Existing folder structure""", None if "folderpath" not in self.file.read() else folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper()) - self.file.add({ - "folderpath": folderpath - }) + self.file.add({"folderpath": folderpath}) def _readCustomFolderPath(self, path=None): content = self.file.read() if "folderpath" not in content: - self.file.add({ - "folderpath": "{SUBREDDIT}" - }) + self.file.add({"folderpath": "{SUBREDDIT}"}) def setDefaultOptions(self): print(""" @@ -89,33 +78,25 @@ Existing default options:""", None if "options" not in self.file.read() else sel options = input(">> ").strip("") - self.file.add({ - "options": options - }) + self.file.add({"options": options}) def _readDefaultOptions(self, path=None): content = self.file.read() if "options" not in content: - self.file.add({ - "options": "" - }) + self.file.add({"options": ""}) def _validateCredentials(self): """Read credentials from config.json file""" - try: content = self.file.read()["credentials"] except BaseException: - self.file.add({ - "credentials": {} - }) + self.file.add({"credentials": {}}) content = self.file.read()["credentials"] if "reddit" in content and len(content["reddit"]) != 0: pass else: Reddit().begin() - print() def 
setDefaultDirectory(self): @@ -125,6 +106,4 @@ For example: D:/archive/BDFR_{time} """) print("Current default directory:", self.file.read()[ "default_directory"] if "default_directory" in self.file.read() else "") - self.file.add({ - "default_directory": input(">> ") - }) + self.file.add({"default_directory": input(">> ")}) diff --git a/src/downloaders/Direct.py b/src/downloaders/Direct.py index e22c8a3..44bbe61 100644 --- a/src/downloaders/Direct.py +++ b/src/downloaders/Direct.py @@ -1,16 +1,16 @@ import os -from src.downloaders.downloaderUtils import getFile, getExtension +from src.downloaders.downloaderUtils import getExtension, getFile from src.utils import GLOBAL class Direct: - def __init__(self, directory, POST): - POST['EXTENSION'] = getExtension(POST['CONTENTURL']) + def __init__(self, directory, post): + post['EXTENSION'] = getExtension(post['CONTENTURL']) if not os.path.exists(directory): os.makedirs(directory) - filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"] - shortFilename = POST['POSTID'] + POST['EXTENSION'] + filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] + short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, shortFilename, directory, POST['CONTENTURL']) + getFile(filename, short_filename, directory, post['CONTENTURL']) diff --git a/src/downloaders/Erome.py b/src/downloaders/Erome.py index 4c7ef5c..9283131 100644 --- a/src/downloaders/Erome.py +++ b/src/downloaders/Erome.py @@ -1,12 +1,10 @@ import os +import urllib.error import urllib.request from html.parser import HTMLParser -from src.downloaders.downloaderUtils import getFile -from src.downloaders.downloaderUtils import getExtension - -from src.errors import (AlbumNotDownloadedCompletely, - NotADownloadableLinkError, FileAlreadyExistsError) +from src.downloaders.downloaderUtils import getExtension, getFile +from src.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from src.utils import GLOBAL from src.utils import printToFile as print @@ -14,88 +12,77 @@ from src.utils import printToFile as print class Erome: def __init__(self, directory, post): try: - IMAGES = self.getLinks(post['CONTENTURL']) + images = self.getLinks(post['CONTENTURL']) except urllib.error.HTTPError: raise NotADownloadableLinkError("Not a downloadable link") - imagesLenght = len(IMAGES) - howManyDownloaded = imagesLenght + images_length = len(images) + how_many_downloaded = images_length duplicates = 0 - if imagesLenght == 1: - - extension = getExtension(IMAGES[0]) + if images_length == 1: + extension = getExtension(images[0]) """Filenames are declared here""" + filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] + short_filename = post['POSTID'] + extension - filename = GLOBAL.config['filename'].format( - **post) + post["EXTENSION"] - shortFilename = post['POSTID'] + extension + image_url = images[0] + if 'https://' not in image_url or 'http://' not in image_url: + image_url = "https://" + image_url - imageURL = IMAGES[0] - if 'https://' not in imageURL or 'http://' not in imageURL: - imageURL = "https://" + imageURL - - getFile(filename, shortFilename, directory, imageURL) + getFile(filename, short_filename, directory, image_url) else: filename = GLOBAL.config['filename'].format(**post) - print(filename) - folderDir = directory / filename + folder_dir = directory / filename try: - if not os.path.exists(folderDir): - os.makedirs(folderDir) + if not os.path.exists(folder_dir): + os.makedirs(folder_dir) except 
FileNotFoundError: - folderDir = directory / post['POSTID'] - os.makedirs(folderDir) + folder_dir = directory / post['POSTID'] + os.makedirs(folder_dir) - for i in range(imagesLenght): - - extension = getExtension(IMAGES[i]) + for i in range(images_length): + extension = getExtension(images[i]) filename = str(i + 1) + extension - imageURL = IMAGES[i] - if 'https://' not in imageURL and 'http://' not in imageURL: - imageURL = "https://" + imageURL + image_url = images[i] + if 'https://' not in image_url and 'http://' not in image_url: + image_url = "https://" + image_url - print(" ({}/{})".format(i + 1, imagesLenght)) + print(" ({}/{})".format(i + 1, images_length)) print(" {}".format(filename)) try: - getFile(filename, filename, folderDir, imageURL, indent=2) + getFile(filename, filename, folder_dir, image_url, indent=2) print() except FileAlreadyExistsError: print(" The file already exists" + " " * 10, end="\n\n") duplicates += 1 - howManyDownloaded -= 1 + how_many_downloaded -= 1 except Exception as exception: # raise exception print("\n Could not get the file") print( " " - + "{class_name}: {info}".format( - class_name=exception.__class__.__name__, - info=str(exception) - ) + + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)) + "\n" ) - howManyDownloaded -= 1 + how_many_downloaded -= 1 - if duplicates == imagesLenght: + if duplicates == images_length: raise FileAlreadyExistsError - if howManyDownloaded + duplicates < imagesLenght: - raise AlbumNotDownloadedCompletely( - "Album Not Downloaded Completely" - ) - - def getLinks(self, url, lineNumber=129): + elif how_many_downloaded + duplicates < images_length: + raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + def getLinks(self, url): content = [] - lineNumber = None + line_number = None class EromeParser(HTMLParser): tag = None @@ -103,22 +90,22 @@ class Erome: def handle_starttag(self, tag, attrs): self.tag = {tag: {attr[0]: attr[1] for attr in attrs}} - pageSource = (urllib.request.urlopen(url).read().decode().split('\n')) + page_source = (urllib.request.urlopen(url).read().decode().split('\n')) """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS""" - for i in range(len(pageSource)): + for i in range(len(page_source)): obj = EromeParser() - obj.feed(pageSource[i]) + obj.feed(page_source[i]) tag = obj.tag if tag is not None: if "div" in tag: if "id" in tag["div"]: if tag["div"]["id"] == "album": - lineNumber = i + line_number = i break - for line in pageSource[lineNumber:]: + for line in page_source[line_number:]: obj = EromeParser() obj.feed(line) tag = obj.tag @@ -130,7 +117,4 @@ class Erome: elif "source" in tag: content.append(tag["source"]["src"]) - return [ - link for link in content - if link.endswith("_480p.mp4") or not link.endswith(".mp4") - ] + return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")] diff --git a/src/downloaders/Gallery.py b/src/downloaders/Gallery.py deleted file mode 100644 index cef73f1..0000000 --- a/src/downloaders/Gallery.py +++ /dev/null @@ -1,115 +0,0 @@ -import os -import json -import urllib -import requests - -from src.utils import GLOBAL -from src.utils import printToFile as print -from src.downloaders.downloaderUtils import getFile -from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, NotADownloadableLinkError, TypeInSkip - - -class Gallery: - def __init__(self, directory, post): - - links = post['CONTENTURL'] - - images = {} - count 
= 0
-        for link in links:
-            path = urllib.parse.urlparse(link).path
-            base = os.path.basename(path)
-            name = os.path.splitext(base)[0]
-            images[count] = {'id': name, 'url': link}
-            count = count + 1
-
-        self.directory = directory
-        self.post = post
-
-        self.downloadAlbum(images, count)
-
-    @staticmethod
-    def getData(link):
-
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
-        }
-        res = requests.get(link, headers=headers)
-        if res.status_code != 200:
-            raise ImageNotFound(
-                f"Server responded with {res.status_code} to {link}")
-        pageSource = res.text
-
-        STARTING_STRING = "_r = {"
-        ENDING_STRING = "</script>"
-
-        STARTING_STRING_LENGHT = len(STARTING_STRING)
-        try:
-            startIndex = pageSource.index(
-                STARTING_STRING) + STARTING_STRING_LENGHT
-            endIndex = pageSource.index(ENDING_STRING, startIndex)
-        except ValueError:
-            raise NotADownloadableLinkError(
-                f"Could not read the page source on {link}")
-
-        data = json.loads(pageSource[startIndex - 1:endIndex + 1].strip()[:-1])
-        return data
-
-    def downloadAlbum(self, images, count):
-        folderName = GLOBAL.config['filename'].format(**self.post)
-        folderDir = self.directory / folderName
-
-        howManyDownloaded = 0
-        duplicates = 0
-
-        try:
-            if not os.path.exists(folderDir):
-                os.makedirs(folderDir)
-        except FileNotFoundError:
-            folderDir = self.directory / self.post['POSTID']
-            os.makedirs(folderDir)
-
-        print(folderName)
-
-        for i in range(count):
-            path = urllib.parse.urlparse(images[i]['url']).path
-            extension = os.path.splitext(path)[1]
-
-            filename = "_".join([
-                str(i + 1), images[i]['id']
-            ]) + extension
-            shortFilename = str(i + 1) + "_" + images[i]['id']
-
-            print("\n  ({}/{})".format(i + 1, count))
-
-            try:
-                getFile(filename, shortFilename, folderDir,
-                        images[i]['url'], indent=2)
-                howManyDownloaded += 1
-                print()
-
-            except FileAlreadyExistsError:
-                print("  The file already exists" + " " * 10, end="\n\n")
-                duplicates += 1
-
-            except TypeInSkip:
-                print("  Skipping...")
-                howManyDownloaded += 1
-
-            except Exception as exception:
-                print("\n  Could not get the file")
-                print(
-                    "  " +
-                    "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
-                        class_name=exception.__class__.__name__,
-                        info=str(exception)) +
-                    "\n")
-                print(GLOBAL.log_stream.getvalue(), noPrint=True)
-
-        if duplicates == count:
-            raise FileAlreadyExistsError
-        if howManyDownloaded + duplicates < count:
-            raise AlbumNotDownloadedCompletely(
-                "Album Not Downloaded Completely"
-            )
diff --git a/src/downloaders/Gfycat.py b/src/downloaders/Gfycat.py
index 6366329..3cea2c2 100644
--- a/src/downloaders/Gfycat.py
+++ b/src/downloaders/Gfycat.py
@@ -1,37 +1,38 @@
 import json
 import os
 import urllib.request
+
 from bs4 import BeautifulSoup
 
-from src.downloaders.downloaderUtils import getFile, getExtension
-from src.errors import (NotADownloadableLinkError)
-from src.utils import GLOBAL
+from src.downloaders.downloaderUtils import getExtension, getFile
 from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork
+from src.errors import NotADownloadableLinkError
+from src.utils import GLOBAL
+
 
 class Gfycat:
-    def __init__(self, directory, POST):
+    def __init__(self, directory, post):
         try:
-            POST['MEDIAURL'] = self.getLink(POST['CONTENTURL'])
+            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
         except IndexError:
             raise NotADownloadableLinkError("Could not 
read the page source") - POST['EXTENSION'] = getExtension(POST['MEDIAURL']) + post['EXTENSION'] = getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) - filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"] - shortFilename = POST['POSTID'] + POST['EXTENSION'] + filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] + short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, shortFilename, directory, POST['MEDIAURL']) + getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url): """Extract direct link to the video from page's source and return it """ - if '.webm' in url or '.mp4' in url or '.gif' in url: return url @@ -40,11 +41,10 @@ class Gfycat: url = "https://gfycat.com/" + url.split('/')[-1] - pageSource = (urllib.request.urlopen(url).read().decode()) + page_source = (urllib.request.urlopen(url).read().decode()) - soup = BeautifulSoup(pageSource, "html.parser") - attributes = {"data-react-helmet": "true", - "type": "application/ld+json"} + soup = BeautifulSoup(page_source, "html.parser") + attributes = {"data-react-helmet": "true", "type": "application/ld+json"} content = soup.find("script", attrs=attributes) if content is None: diff --git a/src/downloaders/Imgur.py b/src/downloaders/Imgur.py index 5a38cde..239fc2d 100644 --- a/src/downloaders/Imgur.py +++ b/src/downloaders/Imgur.py @@ -2,19 +2,19 @@ import json import os import requests -from src.utils import GLOBAL, nameCorrector -from src.utils import printToFile as print from src.downloaders.Direct import Direct from src.downloaders.downloaderUtils import getFile -from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, ExtensionError, NotADownloadableLinkError, TypeInSkip +from src.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, + NotADownloadableLinkError, TypeInSkip) +from src.utils import GLOBAL, nameCorrector +from src.utils import printToFile as print class Imgur: - IMGUR_IMAGE_DOMAIN = "https://i.imgur.com/" + imgur_image_domain = "https://i.imgur.com/" def __init__(self, directory, post): - link = post['CONTENTURL'] if link.endswith(".gifv"): @@ -22,53 +22,49 @@ class Imgur: Direct(directory, {**post, 'CONTENTURL': link}) return None - self.rawData = self.getData(link) + self.raw_data = self.getData(link) self.directory = directory self.post = post if self.isAlbum: - if self.rawData["album_images"]["count"] != 1: - self.downloadAlbum(self.rawData["album_images"]) + if self.raw_data["album_images"]["count"] != 1: + self.downloadAlbum(self.raw_data["album_images"]) else: - self.download(self.rawData["album_images"]["images"][0]) + self.download(self.raw_data["album_images"]["images"][0]) else: - self.download(self.rawData) + self.download(self.raw_data) def downloadAlbum(self, images): - folderName = GLOBAL.config['filename'].format(**self.post) - folderDir = self.directory / folderName + folder_name = GLOBAL.config['filename'].format(**self.post) + folder_dir = self.directory / folder_name - imagesLenght = images["count"] - howManyDownloaded = 0 + images_length = images["count"] + how_many_downloaded = 0 duplicates = 0 try: - if not os.path.exists(folderDir): - os.makedirs(folderDir) + if not os.path.exists(folder_dir): + os.makedirs(folder_dir) except FileNotFoundError: - folderDir = self.directory / self.post['POSTID'] - os.makedirs(folderDir) + folder_dir = self.directory / self.post['POSTID'] + 
os.makedirs(folder_dir)
 
-        print(folderName)
-
-        for i in range(imagesLenght):
+        print(folder_name)
 
+        for i in range(images_length):
             extension = self.validateExtension(images["images"][i]["ext"])
-
-            imageURL = self.IMGUR_IMAGE_DOMAIN + \
-                images["images"][i]["hash"] + extension
-
+            image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
             filename = "_".join([str(i + 1),
                                  nameCorrector(images["images"][i]['title']),
                                  images["images"][i]['hash']]) + extension
-            shortFilename = str(i + 1) + "_" + images["images"][i]['hash']
+            short_filename = str(i + 1) + "_" + images["images"][i]['hash']
 
-            print("\n  ({}/{})".format(i + 1, imagesLenght))
+            print("\n  ({}/{})".format(i + 1, images_length))
 
             try:
-                getFile(filename, shortFilename, folderDir, imageURL, indent=2)
-                howManyDownloaded += 1
+                getFile(filename, short_filename, folder_dir, image_url, indent=2)
+                how_many_downloaded += 1
                 print()
 
             except FileAlreadyExistsError:
@@ -77,7 +73,7 @@ class Imgur:
 
             except TypeInSkip:
                 print("  Skipping...")
-                howManyDownloaded += 1
+                how_many_downloaded += 1
 
             except Exception as exception:
                 print("\n  Could not get the file")
@@ -85,69 +81,65 @@ class Imgur:
                     "  " +
                     "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
                         class_name=exception.__class__.__name__,
-                        info=str(exception)) +
-                    "\n")
-                print(GLOBAL.log_stream.getvalue(), noPrint=True)
+                        info=str(exception)
+                    )
+                    + "\n"
+                )
+                print(GLOBAL.log_stream.getvalue(), no_print=True)
 
-        if duplicates == imagesLenght:
+        if duplicates == images_length:
             raise FileAlreadyExistsError
-        if howManyDownloaded + duplicates < imagesLenght:
-            raise AlbumNotDownloadedCompletely(
-                "Album Not Downloaded Completely"
-            )
+        elif how_many_downloaded + duplicates < images_length:
+            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
 
     def download(self, image):
         extension = self.validateExtension(image["ext"])
-        imageURL = self.IMGUR_IMAGE_DOMAIN + image["hash"] + extension
+        image_url = self.imgur_image_domain + image["hash"] + extension
 
         filename = GLOBAL.config['filename'].format(**self.post) + extension
-        shortFilename = self.post['POSTID'] + extension
+        short_filename = self.post['POSTID'] + extension
 
-        getFile(filename, shortFilename, self.directory, imageURL)
+        getFile(filename, short_filename, self.directory, image_url)
 
     @property
     def isAlbum(self):
-        return "album_images" in self.rawData
+        return "album_images" in self.raw_data
 
     @staticmethod
     def getData(link):
-
         cookies = {"over18": "1", "postpagebeta": "0"}
         res = requests.get(link, cookies=cookies)
         if res.status_code != 200:
-            raise ImageNotFound(
-                f"Server responded with {res.status_code} to {link}")
-        pageSource = requests.get(link, cookies=cookies).text
+            raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
+        page_source = requests.get(link, cookies=cookies).text
 
-        STARTING_STRING = "image : "
-        ENDING_STRING = "group :"
+        starting_string = "image : "
+        ending_string = "group :"
 
-        STARTING_STRING_LENGHT = len(STARTING_STRING)
+        starting_string_length = len(starting_string)
         try:
-            startIndex = pageSource.index(
-                STARTING_STRING) + STARTING_STRING_LENGHT
-            endIndex = pageSource.index(ENDING_STRING, startIndex)
+            start_index = page_source.index(starting_string) + starting_string_length
+            end_index = page_source.index(ending_string, start_index)
         except ValueError:
             raise NotADownloadableLinkError(
                 f"Could not read the page source on {link}")
 
-        while pageSource[endIndex] != "}":
-            endIndex = endIndex - 1
+        while page_source[end_index] != "}":
+            end_index -= 1 
try: - data = pageSource[startIndex:endIndex + 2].strip()[:-1] - except BaseException: - pageSource[endIndex + 1] = '}' - data = pageSource[startIndex:endIndex + 3].strip()[:-1] + data = page_source[start_index:end_index + 2].strip()[:-1] + except Exception: + page_source[end_index + 1] = '}' + data = page_source[start_index:end_index + 3].strip()[:-1] return json.loads(data) @staticmethod def validateExtension(string): - POSSIBLE_EXTENSIONS = [".jpg", ".png", ".mp4", ".gif"] + possible_extensions = [".jpg", ".png", ".mp4", ".gif"] - for extension in POSSIBLE_EXTENSIONS: + for extension in possible_extensions: if extension in string: return extension - - raise ExtensionError( - f"\"{string}\" is not recognized as a valid extension.") + else: + raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.") diff --git a/src/downloaders/downloaderUtils.py b/src/downloaders/downloaderUtils.py index 3bd4605..110e971 100644 --- a/src/downloaders/downloaderUtils.py +++ b/src/downloaders/downloaderUtils.py @@ -1,21 +1,20 @@ -import sys +import hashlib import os +import sys import urllib.request from pathlib import Path -import hashlib +from src.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip from src.utils import GLOBAL from src.utils import printToFile as print -from src.errors import FileAlreadyExistsError, FailedToDownload, TypeInSkip, DomainInSkip -def dlProgress(count, blockSize, totalSize): +def dlProgress(count, block_size, total_size): """Function for writing download progress to console """ - - downloadedMbs = int(count * blockSize * (10**(-6))) - fileSize = int(totalSize * (10**(-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs, fileSize)) + download_mbs = int(count * block_size * (10 ** (-6))) + file_size = int(total_size * (10 ** (-6))) + sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) sys.stdout.flush() @@ -23,39 +22,32 @@ def getExtension(link): """Extract file extension from image link. If didn't find any, return '.jpg' """ - - imageTypes = ['jpg', 'png', 'mp4', 'webm', 'gif'] + image_types = ['jpg', 'png', 'mp4', 'webm', 'gif'] parsed = link.split('.') - for fileType in imageTypes: + for fileType in image_types: if fileType in parsed: return "." 
+ parsed[-1] - - if "v.redd.it" not in link: - return '.jpg' - return '.mp4' + else: + if "v.redd.it" not in link: + return '.jpg' + else: + return '.mp4' -def getFile( - filename, - shortFilename, - folderDir, - imageURL, - indent=0, - silent=False): - - FORMATS = { +def getFile(filename, short_filename, folder_dir, image_url, indent=0, silent=False): + formats = { "videos": [".mp4", ".webm"], "images": [".jpg", ".jpeg", ".png", ".bmp"], "gifs": [".gif"], "self": [] } - for type in GLOBAL.arguments.skip: - for extension in FORMATS[type]: + for file_type in GLOBAL.arguments.skip: + for extension in formats[file_type]: if extension in filename: raise TypeInSkip - if any(domain in imageURL for domain in GLOBAL.arguments.skip_domain): + if any(domain in image_url for domain in GLOBAL.arguments.skip_domain): raise DomainInSkip headers = [ @@ -70,44 +62,40 @@ def getFile( ("Connection", "keep-alive") ] - if not os.path.exists(folderDir): - os.makedirs(folderDir) + if not os.path.exists(folder_dir): + os.makedirs(folder_dir) opener = urllib.request.build_opener() - if "imgur" not in imageURL: + if "imgur" not in image_url: opener.addheaders = headers urllib.request.install_opener(opener) if not silent: - print(" " * indent + str(folderDir), - " " * indent + str(filename), - sep="\n") + print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") for i in range(3): - fileDir = Path(folderDir) / filename - tempDir = Path(folderDir) / (filename + ".tmp") + file_dir = Path(folder_dir) / filename + temp_dir = Path(folder_dir) / (filename + ".tmp") - if not (os.path.isfile(fileDir)): + if not (os.path.isfile(file_dir)): try: - urllib.request.urlretrieve(imageURL, - tempDir, - reporthook=dlProgress) + urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress) - fileHash = createHash(tempDir) + file_hash = createHash(temp_dir) if GLOBAL.arguments.no_dupes: - if fileHash in GLOBAL.downloadedPosts(): - os.remove(tempDir) + if file_hash in GLOBAL.downloadedPosts(): + os.remove(temp_dir) raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(fileHash) + GLOBAL.downloadedPosts.add(file_hash) - os.rename(tempDir, fileDir) + os.rename(temp_dir, file_dir) if not silent: print(" " * indent + "Downloaded" + " " * 10) return None except ConnectionResetError: raise FailedToDownload except FileNotFoundError: - filename = shortFilename + filename = short_filename else: raise FileAlreadyExistsError raise FailedToDownload diff --git a/src/downloaders/gallery.py b/src/downloaders/gallery.py new file mode 100644 index 0000000..d5cbac4 --- /dev/null +++ b/src/downloaders/gallery.py @@ -0,0 +1,110 @@ +import json +import os +import urllib + +import requests + +from src.downloaders.downloaderUtils import getFile +from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, + TypeInSkip) +from src.utils import GLOBAL +from src.utils import printToFile as print + + +class Gallery: + def __init__(self, directory, post): + link = post['CONTENTURL'] + self.raw_data = self.getData(link) + + self.directory = directory + self.post = post + + images = {} + count = 0 + for model in self.raw_data['posts']['models']: + try: + for item in self.raw_data['posts']['models'][model]['media']['gallery']['items']: + try: + images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts'] + ['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']} + count += 1 + except Exception: + continue + except Exception: + continue + + 
self.downloadAlbum(images, count)
+
+    @staticmethod
+    def getData(link):
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+        }
+        res = requests.get(link, headers=headers)
+        if res.status_code != 200:
+            raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
+        page_source = res.text
+
+        starting_string = "_r = {"
+        ending_string = "</script>"
+
+        starting_string_length = len(starting_string)
+        try:
+            start_index = page_source.index(starting_string) + starting_string_length
+            end_index = page_source.index(ending_string, start_index)
+        except ValueError:
+            raise NotADownloadableLinkError(f"Could not read the page source on {link}")
+
+        data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
+        return data
+
+    def downloadAlbum(self, images, count):
+        folder_name = GLOBAL.config['filename'].format(**self.post)
+        folder_dir = self.directory / folder_name
+
+        how_many_downloaded = 0
+        duplicates = 0
+
+        try:
+            if not os.path.exists(folder_dir):
+                os.makedirs(folder_dir)
+        except FileNotFoundError:
+            folder_dir = self.directory / self.post['POSTID']
+            os.makedirs(folder_dir)
+
+        print(folder_name)
+
+        for i in range(count):
+            path = urllib.parse.urlparse(images[i]['url']).path
+            extension = os.path.splitext(path)[1]
+
+            filename = "_".join([str(i + 1), images[i]['id']]) + extension
+            short_filename = str(i + 1) + "_" + images[i]['id']
+
+            print("\n  ({}/{})".format(i + 1, count))
+
+            try:
+                getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2)
+                how_many_downloaded += 1
+                print()
+
+            except FileAlreadyExistsError:
+                print("  The file already exists" + " " * 10, end="\n\n")
+                duplicates += 1
+
+            except TypeInSkip:
+                print("  Skipping...")
+                how_many_downloaded += 1
+
+            except Exception as exception:
+                print("\n  Could not get the file")
+                print("  " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
+                    class_name=exception.__class__.__name__, info=str(exception)) + "\n"
+                )
+                print(GLOBAL.log_stream.getvalue(), no_print=True)
+
+        if duplicates == count:
+            raise FileAlreadyExistsError
+        elif how_many_downloaded + duplicates < count:
+            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
diff --git a/src/downloaders/gifDeliveryNetwork.py b/src/downloaders/gifDeliveryNetwork.py
index bd6002b..a41b7ea 100644
--- a/src/downloaders/gifDeliveryNetwork.py
+++ b/src/downloaders/gifDeliveryNetwork.py
@@ -1,52 +1,49 @@
 import os
 import urllib.request
+
 from bs4 import BeautifulSoup
 
-from src.downloaders.downloaderUtils import getFile, getExtension
-from src.errors import (NotADownloadableLinkError)
+from src.downloaders.downloaderUtils import getExtension, getFile
+from src.errors import NotADownloadableLinkError
 from src.utils import GLOBAL
 
 
 class GifDeliveryNetwork:
-    def __init__(self, directory, POST):
+    def __init__(self, directory, post):
         try:
-            POST['MEDIAURL'] = self.getLink(POST['CONTENTURL'])
+            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
         except IndexError:
             raise NotADownloadableLinkError("Could not read the page source")
 
-        POST['EXTENSION'] = getExtension(POST['MEDIAURL'])
+        post['EXTENSION'] = getExtension(post['MEDIAURL'])
 
         if not os.path.exists(directory):
             os.makedirs(directory)
 
-        filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"]
-        shortFilename = POST['POSTID'] + 
POST['EXTENSION'] + filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] + short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, shortFilename, directory, POST['MEDIAURL']) + getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url): """Extract direct link to the video from page's source and return it """ - - if '.webm' in url.split( - '/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]: + if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]: return url if url[-1:] == '/': url = url[:-1] url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1] + page_source = (urllib.request.urlopen(url).read().decode()) - pageSource = (urllib.request.urlopen(url).read().decode()) - - soup = BeautifulSoup(pageSource, "html.parser") + soup = BeautifulSoup(page_source, "html.parser") attributes = {"id": "mp4Source", "type": "video/mp4"} content = soup.find("source", attrs=attributes) if content is None: - raise NotADownloadableLinkError("Could not read the page source") return content["src"] diff --git a/src/downloaders/redgifs.py b/src/downloaders/redgifs.py index f87631b..b12c17d 100644 --- a/src/downloaders/redgifs.py +++ b/src/downloaders/redgifs.py @@ -1,36 +1,36 @@ import json import os import urllib.request + from bs4 import BeautifulSoup -from src.downloaders.downloaderUtils import getFile, getExtension -from src.errors import (NotADownloadableLinkError) +from src.downloaders.downloaderUtils import getExtension, getFile +from src.errors import NotADownloadableLinkError from src.utils import GLOBAL class Redgifs: - def __init__(self, directory, POST): + def __init__(self, directory, post): try: - POST['MEDIAURL'] = self.getLink(POST['CONTENTURL']) + post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - POST['EXTENSION'] = getExtension(POST['MEDIAURL']) + post['EXTENSION'] = getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) - filename = GLOBAL.config['filename'].format(**POST) + POST["EXTENSION"] - shortFilename = POST['POSTID'] + POST['EXTENSION'] + filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] + short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, shortFilename, directory, POST['MEDIAURL']) + getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url): """Extract direct link to the video from page's source and return it """ - if '.webm' in url or '.mp4' in url or '.gif' in url: return url @@ -44,11 +44,10 @@ class Redgifs: 'User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') - pageSource = (urllib.request.urlopen(url).read().decode()) + page_source = (urllib.request.urlopen(url).read().decode()) - soup = BeautifulSoup(pageSource, "html.parser") - attributes = {"data-react-helmet": "true", - "type": "application/ld+json"} + soup = BeautifulSoup(page_source, "html.parser") + attributes = {"data-react-helmet": "true", "type": "application/ld+json"} content = soup.find("script", attrs=attributes) if content is None: diff --git a/src/downloaders/selfPost.py b/src/downloaders/selfPost.py index f7a8a47..bae8b88 100644 --- a/src/downloaders/selfPost.py +++ b/src/downloaders/selfPost.py @@ -5,13 +5,13 @@ from pathlib import Path from src.errors import 
FileAlreadyExistsError, TypeInSkip from src.utils import GLOBAL +from src.utils import printToFile as print VanillaPrint = print class SelfPost: def __init__(self, directory, post): - if "self" in GLOBAL.arguments.skip: raise TypeInSkip @@ -20,20 +20,20 @@ class SelfPost: filename = GLOBAL.config['filename'].format(**post) - fileDir = directory / (filename + ".md") - print(fileDir) + file_dir = directory / (filename + ".md") + print(file_dir) print(filename + ".md") - if Path.is_file(fileDir): + if Path.is_file(file_dir): raise FileAlreadyExistsError try: - self.writeToFile(fileDir, post) + self.writeToFile(file_dir, post) except FileNotFoundError: - fileDir = post['POSTID'] + ".md" - fileDir = directory / fileDir + file_dir = post['POSTID'] + ".md" + file_dir = directory / file_dir - self.writeToFile(fileDir, post) + self.writeToFile(file_dir, post) @staticmethod def writeToFile(directory, post): @@ -57,5 +57,4 @@ class SelfPost: with io.open(directory, "w", encoding="utf-8") as FILE: VanillaPrint(content, file=FILE) - print("Downloaded") diff --git a/src/downloaders/vreddit.py b/src/downloaders/vreddit.py index 7194042..16f5296 100644 --- a/src/downloaders/vreddit.py +++ b/src/downloaders/vreddit.py @@ -13,45 +13,40 @@ class VReddit: os.makedirs(directory) filename = GLOBAL.config['filename'].format(**post) + extension - shortFilename = post['POSTID'] + extension + short_filename = post['POSTID'] + extension try: - FNULL = open(os.devnull, 'w') - subprocess.call("ffmpeg", stdout=FNULL, stderr=subprocess.STDOUT) - except BaseException: - getFile(filename, shortFilename, directory, post['CONTENTURL']) + fnull = open(os.devnull, 'w') + subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) + except Exception: + getFile(filename, short_filename, directory, post['CONTENTURL']) print("FFMPEG library not found, skipping merging video and audio") else: - videoName = post['POSTID'] + "_video" - videoURL = post['CONTENTURL'] - audioName = post['POSTID'] + "_audio" - audioURL = videoURL[:videoURL.rfind('/')] + '/DASH_audio.mp4' + video_name = post['POSTID'] + "_video" + video_url = post['CONTENTURL'] + audio_name = post['POSTID'] + "_audio" + audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' print(directory, filename, sep="\n") - getFile(videoName, videoName, directory, videoURL, silent=True) - getFile(audioName, audioName, directory, audioURL, silent=True) + getFile(video_name, video_name, directory, video_url, silent=True) + getFile(audio_name, audio_name, directory, audio_url, silent=True) try: - self._mergeAudio(videoName, - audioName, - filename, - shortFilename, - directory) + self._mergeAudio(video_name, audio_name, filename, short_filename, directory) except KeyboardInterrupt: os.remove(directory / filename) - os.remove(directory / audioName) - - os.rename(directory / videoName, directory / filename) + os.remove(directory / audio_name) + os.rename(directory / video_name, directory / filename) @staticmethod - def _mergeAudio(video, audio, filename, shortFilename, directory): + def _mergeAudio(video, audio, filename, short_filename, directory): + input_video = str(directory / video) + input_audio = str(directory / audio) - inputVideo = str(directory / video) - inputAudio = str(directory / audio) - - FNULL = open(os.devnull, 'w') - cmd = f"ffmpeg -i {inputAudio} -i {inputVideo} -c:v copy -c:a aac -strict experimental {str(directory / filename)}" - subprocess.call(cmd.split(), stdout=FNULL, stderr=subprocess.STDOUT) + fnull = open(os.devnull, 'w') + cmd = "ffmpeg -i {} 
-i {} -c:v copy -c:a aac -strict experimental {}".format( + input_audio, input_video, str(directory / filename)) + subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) os.remove(directory / video) os.remove(directory / audio) diff --git a/src/downloaders/youtube.py b/src/downloaders/youtube.py index a5c7922..0b0f0e2 100644 --- a/src/downloaders/youtube.py +++ b/src/downloaders/youtube.py @@ -1,12 +1,13 @@ import os -import youtube_dl import sys -from src.downloaders.downloaderUtils import createHash +import youtube_dl +from src.downloaders.downloaderUtils import createHash +from src.errors import FileAlreadyExistsError from src.utils import GLOBAL from src.utils import printToFile as print -from src.errors import FileAlreadyExistsError + class Youtube: @@ -35,19 +36,19 @@ class Youtube: if GLOBAL.arguments.no_dupes: try: - fileHash = createHash(location) + file_hash = createHash(location) except FileNotFoundError: return None - if fileHash in GLOBAL.downloadedPosts(): + if file_hash in GLOBAL.downloadedPosts(): os.remove(location) raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(fileHash) + GLOBAL.downloadedPosts.add(file_hash) @staticmethod def _hook(d): if d['status'] == 'finished': return print("Downloaded") - downloadedMbs = int(d['downloaded_bytes'] * (10**(-6))) - fileSize = int(d['total_bytes'] * (10**(-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs, fileSize)) + downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6))) + file_size = int(d['total_bytes'] * (10**(-6))) + sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size)) sys.stdout.flush() diff --git a/src/errors.py b/src/errors.py index 39d90be..7bf47b9 100644 --- a/src/errors.py +++ b/src/errors.py @@ -100,9 +100,6 @@ class InvalidSortingType(Exception): pass -class FileNotFoundError(Exception): - pass - class NoSuitablePost(Exception): pass diff --git a/src/jsonHelper.py b/src/jsonHelper.py index 11508b4..3f990f0 100644 --- a/src/jsonHelper.py +++ b/src/jsonHelper.py @@ -6,47 +6,43 @@ from src.errors import InvalidJSONFile class JsonFile: """ Write and read JSON files - Use add(self,toBeAdded) to add to files - Use delete(self,*deletedKeys) to delete keys """ - FILEDIR = "" + file_dir = "" - def __init__(self, FILEDIR): - self.FILEDIR = FILEDIR - if not path.exists(self.FILEDIR): + def __init__(self, file_dir): + self.file_dir = file_dir + if not path.exists(self.file_dir): self.__writeToFile({}, create=True) def read(self): try: - with open(self.FILEDIR, 'r') as f: + with open(self.file_dir, 'r') as f: return json.load(f) except json.decoder.JSONDecodeError: - raise InvalidJSONFile(f"{self.FILEDIR} cannot be read") + raise InvalidJSONFile(f"{self.file_dir} cannot be read") - def add(self, toBeAdded, sub=None): + def add(self, to_be_added, sub=None): """Takes a dictionary and merges it with json file. It uses new key's value if a key already exists. Returns the new content as a dictionary. """ - data = self.read() if sub: - data[sub] = {**data[sub], **toBeAdded} + data[sub] = {**data[sub], **to_be_added} else: - data = {**data, **toBeAdded} + data = {**data, **to_be_added} self.__writeToFile(data) return self.read() - def delete(self, *deleteKeys): + def delete(self, *delete_keys): """Delete given keys from JSON file. Returns the new content as a dictionary. 
""" - data = self.read() - for deleteKey in deleteKeys: + for deleteKey in delete_keys: if deleteKey in data: del data[deleteKey] found = True @@ -56,6 +52,6 @@ class JsonFile: def __writeToFile(self, content, create=False): if not create: - remove(self.FILEDIR) - with open(self.FILEDIR, 'w') as f: + remove(self.file_dir) + with open(self.file_dir, 'w') as f: json.dump(content, f, indent=4) diff --git a/src/parser.py b/src/parser.py index b48ea6d..1da6436 100644 --- a/src/parser.py +++ b/src/parser.py @@ -6,142 +6,136 @@ except ModuleNotFoundError: from errors import InvalidRedditLink -def QueryParser(PassedQueries, index): - ExtractedQueries = {} +def QueryParser(passed_queries, index): + extracted_queries = {} - QuestionMarkIndex = PassedQueries.index("?") - Header = PassedQueries[:QuestionMarkIndex] - ExtractedQueries["HEADER"] = Header - Queries = PassedQueries[QuestionMarkIndex + 1:] + question_mark_index = passed_queries.index("?") + header = passed_queries[:question_mark_index] + extracted_queries["HEADER"] = header + queries = passed_queries[question_mark_index + 1:] - ParsedQueries = Queries.split("&") + parsed_queries = queries.split("&") - for Query in ParsedQueries: - Query = Query.split("=") - ExtractedQueries[Query[0]] = Query[1] + for query in parsed_queries: + query = query.split("=") + extracted_queries[query[0]] = query[1] - if ExtractedQueries["HEADER"] == "search": - ExtractedQueries["q"] = ExtractedQueries["q"].replace("%20", " ") + if extracted_queries["HEADER"] == "search": + extracted_queries["q"] = extracted_queries["q"].replace("%20", " ") - return ExtractedQueries + return extracted_queries -def LinkParser(LINK): - RESULT = {} - ShortLink = False +def LinkParser(link): + result = {} + short_link = False - if "reddit.com" not in LINK: + if "reddit.com" not in link: raise InvalidRedditLink("Invalid reddit link") - SplittedLink = LINK.split("/") + splitted_link = link.split("/") - if SplittedLink[0] == "https:" or SplittedLink[0] == "http:": - SplittedLink = SplittedLink[2:] + if splitted_link[0] == "https:" or splitted_link[0] == "http:": + splitted_link = splitted_link[2:] try: - if (SplittedLink[-2].endswith("reddit.com") and - SplittedLink[-1] == "") or \ - SplittedLink[-1].endswith("reddit.com"): + if (splitted_link[-2].endswith("reddit.com") and + splitted_link[-1] == "") or splitted_link[-1].endswith("reddit.com"): - RESULT["sort"] = "best" - return RESULT + result["sort"] = "best" + return result except IndexError: - if SplittedLink[0].endswith("reddit.com"): - RESULT["sort"] = "best" - return RESULT + if splitted_link[0].endswith("reddit.com"): + result["sort"] = "best" + return result - if "redd.it" in SplittedLink: - ShortLink = True + if "redd.it" in splitted_link: + short_link = True - if SplittedLink[0].endswith("reddit.com"): - SplittedLink = SplittedLink[1:] + if splitted_link[0].endswith("reddit.com"): + splitted_link = splitted_link[1:] - if "comments" in SplittedLink: - RESULT = {"post": LINK} - return RESULT + if "comments" in splitted_link: + result = {"post": link} + return result - if "me" in SplittedLink or \ - "u" in SplittedLink or \ - "user" in SplittedLink or \ - "r" in SplittedLink or \ - "m" in SplittedLink: + elif "me" in splitted_link or \ + "u" in splitted_link or \ + "user" in splitted_link or \ + "r" in splitted_link or \ + "m" in splitted_link: - if "r" in SplittedLink: - RESULT["subreddit"] = SplittedLink[SplittedLink.index("r") + 1] + if "r" in splitted_link: + result["subreddit"] = splitted_link[splitted_link.index("r") + 1] - 
elif "m" in SplittedLink: - RESULT["multireddit"] = SplittedLink[SplittedLink.index("m") + 1] - RESULT["user"] = SplittedLink[SplittedLink.index("m") - 1] + elif "m" in splitted_link: + result["multireddit"] = splitted_link[splitted_link.index("m") + 1] + result["user"] = splitted_link[splitted_link.index("m") - 1] else: - for index in range(len(SplittedLink)): - if SplittedLink[index] == "u" or \ - SplittedLink[index] == "user": + for index in range(len(splitted_link)): + if splitted_link[index] == "u" or splitted_link[index] == "user": + result["user"] = splitted_link[index + 1] - RESULT["user"] = SplittedLink[index + 1] + elif splitted_link[index] == "me": + result["user"] = "me" - elif SplittedLink[index] == "me": - RESULT["user"] = "me" - - for index in range(len(SplittedLink)): - if SplittedLink[index] in [ + for index in range(len(splitted_link)): + if splitted_link[index] in [ "hot", "top", "new", "controversial", "rising" ]: - RESULT["sort"] = SplittedLink[index] + result["sort"] = splitted_link[index] if index == 0: - RESULT["subreddit"] = "frontpage" + result["subreddit"] = "frontpage" - elif SplittedLink[index] in ["submitted", "saved", "posts", "upvoted"]: - if SplittedLink[index] == "submitted" or \ - SplittedLink[index] == "posts": - RESULT["submitted"] = {} + elif splitted_link[index] in ["submitted", "saved", "posts", "upvoted"]: + if splitted_link[index] == "submitted" or splitted_link[index] == "posts": + result["submitted"] = {} - elif SplittedLink[index] == "saved": - RESULT["saved"] = True + elif splitted_link[index] == "saved": + result["saved"] = True - elif SplittedLink[index] == "upvoted": - RESULT["upvoted"] = True + elif splitted_link[index] == "upvoted": + result["upvoted"] = True - elif "?" in SplittedLink[index]: - ParsedQuery = QueryParser(SplittedLink[index], index) - if ParsedQuery["HEADER"] == "search": - del ParsedQuery["HEADER"] - RESULT["search"] = ParsedQuery + elif "?" 
in splitted_link[index]: + parsed_query = QueryParser(splitted_link[index], index) + if parsed_query["HEADER"] == "search": + del parsed_query["HEADER"] + result["search"] = parsed_query - elif ParsedQuery["HEADER"] == "submitted" or \ - ParsedQuery["HEADER"] == "posts": - del ParsedQuery["HEADER"] - RESULT["submitted"] = ParsedQuery + elif parsed_query["HEADER"] == "submitted" or \ + parsed_query["HEADER"] == "posts": + del parsed_query["HEADER"] + result["submitted"] = parsed_query else: - del ParsedQuery["HEADER"] - RESULT["queries"] = ParsedQuery + del parsed_query["HEADER"] + result["queries"] = parsed_query - if not ("upvoted" in RESULT or - "saved" in RESULT or - "submitted" in RESULT or - "multireddit" in RESULT) and \ - "user" in RESULT: - RESULT["submitted"] = {} + if not ("upvoted" in result or + "saved" in result or + "submitted" in result or + "multireddit" in result) and "user" in result: + result["submitted"] = {} - return RESULT + return result -def LinkDesigner(LINK): - - attributes = LinkParser(LINK) - MODE = {} +def LinkDesigner(link): + attributes = LinkParser(link) + mode = {} if "post" in attributes: - MODE["post"] = attributes["post"] - MODE["sort"] = "" - MODE["time"] = "" - return MODE + mode["post"] = attributes["post"] + mode["sort"] = "" + mode["time"] = "" + return mode - if "search" in attributes: - MODE["search"] = attributes["search"]["q"] + elif "search" in attributes: + mode["search"] = attributes["search"]["q"] if "restrict_sr" in attributes["search"]: @@ -150,91 +144,90 @@ def LinkDesigner(LINK): attributes["search"]["restrict_sr"] == ""): if "subreddit" in attributes: - MODE["subreddit"] = attributes["subreddit"] + mode["subreddit"] = attributes["subreddit"] elif "multireddit" in attributes: - MODE["multreddit"] = attributes["multireddit"] - MODE["user"] = attributes["user"] + mode["multreddit"] = attributes["multireddit"] + mode["user"] = attributes["user"] else: - MODE["subreddit"] = "all" + mode["subreddit"] = "all" else: - MODE["subreddit"] = "all" + mode["subreddit"] = "all" if "t" in attributes["search"]: - MODE["time"] = attributes["search"]["t"] + mode["time"] = attributes["search"]["t"] else: - MODE["time"] = "all" + mode["time"] = "all" if "sort" in attributes["search"]: - MODE["sort"] = attributes["search"]["sort"] + mode["sort"] = attributes["search"]["sort"] else: - MODE["sort"] = "relevance" + mode["sort"] = "relevance" if "include_over_18" in attributes["search"]: - if attributes["search"]["include_over_18"] == 1 or \ - attributes["search"]["include_over_18"] == "on": - MODE["nsfw"] = True + if attributes["search"]["include_over_18"] == 1 or attributes["search"]["include_over_18"] == "on": + mode["nsfw"] = True else: - MODE["nsfw"] = False + mode["nsfw"] = False else: if "queries" in attributes: - if not ("submitted" in attributes or - "posts" in attributes): + if not ("submitted" in attributes or "posts" in attributes): if "t" in attributes["queries"]: - MODE["time"] = attributes["queries"]["t"] + mode["time"] = attributes["queries"]["t"] else: - MODE["time"] = "day" + mode["time"] = "day" else: if "t" in attributes["queries"]: - MODE["time"] = attributes["queries"]["t"] + mode["time"] = attributes["queries"]["t"] else: - MODE["time"] = "all" + mode["time"] = "all" if "sort" in attributes["queries"]: - MODE["sort"] = attributes["queries"]["sort"] + mode["sort"] = attributes["queries"]["sort"] else: - MODE["sort"] = "new" + mode["sort"] = "new" else: - MODE["time"] = "day" + mode["time"] = "day" if "subreddit" in attributes and "search" 
not in attributes: - MODE["subreddit"] = attributes["subreddit"] + mode["subreddit"] = attributes["subreddit"] elif "user" in attributes and "search" not in attributes: - MODE["user"] = attributes["user"] + mode["user"] = attributes["user"] if "submitted" in attributes: - MODE["submitted"] = True + mode["submitted"] = True if "sort" in attributes["submitted"]: - MODE["sort"] = attributes["submitted"]["sort"] - elif "sort" in MODE: + mode["sort"] = attributes["submitted"]["sort"] + elif "sort" in mode: pass else: - MODE["sort"] = "new" + mode["sort"] = "new" if "t" in attributes["submitted"]: - MODE["time"] = attributes["submitted"]["t"] + mode["time"] = attributes["submitted"]["t"] else: - MODE["time"] = "all" + mode["time"] = "all" elif "saved" in attributes: - MODE["saved"] = True + mode["saved"] = True elif "upvoted" in attributes: - MODE["upvoted"] = True + mode["upvoted"] = True elif "multireddit" in attributes: - MODE["multireddit"] = attributes["multireddit"] + mode["multireddit"] = attributes["multireddit"] if "sort" in attributes: - MODE["sort"] = attributes["sort"] - elif "sort" in MODE: + mode["sort"] = attributes["sort"] + elif "sort" in mode: pass else: - MODE["sort"] = "hot" + mode["sort"] = "hot" + + return mode - return MODE if __name__ == "__main__": diff --git a/src/programMode.py b/src/programMode.py index 963e665..5fdd0ba 100644 --- a/src/programMode.py +++ b/src/programMode.py @@ -1,7 +1,9 @@ -from src.errors import SearchModeError, RedditorNameError, ProgramModeError, InvalidSortingType -from src.parser import LinkDesigner -from pathlib import Path import sys +from pathlib import Path + +from src.errors import InvalidSortingType, ProgramModeError, RedditorNameError, SearchModeError +from src.parser import LinkDesigner + class ProgramMode: @@ -10,213 +12,183 @@ class ProgramMode: self.arguments = arguments def generate(self): - try: self._validateProgramMode() except ProgramModeError: self._promptUser() - programMode = {} + program_mode = {} if self.arguments.user is not None: - programMode["user"] = self.arguments.user + program_mode["user"] = self.arguments.user if self.arguments.search is not None: - programMode["search"] = self.arguments.search + program_mode["search"] = self.arguments.search if self.arguments.sort == "hot" or \ self.arguments.sort == "controversial" or \ self.arguments.sort == "rising": self.arguments.sort = "relevance" if self.arguments.sort is not None: - programMode["sort"] = self.arguments.sort + program_mode["sort"] = self.arguments.sort else: if self.arguments.submitted: - programMode["sort"] = "new" + program_mode["sort"] = "new" else: - programMode["sort"] = "hot" + program_mode["sort"] = "hot" if self.arguments.time is not None: - programMode["time"] = self.arguments.time + program_mode["time"] = self.arguments.time else: - programMode["time"] = "all" + program_mode["time"] = "all" if self.arguments.link is not None: - self.arguments.link = self.arguments.link.strip("\"") - programMode = LinkDesigner(self.arguments.link) + program_mode = LinkDesigner(self.arguments.link) if self.arguments.search is not None: - programMode["search"] = self.arguments.search + program_mode["search"] = self.arguments.search if self.arguments.sort is not None: - programMode["sort"] = self.arguments.sort + program_mode["sort"] = self.arguments.sort if self.arguments.time is not None: - programMode["time"] = self.arguments.time + program_mode["time"] = self.arguments.time elif self.arguments.subreddit is not None: if isinstance(self.arguments.subreddit, list): 
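# The context lines around this hunk collapse a subreddit list into the
# plus-joined form PRAW expects: reddit.subreddit() takes several subreddits
# as one string, so ["pics", "funny"] must become "pics+funny". A minimal
# sketch of that join, assuming a plain list or string input (the helper
# name below is illustrative only, not part of the patch):
def _join_subreddits(subreddits) -> str:
    if isinstance(subreddits, list):
        return "+".join(subreddits)  # ["pics", "funny"] -> "pics+funny"
    return subreddits  # already a single name such as "all"

# e.g. _join_subreddits(["pics", "funny", "me_irl"]) == "pics+funny+me_irl"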
self.arguments.subreddit = "+".join(self.arguments.subreddit) - programMode["subreddit"] = self.arguments.subreddit + program_mode["subreddit"] = self.arguments.subreddit elif self.arguments.multireddit is not None: - programMode["multireddit"] = self.arguments.multireddit + program_mode["multireddit"] = self.arguments.multireddit elif self.arguments.saved is True: - programMode["saved"] = True + program_mode["saved"] = True elif self.arguments.upvoted is True: - programMode["upvoted"] = True + program_mode["upvoted"] = True elif self.arguments.submitted is not None: - programMode["submitted"] = True + program_mode["submitted"] = True if self.arguments.sort == "rising": raise InvalidSortingType("Invalid sorting type has given") - programMode["limit"] = self.arguments.limit + program_mode["limit"] = self.arguments.limit - return programMode + return program_mode @staticmethod def _chooseFrom(choices): print() - choicesByIndex = [str(x) for x in range(len(choices) + 1)] + choices_by_index = list(str(x) for x in range(len(choices) + 1)) for i in range(len(choices)): - print("{indent}[{order}] {mode}".format( - indent=" " * 4, order=i + 1, mode=choices[i] - )) + print("{indent}[{order}] {mode}".format(indent=" " * 4, order=i + 1, mode=choices[i])) print(" " * 4 + "[0] exit\n") choice = input("> ") - while not choice.lower() in choices + choicesByIndex + ["exit"]: + while not choice.lower() in choices + choices_by_index + ["exit"]: print("Invalid input\n") input("> ") if choice == "0" or choice == "exit": sys.exit() - elif choice in choicesByIndex: + elif choice in choices_by_index: return choices[int(choice) - 1] else: return choice def _promptUser(self): print("select program mode:") - programModes = [ - "search", "subreddit", "multireddit", - "submitted", "upvoted", "saved", "log" - ] - programMode = self._chooseFrom(programModes) + program_modes = ["search", "subreddit", "multireddit", "submitted", "upvoted", "saved", "log"] + program_mode = self._chooseFrom(program_modes) - if programMode == "search": + if program_mode == "search": self.arguments.search = input("\nquery: ") self.arguments.subreddit = input("\nsubreddit: ") print("\nselect sort type:") - sortTypes = [ - "relevance", "top", "new" - ] - sortType = self._chooseFrom(sortTypes) - self.arguments.sort = sortType + sort_types = ["relevance", "top", "new"] + sort_type = self._chooseFrom(sort_types) + self.arguments.sort = sort_type print("\nselect time filter:") - timeFilters = [ - "hour", "day", "week", "month", "year", "all" - ] - timeFilter = self._chooseFrom(timeFilters) - self.arguments.time = timeFilter + time_filters = ["hour", "day", "week", "month", "year", "all"] + time_filter = self._chooseFrom(time_filters) + self.arguments.time = time_filter - if programMode == "subreddit": - - subredditInput = input( - "(type frontpage for all subscribed subreddits,\n" - " use plus to seperate multi subreddits:" - " pics+funny+me_irl etc.)\n\n" - "subreddit: ") - self.arguments.subreddit = subredditInput - - # while not (subredditInput == "" or subredditInput.lower() == "frontpage"): - # subredditInput = input("subreddit: ") - # self.arguments.subreddit += "+" + subredditInput + if program_mode == "subreddit": + subreddit_input = input("(type frontpage for all subscribed subreddits,\n" + " use plus to seperate multi subreddits:" + " pics+funny+me_irl etc.)\n\n" + "subreddit: ") + self.arguments.subreddit = subreddit_input if " " in self.arguments.subreddit: self.arguments.subreddit = "+".join( self.arguments.subreddit.split()) # DELETE 
THE PLUS (+) AT THE END - if not subredditInput.lower() == "frontpage" \ - and self.arguments.subreddit[-1] == "+": + if not subreddit_input.lower() == "frontpage" and self.arguments.subreddit[-1] == "+": self.arguments.subreddit = self.arguments.subreddit[:-1] print("\nselect sort type:") - sortTypes = [ - "hot", "top", "new", "rising", "controversial" - ] - sortType = self._chooseFrom(sortTypes) - self.arguments.sort = sortType + sort_types = ["hot", "top", "new", "rising", "controversial"] + sort_type = self._chooseFrom(sort_types) + self.arguments.sort = sort_type - if sortType in ["top", "controversial"]: + if sort_type in ["top", "controversial"]: print("\nselect time filter:") - timeFilters = [ - "hour", "day", "week", "month", "year", "all" - ] - timeFilter = self._chooseFrom(timeFilters) - self.arguments.time = timeFilter + time_filters = ["hour", "day", "week", "month", "year", "all"] + time_filter = self._chooseFrom(time_filters) + self.arguments.time = time_filter else: self.arguments.time = "all" - elif programMode == "multireddit": + elif program_mode == "multireddit": self.arguments.user = input("\nmultireddit owner: ") self.arguments.multireddit = input("\nmultireddit: ") print("\nselect sort type:") - sortTypes = [ - "hot", "top", "new", "rising", "controversial" - ] - sortType = self._chooseFrom(sortTypes) - self.arguments.sort = sortType + sort_types = ["hot", "top", "new", "rising", "controversial"] + sort_type = self._chooseFrom(sort_types) + self.arguments.sort = sort_type - if sortType in ["top", "controversial"]: + if sort_type in ["top", "controversial"]: print("\nselect time filter:") - timeFilters = [ - "hour", "day", "week", "month", "year", "all" - ] - timeFilter = self._chooseFrom(timeFilters) - self.arguments.time = timeFilter + time_filters = ["hour", "day", "week", "month", "year", "all"] + time_filter = self._chooseFrom(time_filters) + self.arguments.time = time_filter else: self.arguments.time = "all" - elif programMode == "submitted": + elif program_mode == "submitted": self.arguments.submitted = True self.arguments.user = input("\nredditor: ") print("\nselect sort type:") - sortTypes = [ - "hot", "top", "new", "controversial" - ] - sortType = self._chooseFrom(sortTypes) - self.arguments.sort = sortType + sort_types = ["hot", "top", "new", "controversial"] + sort_type = self._chooseFrom(sort_types) + self.arguments.sort = sort_type - if sortType == "top": + if sort_type == "top": print("\nselect time filter:") - timeFilters = [ - "hour", "day", "week", "month", "year", "all" - ] - timeFilter = self._chooseFrom(timeFilters) - self.arguments.time = timeFilter + time_filters = ["hour", "day", "week", "month", "year", "all"] + time_filter = self._chooseFrom(time_filters) + self.arguments.time = time_filter else: self.arguments.time = "all" - elif programMode == "upvoted": + elif program_mode == "upvoted": self.arguments.upvoted = True self.arguments.user = input("\nredditor: ") - elif programMode == "saved": + elif program_mode == "saved": self.arguments.saved = True - elif programMode == "log": + elif program_mode == "log": while True: self.arguments.log = input("\nlog file directory:") if Path(self.arguments.log).is_file(): @@ -234,7 +206,6 @@ class ProgramMode: """Check if command-line self.arguments are given correcly, if not, raise errors """ - if self.arguments.user is None: user = 0 else: @@ -242,21 +213,13 @@ class ProgramMode: search = 1 if self.arguments.search else 0 - modes = [ - "saved", - "subreddit", - "submitted", - "log", - "link", - 
"upvoted", - "multireddit"] + modes = ["saved", "subreddit", "submitted", "log", "link", "upvoted", "multireddit"] - values = { - x: 0 if getattr(self.arguments, x) is None or - getattr(self.arguments, x) is False - else 1 - for x in modes - } + values = {x: 0 if getattr(self.arguments, x) is None or + getattr(self.arguments, x) is False + else 1 + for x in modes + } if not sum(values[x] for x in values) == 1: raise ProgramModeError("Invalid program mode") diff --git a/src/reddit.py b/src/reddit.py index 9dc374c..a953c87 100644 --- a/src/reddit.py +++ b/src/reddit.py @@ -1,12 +1,14 @@ -import praw import random import socket import webbrowser + +import praw from prawcore.exceptions import ResponseException -from src.utils import GLOBAL +from src.errors import RedditLoginFailed from src.jsonHelper import JsonFile -from src. errors import RedditLoginFailed +from src.utils import GLOBAL + class Reddit: @@ -23,7 +25,6 @@ class Reddit: } def begin(self): - if self.refresh_token: self.arguments["refresh_token"] = self.refresh_token self.redditInstance = praw.Reddit(**self.arguments) @@ -41,11 +42,8 @@ class Reddit: self.redditInstance = praw.Reddit(**self.arguments) reddit, refresh_token = self.getRefreshToken(*self.SCOPES) - JsonFile(GLOBAL.configDirectory).add({ - "reddit_username": str(reddit.user.me()), - "reddit": refresh_token - }, "credentials") - + JsonFile(GLOBAL.configDirectory).add({"reddit_username": str( + reddit.user.me()), "reddit": refresh_token}, "credentials") return self.redditInstance def recieve_connection(self): @@ -63,33 +61,23 @@ class Reddit: @staticmethod def send_message(client, message): """Send message to client and close the connection.""" - client.send( - 'HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8') - ) + client.send('HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')) client.close() def getRefreshToken(self, *scopes): state = str(random.randint(0, 65000)) url = self.redditInstance.auth.url(scopes, state, 'permanent') print("---Setting up the Reddit API---\n") - print( - "Go to this URL and login to reddit:\n", - url, - sep="\n", - end="\n\n") + print("Go to this URL and login to reddit:\n", url, sep="\n", end="\n\n") webbrowser.open(url, new=2) client = self.recieve_connection() data = client.recv(1024).decode('utf-8') str(data) param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&') - params = dict([token.split('=') - for token in param_tokens]) + params = {key: value for (key, value) in [token.split('=') for token in param_tokens]} if state != params['state']: - self.send_message( - client, 'State mismatch. Expected: {} Received: {}' - .format(state, params['state']) - ) + self.send_message(client, 'State mismatch. 
Expected: {} Received: {}'.format(state, params['state'])) raise RedditLoginFailed if 'error' in params: self.send_message(client, params['error']) @@ -101,4 +89,4 @@ class Reddit: "alert(\"You can go back to terminal window now.\");" "" ) - return (self.redditInstance, refresh_token) + return self.redditInstance, refresh_token diff --git a/src/searcher.py b/src/searcher.py index dbb8835..720ba38 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,116 +1,116 @@ import sys import time import urllib.request -from prawcore.exceptions import NotFound, Forbidden +from urllib.error import HTTPError +from prawcore.exceptions import Forbidden, NotFound + +from src.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, + NoPrawSupport) from src.reddit import Reddit from src.utils import GLOBAL, createLogFile, printToFile -from src.errors import (NoMatchingSubmissionFound, NoPrawSupport, - MultiredditNotFound, - InvalidSortingType, InsufficientPermission) print = printToFile -def getPosts(programMode): +def getPosts(program_mode): """Call PRAW regarding to arguments and pass it to extractDetails. Return what extractDetails has returned. """ - reddit = Reddit(GLOBAL.config["credentials"]["reddit"]).begin() - if programMode["sort"] == "best": + if program_mode["sort"] == "best": raise NoPrawSupport("PRAW does not support that") - if "subreddit" in programMode: - if "search" in programMode: - if programMode["subreddit"] == "frontpage": - programMode["subreddit"] = "all" + if "subreddit" in program_mode: + if "search" in program_mode: + if program_mode["subreddit"] == "frontpage": + program_mode["subreddit"] = "all" - if "user" in programMode: - if programMode["user"] == "me": - programMode["user"] = str(reddit.user.me()) + if "user" in program_mode: + if program_mode["user"] == "me": + program_mode["user"] = str(reddit.user.me()) - if "search" not in programMode: - if programMode["sort"] == "top" or programMode["sort"] == "controversial": - keyword_params = { - "time_filter": programMode["time"], - "limit": programMode["limit"] - } + if "search" not in program_mode: + if program_mode["sort"] == "top" or program_mode["sort"] == "controversial": + keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} # OTHER SORT TYPES DON'T TAKE TIME_FILTER else: - keyword_params = { - "limit": programMode["limit"] - } + keyword_params = {"limit": program_mode["limit"]} else: - keyword_params = { - "time_filter": programMode["time"], - "limit": programMode["limit"] - } + keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} - if "search" in programMode: - if programMode["sort"] in ["hot", "rising", "controversial"]: + if "search" in program_mode: + if program_mode["sort"] in ["hot", "rising", "controversial"]: raise InvalidSortingType("Invalid sorting type has given") - if "subreddit" in programMode: + if "subreddit" in program_mode: print( "search for \"{search}\" in\n" "subreddit: {subreddit}\nsort: {sort}\n" "time: {time}\nlimit: {limit}\n".format( - search=programMode["search"], - limit=programMode["limit"], - sort=programMode["sort"], - subreddit=programMode["subreddit"], - time=programMode["time"] - ).upper(), noPrint=True + search=program_mode["search"], + limit=program_mode["limit"], + sort=program_mode["sort"], + subreddit=program_mode["subreddit"], + time=program_mode["time"] + ).upper(), no_print=True ) return extractDetails( - reddit.subreddit(programMode["subreddit"]).search( - 
programMode["search"], - limit=programMode["limit"], - sort=programMode["sort"], - time_filter=programMode["time"] + reddit.subreddit(program_mode["subreddit"]).search( + program_mode["search"], + limit=program_mode["limit"], + sort=program_mode["sort"], + time_filter=program_mode["time"] ) ) - if "multireddit" in programMode: + elif "multireddit" in program_mode: raise NoPrawSupport("PRAW does not support that") - if "user" in programMode: + elif "user" in program_mode: raise NoPrawSupport("PRAW does not support that") - if "saved" in programMode: + elif "saved" in program_mode: raise ("Reddit does not support that") - if programMode["sort"] == "relevance": + if program_mode["sort"] == "relevance": raise InvalidSortingType("Invalid sorting type has given") - if "saved" in programMode: - print( - "saved posts\nuser:{username}\nlimit={limit}\n".format( - username=reddit.user.me(), - limit=programMode["limit"] - ).upper(), noPrint=True + if "saved" in program_mode: + print("saved posts\nuser:{username}\nlimit={limit}\n".format( + username=reddit.user.me(), + limit=program_mode["limit"]).upper(), + no_print=True ) - return extractDetails( - reddit.user.me().saved( - limit=programMode["limit"])) + return extractDetails(reddit.user.me().saved(limit=program_mode["limit"])) - if "subreddit" in programMode: - - if programMode["subreddit"] == "frontpage": + if "subreddit" in program_mode: + if program_mode["subreddit"] == "frontpage": print( "subreddit: {subreddit}\nsort: {sort}\n" "time: {time}\nlimit: {limit}\n".format( - limit=programMode["limit"], - sort=programMode["sort"], - subreddit=programMode["subreddit"], - time=programMode["time"] - ).upper(), noPrint=True + limit=program_mode["limit"], + sort=program_mode["sort"], + subreddit=program_mode["subreddit"], + time=program_mode["time"]).upper(), + no_print=True + ) + return extractDetails(getattr(reddit.front, program_mode["sort"])(**keyword_params)) + + else: + print( + "subreddit: {subreddit}\nsort: {sort}\n" + "time: {time}\nlimit: {limit}\n".format( + limit=program_mode["limit"], + sort=program_mode["sort"], + subreddit=program_mode["subreddit"], + time=program_mode["time"]).upper(), + no_print=True ) return extractDetails( - getattr(reddit.front, programMode["sort"])(**keyword_params) + getattr(reddit.subreddit(program_mode["subreddit"]), program_mode["sort"])(**keyword_params) ) print( "subreddit: {subreddit}\nsort: {sort}\n" @@ -127,87 +127,75 @@ def getPosts(programMode): )(**keyword_params) ) - if "multireddit" in programMode: + elif "multireddit" in program_mode: print( "user: {user}\n" "multireddit: {multireddit}\nsort: {sort}\n" "time: {time}\nlimit: {limit}\n".format( - user=programMode["user"], - limit=programMode["limit"], - sort=programMode["sort"], - multireddit=programMode["multireddit"], - time=programMode["time"] - ).upper(), noPrint=True + user=program_mode["user"], + limit=program_mode["limit"], + sort=program_mode["sort"], + multireddit=program_mode["multireddit"], + time=program_mode["time"]).upper(), + no_print=True ) try: return extractDetails( - getattr( - reddit.multireddit( - programMode["user"], programMode["multireddit"] - ), programMode["sort"] - )(**keyword_params) + getattr(reddit.multireddit(program_mode["user"], program_mode["multireddit"]), + program_mode["sort"] + )(**keyword_params) ) except NotFound: raise MultiredditNotFound("Multireddit not found") - elif "submitted" in programMode: + elif "submitted" in program_mode: print( "submitted posts of {user}\nsort: {sort}\n" "time: {time}\nlimit: 
{limit}\n".format( - limit=programMode["limit"], - sort=programMode["sort"], - user=programMode["user"], - time=programMode["time"] - ).upper(), noPrint=True + limit=program_mode["limit"], + sort=program_mode["sort"], + user=program_mode["user"], + time=program_mode["time"]).upper(), + no_print=True ) return extractDetails( - getattr( - reddit.redditor(programMode["user"] - ).submissions, programMode["sort"] - )(**keyword_params) + getattr(reddit.redditor(program_mode["user"]).submissions, program_mode["sort"])(**keyword_params) ) - elif "upvoted" in programMode: + elif "upvoted" in program_mode: print( "upvoted posts of {user}\nlimit: {limit}\n".format( - user=programMode["user"], - limit=programMode["limit"] - ).upper(), noPrint=True + user=program_mode["user"], + limit=program_mode["limit"]).upper(), + no_print=True ) try: - return extractDetails( - reddit.redditor(programMode["user"]).upvoted( - limit=programMode["limit"]) - ) + return extractDetails(reddit.redditor(program_mode["user"]).upvoted(limit=program_mode["limit"])) except Forbidden: raise InsufficientPermission( "You do not have permission to do that") - elif "post" in programMode: - print("post: {post}\n".format( - post=programMode["post"]).upper(), noPrint=True) - return extractDetails( - reddit.submission(url=programMode["post"]), SINGLE_POST=True - ) + elif "post" in program_mode: + print("post: {post}\n".format(post=program_mode["post"]).upper(), no_print=True) + return extractDetails(reddit.submission(url=program_mode["post"]), single_post=True) -def extractDetails(posts, SINGLE_POST=False): +def extractDetails(posts, single_post=False): """Check posts and decide if it can be downloaded. If so, create a dictionary with post details and append them to a list. Write all of posts to file. 
Return the list """ + post_list = [] + post_count = 1 - postList = [] - postCount = 1 - - allPosts = {} + all_posts = {} print("\nGETTING POSTS") - postsFile = createLogFile("POSTS") + posts_file = createLogFile("POSTS") - if SINGLE_POST: + if single_post: submission = posts - postCount += 1 + post_count += 1 try: details = {'POSTID': submission.id, 'TITLE': submission.title, @@ -217,12 +205,8 @@ def extractDetails(posts, SINGLE_POST=False): 'SUBREDDIT': submission.subreddit.display_name, 'UPVOTES': submission.score, 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime( - "%Y-%m-%d_%H-%M", - time.localtime(submission.created_utc) - ))} - if 'gallery' in submission.url: - details['CONTENTURL'] = genLinksifGallery(submission.media_metadata) + 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) + } except AttributeError: pass @@ -232,18 +216,17 @@ def extractDetails(posts, SINGLE_POST=False): if result is not None: details = {**details, **result} - postList.append(details) - postsFile.add({postCount: details}) + post_list.append(details) + posts_file.add({post_count: details}) else: try: for submission in posts: - - if postCount % 100 == 0: + if post_count % 100 == 0: sys.stdout.write("• ") sys.stdout.flush() - if postCount % 1000 == 0: + if post_count % 1000 == 0: sys.stdout.write("\n" + " " * 14) sys.stdout.flush() @@ -256,12 +239,8 @@ def extractDetails(posts, SINGLE_POST=False): 'SUBREDDIT': submission.subreddit.display_name, 'UPVOTES': submission.score, 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime( - "%Y-%m-%d_%H-%M", - time.localtime(submission.created_utc) - ))} - if 'gallery' in submission.url: - details['CONTENTURL'] = genLinksifGallery(submission.media_metadata) + 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) + } except AttributeError: continue @@ -274,52 +253,46 @@ def extractDetails(posts, SINGLE_POST=False): if result is not None: details = {**details, **result} - postList.append(details) + post_list.append(details) - allPosts[postCount] = details - postCount += 1 + all_posts[post_count] = details + post_count += 1 except KeyboardInterrupt: - print("\nKeyboardInterrupt", noPrint=True) + print("\nKeyboardInterrupt", no_print=True) - postsFile.add(allPosts) + posts_file.add(all_posts) - if len(postList) != 0: + if not len(post_list) == 0: print() - return postList - raise NoMatchingSubmissionFound("No matching submission was found") + return post_list + else: + raise NoMatchingSubmissionFound("No matching submission was found") def matchWithDownloader(submission): - - if 'gallery' in submission.url: - return{'TYPE':'gallery'} - - directLink = extractDirectLink(submission.url) - if directLink: - return {'TYPE': 'direct', - 'CONTENTURL': directLink} + direct_link = extractDirectLink(submission.url) + if direct_link: + return {'TYPE': 'direct', 'CONTENTURL': direct_link} if 'v.redd.it' in submission.domain: - bitrates = ["DASH_1080", "DASH_720", "DASH_600", - "DASH_480", "DASH_360", "DASH_240"] + bitrates = ["DASH_1080", "DASH_720", "DASH_600", "DASH_480", "DASH_360", "DASH_240"] for bitrate in bitrates: - videoURL = submission.url + "/" + bitrate + ".mp4" + video_url = submission.url + "/" + bitrate + ".mp4" try: - responseCode = urllib.request.urlopen(videoURL).getcode() + response_code = urllib.request.urlopen(video_url).getcode() except urllib.error.HTTPError: - responseCode = 0 + response_code = 0 - if responseCode == 200: - return {'TYPE': 'v.redd.it', 'CONTENTURL': videoURL} 
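# v.redd.it exposes a fixed set of DASH renditions, so the matcher in this
# hunk probes them from the highest bitrate down and keeps the first URL that
# answers with HTTP 200. A minimal standalone sketch of the same probe,
# assuming a bare v.redd.it post URL (the helper name is illustrative only):
import urllib.request
from urllib.error import HTTPError

def _probe_dash_url(base_url: str):
    for bitrate in ("DASH_1080", "DASH_720", "DASH_600",
                    "DASH_480", "DASH_360", "DASH_240"):
        candidate = "{}/{}.mp4".format(base_url, bitrate)
        try:
            # urlopen raises HTTPError on 4xx/5xx, so a clean return here
            # normally means the rendition exists
            if urllib.request.urlopen(candidate).getcode() == 200:
                return candidate
        except HTTPError:
            continue  # try the next lower bitrate
    return None  # no downloadable rendition found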
+ if response_code == 200: + return {'TYPE': 'v.redd.it', 'CONTENTURL': video_url} if 'gfycat' in submission.domain: return {'TYPE': 'gfycat'} - if 'youtube' in submission.domain \ - and 'watch' in submission.url: + if 'youtube' in submission.domain and 'watch' in submission.url: return {'TYPE': 'youtube'} if 'youtu.be' in submission.domain: @@ -342,33 +315,25 @@ def matchWithDownloader(submission): if 'reddit.com/gallery' in submission.url: return {'TYPE': 'gallery'} - if submission.is_self and 'self' not in GLOBAL.arguments.skip: + elif submission.is_self and 'self' not in GLOBAL.arguments.skip: return {'TYPE': 'self', 'CONTENT': submission.selftext} -def extractDirectLink(URL): +def extractDirectLink(url): """Check if link is a direct image link. If so, return URL, if not, return False """ + image_types = ['jpg', 'jpeg', 'png', 'mp4', 'webm', 'gif'] + if url[-1] == "/": + url = url[:-1] - imageTypes = ['jpg', 'jpeg', 'png', 'mp4', 'webm', 'gif'] - if URL[-1] == "/": - URL = URL[:-1] + if "i.reddituploads.com" in url: + return url - if "i.reddituploads.com" in URL: - return URL - - for extension in imageTypes: - if extension == URL.split(".")[-1]: - return URL - - return None - -def genLinksifGallery(metadata): - galleryImgUrls = list() - if metadata is not None: - for key in metadata: - galleryImgUrls.append(metadata[key]['s']['u'].split('?')[0].replace('preview','i')) - return galleryImgUrls + for extension in image_types: + if extension == url.split(".")[-1]: + return url + else: + return None diff --git a/src/utils.py b/src/utils.py index 402db90..5debfe9 100644 --- a/src/utils.py +++ b/src/utils.py @@ -8,7 +8,6 @@ from src.jsonHelper import JsonFile class GLOBAL: """Declare global variables""" - RUN_TIME = "" config = {'imgur_client_id': None, 'imgur_client_secret': None} arguments = None @@ -17,54 +16,47 @@ class GLOBAL: configDirectory = "" reddit_client_id = "U-6gk4ZCh3IeNQ" reddit_client_secret = "7CZHY6AmKweZME5s50SfDGylaPg" - @staticmethod - def downloadedPosts(): return [] printVanilla = print - log_stream = None + @staticmethod + def downloadedPosts(): + return [] -def createLogFile(TITLE): + +def createLogFile(title): """Create a log file with given name inside a folder time stampt in its name and put given arguments inside \"HEADER\" key """ + folder_directory = GLOBAL.directory / "LOG_FILES" / GLOBAL.RUN_TIME - folderDirectory = GLOBAL.directory / "LOG_FILES" / GLOBAL.RUN_TIME + log_filename = title.upper() + '.json' - logFilename = TITLE.upper() + '.json' + if not path.exists(folder_directory): + makedirs(folder_directory) - if not path.exists(folderDirectory): - makedirs(folderDirectory) + file = JsonFile(folder_directory / Path(log_filename)) + header = " ".join(sys.argv) + file.add({"HEADER": header}) - FILE = JsonFile(folderDirectory / Path(logFilename)) - HEADER = " ".join(sys.argv) - FILE.add({"HEADER": HEADER}) - - return FILE + return file -def printToFile(*args, noPrint=False, **kwargs): +def printToFile(*args, no_print=False, **kwargs): """Print to both CONSOLE and CONSOLE LOG file in a folder time stampt in the name """ + folder_directory = GLOBAL.directory / Path("LOG_FILES") / Path(GLOBAL.RUN_TIME) - folderDirectory = GLOBAL.directory / \ - Path("LOG_FILES") / Path(GLOBAL.RUN_TIME) - - if not noPrint or \ - GLOBAL.arguments.verbose or \ - "file" in kwargs: - + if not no_print or GLOBAL.arguments.verbose or "file" in kwargs: print(*args, **kwargs) - if not path.exists(folderDirectory): - makedirs(folderDirectory) + if not path.exists(folder_directory): + 
makedirs(folder_directory) if "file" not in kwargs: - with io.open( - folderDirectory / "CONSOLE_LOG.txt", "a", encoding="utf-8" - ) as FILE: + with io.open(folder_directory / "CONSOLE_LOG.txt", "a", encoding="utf-8") as FILE: print(*args, file=FILE, **kwargs) @@ -73,19 +65,17 @@ def nameCorrector(string, reference=None): with underscore (_) and shorten it. Return the string """ - - LIMIT = 247 - - stringLength = len(string) + limit = 247 + string_length = len(string) if reference: - referenceLenght = len(reference) - totalLenght = referenceLenght + reference_length = len(reference) + total_lenght = reference_length else: - totalLenght = stringLength + total_lenght = string_length - if totalLenght > LIMIT: - limit = LIMIT - referenceLenght + if total_lenght > limit: + limit -= reference_length string = string[:limit - 1] string = string.replace(" ", "_") @@ -93,8 +83,7 @@ def nameCorrector(string, reference=None): if len(string.split('\n')) > 1: string = "".join(string.split('\n')) - BAD_CHARS = ['\\', '/', ':', '*', '?', '"', '<', - '>', '|', '#', '.', '@', '“', '’', '\'', '!'] - string = "".join([i if i not in BAD_CHARS else "_" for i in string]) + bad_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '#', '.', '@', '“', '’', '\'', '!'] + string = "".join([i if i not in bad_chars else "_" for i in string]) return string From 185335e60bd7ff7c19cadc04a055d0f382acc7cc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 6 Feb 2021 22:29:13 +1000 Subject: [PATCH 002/276] Add tentative typing --- script.py | 2 -- src/config.py | 6 +++--- src/downloaders/Direct.py | 3 ++- src/downloaders/Erome.py | 5 +++-- src/downloaders/Gfycat.py | 5 +++-- src/downloaders/Imgur.py | 16 +++++++++------- src/downloaders/downloaderUtils.py | 8 ++++---- src/downloaders/gallery.py | 7 ++++--- src/downloaders/gifDeliveryNetwork.py | 5 +++-- src/downloaders/redgifs.py | 5 +++-- src/downloaders/selfPost.py | 5 +++-- src/downloaders/vreddit.py | 10 ++++++++-- src/downloaders/youtube.py | 7 ++++--- src/jsonHelper.py | 16 ++++++++-------- src/parser.py | 12 +++++------- src/programMode.py | 7 ++++--- src/reddit.py | 11 +++++------ src/searcher.py | 10 ++++++---- src/store.py | 6 +++--- src/utils.py | 7 ++++--- 20 files changed, 84 insertions(+), 69 deletions(-) diff --git a/script.py b/script.py index 87800b9..0053b06 100644 --- a/script.py +++ b/script.py @@ -87,7 +87,6 @@ def isPostExists(post, directory): def downloadPost(submission, directory): - downloaders = { "imgur": Imgur, "gfycat": Gfycat, "erome": Erome, "direct": Direct, "self": SelfPost, "redgifs": Redgifs, "gifdeliverynetwork": GifDeliveryNetwork, @@ -101,7 +100,6 @@ def downloadPost(submission, directory): raise NoSuitablePost - def download(submissions): """Analyze list of submissions and call the right function to download each one, catch errors, update the log files diff --git a/src/config.py b/src/config.py index 3f9f17a..2daaa64 100644 --- a/src/config.py +++ b/src/config.py @@ -5,11 +5,11 @@ from src.utils import nameCorrector class Config: - def __init__(self, filename): + def __init__(self, filename: str): self.filename = filename self.file = JsonFile(self.filename) - def generate(self): + def generate(self) -> dict: self._validateCredentials() self._readCustomFileName() self._readCustomFolderPath() @@ -80,7 +80,7 @@ Existing default options:""", None if "options" not in self.file.read() else sel self.file.add({"options": options}) - def _readDefaultOptions(self, path=None): + def _readDefaultOptions(self): content = self.file.read() if 
"options" not in content: self.file.add({"options": ""}) diff --git a/src/downloaders/Direct.py b/src/downloaders/Direct.py index 44bbe61..7f89081 100644 --- a/src/downloaders/Direct.py +++ b/src/downloaders/Direct.py @@ -1,11 +1,12 @@ import os +import pathlib from src.downloaders.downloaderUtils import getExtension, getFile from src.utils import GLOBAL class Direct: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): post['EXTENSION'] = getExtension(post['CONTENTURL']) if not os.path.exists(directory): os.makedirs(directory) diff --git a/src/downloaders/Erome.py b/src/downloaders/Erome.py index 9283131..f52bef3 100644 --- a/src/downloaders/Erome.py +++ b/src/downloaders/Erome.py @@ -1,4 +1,5 @@ import os +import pathlib import urllib.error import urllib.request from html.parser import HTMLParser @@ -10,7 +11,7 @@ from src.utils import printToFile as print class Erome: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): try: images = self.getLinks(post['CONTENTURL']) except urllib.error.HTTPError: @@ -80,7 +81,7 @@ class Erome: elif how_many_downloaded + duplicates < images_length: raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") - def getLinks(self, url): + def getLinks(self, url: str) -> list[str]: content = [] line_number = None diff --git a/src/downloaders/Gfycat.py b/src/downloaders/Gfycat.py index 3cea2c2..7e06f51 100644 --- a/src/downloaders/Gfycat.py +++ b/src/downloaders/Gfycat.py @@ -8,11 +8,12 @@ from src.downloaders.downloaderUtils import getExtension, getFile from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork from src.errors import NotADownloadableLinkError from src.utils import GLOBAL +import pathlib class Gfycat: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: @@ -29,7 +30,7 @@ class Gfycat: getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod - def getLink(url): + def getLink(url: str) -> str: """Extract direct link to the video from page's source and return it """ diff --git a/src/downloaders/Imgur.py b/src/downloaders/Imgur.py index 239fc2d..32233c6 100644 --- a/src/downloaders/Imgur.py +++ b/src/downloaders/Imgur.py @@ -1,5 +1,7 @@ import json import os +import pathlib + import requests from src.downloaders.Direct import Direct @@ -14,13 +16,13 @@ class Imgur: imgur_image_domain = "https://i.imgur.com/" - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): link = post['CONTENTURL'] if link.endswith(".gifv"): link = link.replace(".gifv", ".mp4") Direct(directory, {**post, 'CONTENTURL': link}) - return None + return self.raw_data = self.getData(link) @@ -35,7 +37,7 @@ class Imgur: else: self.download(self.raw_data) - def downloadAlbum(self, images): + def downloadAlbum(self, images: dict): folder_name = GLOBAL.config['filename'].format(**self.post) folder_dir = self.directory / folder_name @@ -92,7 +94,7 @@ class Imgur: elif how_many_downloaded + duplicates < images_length: raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") - def download(self, image): + def download(self, image: dict): extension = self.validateExtension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension @@ -102,11 +104,11 @@ class Imgur: getFile(filename, short_filename, self.directory, image_url) @property - def isAlbum(self): + def 
isAlbum(self) -> bool: return "album_images" in self.raw_data @staticmethod - def getData(link): + def getData(link: str) -> dict: cookies = {"over18": "1", "postpagebeta": "0"} res = requests.get(link, cookies=cookies) if res.status_code != 200: @@ -135,7 +137,7 @@ class Imgur: return json.loads(data) @staticmethod - def validateExtension(string): + def validateExtension(string: str) -> str: possible_extensions = [".jpg", ".png", ".mp4", ".gif"] for extension in possible_extensions: diff --git a/src/downloaders/downloaderUtils.py b/src/downloaders/downloaderUtils.py index 110e971..c94f1d4 100644 --- a/src/downloaders/downloaderUtils.py +++ b/src/downloaders/downloaderUtils.py @@ -9,7 +9,7 @@ from src.utils import GLOBAL from src.utils import printToFile as print -def dlProgress(count, block_size, total_size): +def dlProgress(count: int, block_size: int, total_size: int): """Function for writing download progress to console """ download_mbs = int(count * block_size * (10 ** (-6))) @@ -18,7 +18,7 @@ def dlProgress(count, block_size, total_size): sys.stdout.flush() -def getExtension(link): +def getExtension(link: str): """Extract file extension from image link. If didn't find any, return '.jpg' """ @@ -34,7 +34,7 @@ def getExtension(link): return '.mp4' -def getFile(filename, short_filename, folder_dir, image_url, indent=0, silent=False): +def getFile(filename: str, short_filename: str, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False): formats = { "videos": [".mp4", ".webm"], "images": [".jpg", ".jpeg", ".png", ".bmp"], @@ -101,7 +101,7 @@ def getFile(filename, short_filename, folder_dir, image_url, indent=0, silent=Fa raise FailedToDownload -def createHash(filename): +def createHash(filename: str) -> str: hash_md5 = hashlib.md5() with open(filename, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): diff --git a/src/downloaders/gallery.py b/src/downloaders/gallery.py index d5cbac4..4f9a1c4 100644 --- a/src/downloaders/gallery.py +++ b/src/downloaders/gallery.py @@ -3,6 +3,7 @@ import os import urllib import requests +import pathlib from src.downloaders.downloaderUtils import getFile from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, @@ -12,7 +13,7 @@ from src.utils import printToFile as print class Gallery: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post): link = post['CONTENTURL'] self.raw_data = self.getData(link) @@ -36,7 +37,7 @@ class Gallery: self.downloadAlbum(images, count) @staticmethod - def getData(link): + def getData(link: str) -> dict: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", @@ -59,7 +60,7 @@ class Gallery: data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) return data - def downloadAlbum(self, images, count): + def downloadAlbum(self, images: dict, count: int): folder_name = GLOBAL.config['filename'].format(**self.post) folder_dir = self.directory / folder_name diff --git a/src/downloaders/gifDeliveryNetwork.py b/src/downloaders/gifDeliveryNetwork.py index a41b7ea..666fd9f 100644 --- a/src/downloaders/gifDeliveryNetwork.py +++ b/src/downloaders/gifDeliveryNetwork.py @@ -1,4 +1,5 @@ import os +import pathlib import urllib.request from bs4 import BeautifulSoup @@ -9,7 +10,7 @@ from src.utils import 
GLOBAL class GifDeliveryNetwork: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: @@ -26,7 +27,7 @@ class GifDeliveryNetwork: getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod - def getLink(url): + def getLink(url: str) -> str: """Extract direct link to the video from page's source and return it """ diff --git a/src/downloaders/redgifs.py b/src/downloaders/redgifs.py index b12c17d..a0af7b7 100644 --- a/src/downloaders/redgifs.py +++ b/src/downloaders/redgifs.py @@ -1,5 +1,6 @@ import json import os +import pathlib import urllib.request from bs4 import BeautifulSoup @@ -10,7 +11,7 @@ from src.utils import GLOBAL class Redgifs: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: @@ -27,7 +28,7 @@ class Redgifs: getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod - def getLink(url): + def getLink(url: str) -> str: """Extract direct link to the video from page's source and return it """ diff --git a/src/downloaders/selfPost.py b/src/downloaders/selfPost.py index bae8b88..32801c9 100644 --- a/src/downloaders/selfPost.py +++ b/src/downloaders/selfPost.py @@ -1,6 +1,7 @@ from src.utils import printToFile as print import io import os +import pathlib from pathlib import Path from src.errors import FileAlreadyExistsError, TypeInSkip @@ -11,7 +12,7 @@ VanillaPrint = print class SelfPost: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): if "self" in GLOBAL.arguments.skip: raise TypeInSkip @@ -36,7 +37,7 @@ class SelfPost: self.writeToFile(file_dir, post) @staticmethod - def writeToFile(directory, post): + def writeToFile(directory: pathlib.Path, post: dict): """Self posts are formatted here""" content = ("## [" + post["TITLE"] diff --git a/src/downloaders/vreddit.py b/src/downloaders/vreddit.py index 16f5296..bf5e98a 100644 --- a/src/downloaders/vreddit.py +++ b/src/downloaders/vreddit.py @@ -1,4 +1,5 @@ import os +import pathlib import subprocess from src.downloaders.downloaderUtils import getFile @@ -7,7 +8,7 @@ from src.utils import printToFile as print class VReddit: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): extension = ".mp4" if not os.path.exists(directory): os.makedirs(directory) @@ -39,7 +40,12 @@ class VReddit: os.rename(directory / video_name, directory / filename) @staticmethod - def _mergeAudio(video, audio, filename, short_filename, directory): + def _mergeAudio( + video: pathlib.Path, + audio: pathlib.Path, + filename: pathlib.Path, + short_filename, + directory: pathlib.Path): input_video = str(directory / video) input_audio = str(directory / audio) diff --git a/src/downloaders/youtube.py b/src/downloaders/youtube.py index 0b0f0e2..9df78b1 100644 --- a/src/downloaders/youtube.py +++ b/src/downloaders/youtube.py @@ -1,4 +1,5 @@ import os +import pathlib import sys import youtube_dl @@ -11,7 +12,7 @@ from src.utils import printToFile as print class Youtube: - def __init__(self, directory, post): + def __init__(self, directory: pathlib.Path, post: dict): if not os.path.exists(directory): os.makedirs(directory) @@ -20,7 +21,7 @@ class Youtube: self.download(filename, directory, post['CONTENTURL']) - def download(self, filename, directory, url): + def download(self, filename: str, directory: 
pathlib.Path, url: str): ydl_opts = { "format": "best", "outtmpl": str(directory / (filename + ".%(ext)s")), @@ -36,7 +37,7 @@ class Youtube: if GLOBAL.arguments.no_dupes: try: - file_hash = createHash(location) + file_hash = createHash(str(location)) except FileNotFoundError: return None if file_hash in GLOBAL.downloadedPosts(): diff --git a/src/jsonHelper.py b/src/jsonHelper.py index 3f990f0..aee4398 100644 --- a/src/jsonHelper.py +++ b/src/jsonHelper.py @@ -1,5 +1,5 @@ import json -from os import path, remove +import os from src.errors import InvalidJSONFile @@ -12,19 +12,19 @@ class JsonFile: file_dir = "" - def __init__(self, file_dir): + def __init__(self, file_dir: str): self.file_dir = file_dir - if not path.exists(self.file_dir): + if not os.path.exists(self.file_dir): self.__writeToFile({}, create=True) - def read(self): + def read(self) -> dict: try: with open(self.file_dir, 'r') as f: return json.load(f) except json.decoder.JSONDecodeError: raise InvalidJSONFile(f"{self.file_dir} cannot be read") - def add(self, to_be_added, sub=None): + def add(self, to_be_added: dict, sub=None) -> dict: """Takes a dictionary and merges it with json file. It uses new key's value if a key already exists. Returns the new content as a dictionary. @@ -37,7 +37,7 @@ class JsonFile: self.__writeToFile(data) return self.read() - def delete(self, *delete_keys): + def delete(self, *delete_keys: str): """Delete given keys from JSON file. Returns the new content as a dictionary. """ @@ -50,8 +50,8 @@ class JsonFile: return False self.__writeToFile(data) - def __writeToFile(self, content, create=False): + def __writeToFile(self, content: (dict, list, tuple), create: bool = False): if not create: - remove(self.file_dir) + os.remove(self.file_dir) with open(self.file_dir, 'w') as f: json.dump(content, f, indent=4) diff --git a/src/parser.py b/src/parser.py index 1da6436..f319cb6 100644 --- a/src/parser.py +++ b/src/parser.py @@ -6,7 +6,7 @@ except ModuleNotFoundError: from errors import InvalidRedditLink -def QueryParser(passed_queries, index): +def QueryParser(passed_queries: str) -> dict: extracted_queries = {} question_mark_index = passed_queries.index("?") @@ -26,7 +26,7 @@ def QueryParser(passed_queries, index): return extracted_queries -def LinkParser(link): +def LinkParser(link: str) -> dict: result = {} short_link = False @@ -81,9 +81,7 @@ def LinkParser(link): result["user"] = "me" for index in range(len(splitted_link)): - if splitted_link[index] in [ - "hot", "top", "new", "controversial", "rising" - ]: + if splitted_link[index] in ["hot", "top", "new", "controversial", "rising"]: result["sort"] = splitted_link[index] @@ -101,7 +99,7 @@ def LinkParser(link): result["upvoted"] = True elif "?" 
in splitted_link[index]: - parsed_query = QueryParser(splitted_link[index], index) + parsed_query = QueryParser(splitted_link[index]) if parsed_query["HEADER"] == "search": del parsed_query["HEADER"] result["search"] = parsed_query @@ -124,7 +122,7 @@ def LinkParser(link): return result -def LinkDesigner(link): +def LinkDesigner(link) -> dict: attributes = LinkParser(link) mode = {} diff --git a/src/programMode.py b/src/programMode.py index 5fdd0ba..b094458 100644 --- a/src/programMode.py +++ b/src/programMode.py @@ -3,15 +3,16 @@ from pathlib import Path from src.errors import InvalidSortingType, ProgramModeError, RedditorNameError, SearchModeError from src.parser import LinkDesigner +import argparse class ProgramMode: - def __init__(self, arguments): + def __init__(self, arguments: argparse.Namespace): self.arguments = arguments - def generate(self): + def generate(self) -> dict: try: self._validateProgramMode() except ProgramModeError: @@ -82,7 +83,7 @@ class ProgramMode: return program_mode @staticmethod - def _chooseFrom(choices): + def _chooseFrom(choices: list[str]): print() choices_by_index = list(str(x) for x in range(len(choices) + 1)) for i in range(len(choices)): diff --git a/src/reddit.py b/src/reddit.py index a953c87..87a81f0 100644 --- a/src/reddit.py +++ b/src/reddit.py @@ -13,7 +13,7 @@ from src.utils import GLOBAL class Reddit: - def __init__(self, refresh_token=None): + def __init__(self, refresh_token: str = None): self.SCOPES = ['identity', 'history', 'read', 'save'] self.PORT = 7634 self.refresh_token = refresh_token @@ -24,7 +24,7 @@ class Reddit: "user_agent": str(socket.gethostname()) } - def begin(self): + def begin(self) -> praw.Reddit: if self.refresh_token: self.arguments["refresh_token"] = self.refresh_token self.redditInstance = praw.Reddit(**self.arguments) @@ -46,7 +46,7 @@ class Reddit: reddit.user.me()), "reddit": refresh_token}, "credentials") return self.redditInstance - def recieve_connection(self): + def recieve_connection(self) -> socket: """Wait for and then return a connected socket.. Opens a TCP connection on port 8080, and waits for a single client. """ @@ -58,13 +58,12 @@ class Reddit: server.close() return client - @staticmethod - def send_message(client, message): + def send_message(self, client: socket, message: str): """Send message to client and close the connection.""" client.send('HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')) client.close() - def getRefreshToken(self, *scopes): + def getRefreshToken(self, scopes: list[str]) -> tuple[praw.Reddit, str]: state = str(random.randint(0, 65000)) url = self.redditInstance.auth.url(scopes, state, 'permanent') print("---Setting up the Reddit API---\n") diff --git a/src/searcher.py b/src/searcher.py index 720ba38..2a2c3d0 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -8,12 +8,14 @@ from prawcore.exceptions import Forbidden, NotFound from src.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, NoPrawSupport) from src.reddit import Reddit +from praw.models.listing.generator import ListingGenerator from src.utils import GLOBAL, createLogFile, printToFile +from praw.models import Submission print = printToFile -def getPosts(program_mode): +def getPosts(program_mode: dict) -> list[dict]: """Call PRAW regarding to arguments and pass it to extractDetails. Return what extractDetails has returned. 
""" @@ -180,7 +182,7 @@ def getPosts(program_mode): return extractDetails(reddit.submission(url=program_mode["post"]), single_post=True) -def extractDetails(posts, single_post=False): +def extractDetails(posts: (ListingGenerator, Submission), single_post=False) -> list[dict]: """Check posts and decide if it can be downloaded. If so, create a dictionary with post details and append them to a list. Write all of posts to file. Return the list @@ -270,7 +272,7 @@ def extractDetails(posts, single_post=False): raise NoMatchingSubmissionFound("No matching submission was found") -def matchWithDownloader(submission): +def matchWithDownloader(submission: Submission) -> dict[str, str]: direct_link = extractDirectLink(submission.url) if direct_link: return {'TYPE': 'direct', 'CONTENTURL': direct_link} @@ -320,7 +322,7 @@ def matchWithDownloader(submission): 'CONTENT': submission.selftext} -def extractDirectLink(url): +def extractDirectLink(url: str) -> (bool, str): """Check if link is a direct image link. If so, return URL, if not, return False diff --git a/src/store.py b/src/store.py index 2f24afa..79cdf43 100644 --- a/src/store.py +++ b/src/store.py @@ -2,7 +2,7 @@ from os import path class Store: - def __init__(self, directory=None): + def __init__(self, directory: str = None): self.directory = directory if self.directory: if path.exists(directory): @@ -15,10 +15,10 @@ class Store: else: self.list = [] - def __call__(self): + def __call__(self) -> list: return self.list - def add(self, data): + def add(self, data: dict): self.list.append(data) if self.directory: with open(self.directory, 'a') as f: diff --git a/src/utils.py b/src/utils.py index 5debfe9..731307b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -2,6 +2,7 @@ import io import sys from os import makedirs, path from pathlib import Path +from typing import Optional from src.jsonHelper import JsonFile @@ -20,11 +21,11 @@ class GLOBAL: log_stream = None @staticmethod - def downloadedPosts(): + def downloadedPosts() -> list: return [] -def createLogFile(title): +def createLogFile(title: str) -> JsonFile: """Create a log file with given name inside a folder time stampt in its name and put given arguments inside \"HEADER\" key @@ -60,7 +61,7 @@ def printToFile(*args, no_print=False, **kwargs): print(*args, file=FILE, **kwargs) -def nameCorrector(string, reference=None): +def nameCorrector(string: str, reference: Optional[str] = None) -> str: """Swap strange characters from given string with underscore (_) and shorten it. 
Return the string From d8a1204d8b8236d5d01562db0d48f633d91c3a89 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 11:05:18 +1000 Subject: [PATCH 003/276] Move to standard module structure --- .gitignore | 2 +- {src => bulkredditdownloader}/__init__.py | 0 script.py => bulkredditdownloader/__main__.py | 47 +++++++++---------- {src => bulkredditdownloader}/arguments.py | 0 {src => bulkredditdownloader}/config.py | 6 +-- .../downloaders/Direct.py | 4 +- .../downloaders/Erome.py | 8 ++-- .../downloaders/Gfycat.py | 8 ++-- .../downloaders/Imgur.py | 12 ++--- .../downloaders/__init__.py | 0 .../downloaders/downloaderUtils.py | 6 +-- .../downloaders/gallery.py | 10 ++-- .../downloaders/gifDeliveryNetwork.py | 6 +-- .../downloaders/redgifs.py | 6 +-- .../downloaders/selfPost.py | 6 +-- .../downloaders/vreddit.py | 6 +-- .../downloaders/youtube.py | 8 ++-- {src => bulkredditdownloader}/errors.py | 0 {src => bulkredditdownloader}/jsonHelper.py | 2 +- {src => bulkredditdownloader}/parser.py | 2 +- {src => bulkredditdownloader}/programMode.py | 4 +- {src => bulkredditdownloader}/reddit.py | 6 +-- {src => bulkredditdownloader}/searcher.py | 8 ++-- {src => bulkredditdownloader}/store.py | 0 {src => bulkredditdownloader}/utils.py | 2 +- setup.py | 2 +- 26 files changed, 80 insertions(+), 81 deletions(-) rename {src => bulkredditdownloader}/__init__.py (100%) rename script.py => bulkredditdownloader/__main__.py (87%) rename {src => bulkredditdownloader}/arguments.py (100%) rename {src => bulkredditdownloader}/config.py (96%) rename {src => bulkredditdownloader}/downloaders/Direct.py (77%) rename {src => bulkredditdownloader}/downloaders/Erome.py (93%) rename {src => bulkredditdownloader}/downloaders/Gfycat.py (84%) rename {src => bulkredditdownloader}/downloaders/Imgur.py (91%) rename {src => bulkredditdownloader}/downloaders/__init__.py (100%) rename {src => bulkredditdownloader}/downloaders/downloaderUtils.py (94%) rename {src => bulkredditdownloader}/downloaders/gallery.py (91%) rename {src => bulkredditdownloader}/downloaders/gifDeliveryNetwork.py (88%) rename {src => bulkredditdownloader}/downloaders/redgifs.py (89%) rename {src => bulkredditdownloader}/downloaders/selfPost.py (90%) rename {src => bulkredditdownloader}/downloaders/vreddit.py (92%) rename {src => bulkredditdownloader}/downloaders/youtube.py (86%) rename {src => bulkredditdownloader}/errors.py (100%) rename {src => bulkredditdownloader}/jsonHelper.py (96%) rename {src => bulkredditdownloader}/parser.py (99%) rename {src => bulkredditdownloader}/programMode.py (98%) rename {src => bulkredditdownloader}/reddit.py (95%) rename {src => bulkredditdownloader}/searcher.py (97%) rename {src => bulkredditdownloader}/store.py (100%) rename {src => bulkredditdownloader}/utils.py (97%) diff --git a/.gitignore b/.gitignore index 1ab73fc..06bac55 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ build/ dist/ MANIFEST __pycache__/ -src/__pycache__/ +bulkredditdownloader/__pycache__/ config.json env/ .vscode/ \ No newline at end of file diff --git a/src/__init__.py b/bulkredditdownloader/__init__.py similarity index 100% rename from src/__init__.py rename to bulkredditdownloader/__init__.py diff --git a/script.py b/bulkredditdownloader/__main__.py similarity index 87% rename from script.py rename to bulkredditdownloader/__main__.py index 0053b06..bdc053f 100644 --- a/script.py +++ b/bulkredditdownloader/__main__.py @@ -12,28 +12,27 @@ from io import StringIO from pathlib import Path from prawcore.exceptions import 
InsufficientScope -from src.arguments import Arguments -from src.config import Config -from src.downloaders.Direct import Direct -from src.downloaders.Erome import Erome -from src.downloaders.gallery import Gallery -from src.downloaders.Gfycat import Gfycat -from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork -from src.downloaders.Imgur import Imgur -from src.downloaders.Gallery import Gallery -from src.downloaders.redgifs import Redgifs -from src.downloaders.selfPost import SelfPost -from src.downloaders.vreddit import VReddit -from src.downloaders.youtube import Youtube -from src.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError, - ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError, - TypeInSkip, full_exc_info) -from src.jsonHelper import JsonFile -from src.programMode import ProgramMode -from src.reddit import Reddit -from src.searcher import getPosts -from src.store import Store -from src.utils import GLOBAL, createLogFile, nameCorrector, printToFile +from bulkredditdownloader.arguments import Arguments +from bulkredditdownloader.config import Config +from bulkredditdownloader.downloaders.Direct import Direct +from bulkredditdownloader.downloaders.Erome import Erome +from bulkredditdownloader.downloaders.gallery import Gallery +from bulkredditdownloader.downloaders.Gfycat import Gfycat +from bulkredditdownloader.downloaders.gifDeliveryNetwork import GifDeliveryNetwork +from bulkredditdownloader.downloaders.Imgur import Imgur +from bulkredditdownloader.downloaders.redgifs import Redgifs +from bulkredditdownloader.downloaders.selfPost import SelfPost +from bulkredditdownloader.downloaders.vreddit import VReddit +from bulkredditdownloader.downloaders.youtube import Youtube +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError, + ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError, + TypeInSkip, full_exc_info) +from bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.programMode import ProgramMode +from bulkredditdownloader.reddit import Reddit +from bulkredditdownloader.searcher import getPosts +from bulkredditdownloader.store import Store +from bulkredditdownloader.utils import GLOBAL, createLogFile, nameCorrector, printToFile from time import sleep @@ -330,11 +329,11 @@ if __name__ == "__main__": except KeyboardInterrupt: if GLOBAL.directory is None: - GLOBAL.directory = Path("..\\") + GLOBAL.directory = Path("../..\\") except Exception as exception: if GLOBAL.directory is None: - GLOBAL.directory = Path("..\\") + GLOBAL.directory = Path("../..\\") logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) print(GLOBAL.log_stream.getvalue()) diff --git a/src/arguments.py b/bulkredditdownloader/arguments.py similarity index 100% rename from src/arguments.py rename to bulkredditdownloader/arguments.py diff --git a/src/config.py b/bulkredditdownloader/config.py similarity index 96% rename from src/config.py rename to bulkredditdownloader/config.py index 2daaa64..eacfe97 100644 --- a/src/config.py +++ b/bulkredditdownloader/config.py @@ -1,6 +1,6 @@ -from src.reddit import Reddit -from src.jsonHelper import JsonFile -from src.utils import nameCorrector +from bulkredditdownloader.reddit import Reddit +from bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.utils import nameCorrector class Config: diff --git 
a/src/downloaders/Direct.py b/bulkredditdownloader/downloaders/Direct.py similarity index 77% rename from src/downloaders/Direct.py rename to bulkredditdownloader/downloaders/Direct.py index 7f89081..20aa09e 100644 --- a/src/downloaders/Direct.py +++ b/bulkredditdownloader/downloaders/Direct.py @@ -1,8 +1,8 @@ import os import pathlib -from src.downloaders.downloaderUtils import getExtension, getFile -from src.utils import GLOBAL +from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.utils import GLOBAL class Direct: diff --git a/src/downloaders/Erome.py b/bulkredditdownloader/downloaders/Erome.py similarity index 93% rename from src/downloaders/Erome.py rename to bulkredditdownloader/downloaders/Erome.py index f52bef3..40fc625 100644 --- a/src/downloaders/Erome.py +++ b/bulkredditdownloader/downloaders/Erome.py @@ -4,10 +4,10 @@ import urllib.error import urllib.request from html.parser import HTMLParser -from src.downloaders.downloaderUtils import getExtension, getFile -from src.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print class Erome: diff --git a/src/downloaders/Gfycat.py b/bulkredditdownloader/downloaders/Gfycat.py similarity index 84% rename from src/downloaders/Gfycat.py rename to bulkredditdownloader/downloaders/Gfycat.py index 7e06f51..1e6b09a 100644 --- a/src/downloaders/Gfycat.py +++ b/bulkredditdownloader/downloaders/Gfycat.py @@ -4,10 +4,10 @@ import urllib.request from bs4 import BeautifulSoup -from src.downloaders.downloaderUtils import getExtension, getFile -from src.downloaders.gifDeliveryNetwork import GifDeliveryNetwork -from src.errors import NotADownloadableLinkError -from src.utils import GLOBAL +from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.downloaders.gifDeliveryNetwork import GifDeliveryNetwork +from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.utils import GLOBAL import pathlib diff --git a/src/downloaders/Imgur.py b/bulkredditdownloader/downloaders/Imgur.py similarity index 91% rename from src/downloaders/Imgur.py rename to bulkredditdownloader/downloaders/Imgur.py index 32233c6..f4e6ff4 100644 --- a/src/downloaders/Imgur.py +++ b/bulkredditdownloader/downloaders/Imgur.py @@ -4,12 +4,12 @@ import pathlib import requests -from src.downloaders.Direct import Direct -from src.downloaders.downloaderUtils import getFile -from src.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, - NotADownloadableLinkError, TypeInSkip) -from src.utils import GLOBAL, nameCorrector -from src.utils import printToFile as print +from bulkredditdownloader.downloaders.Direct import Direct +from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, + NotADownloadableLinkError, TypeInSkip) +from bulkredditdownloader.utils import GLOBAL, nameCorrector +from bulkredditdownloader.utils import printToFile as print class Imgur: 
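With script.py relocated to bulkredditdownloader/__main__.py, the tool becomes runnable with Python's -m switch (a standard consequence of providing a __main__.py, assuming the package is on the import path), and every internal import resolves under the package name instead of src. A minimal sketch of the new call sites; note the Direct module keeps its pre-PEP8 capitalisation here, which a later patch renames:

    # Hypothetical usage after this restructure:
    #   python -m bulkredditdownloader
    from bulkredditdownloader.downloaders.Direct import Direct  # was: src.downloaders.Direct
    from bulkredditdownloader.utils import GLOBAL               # was: src.utils
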
diff --git a/src/downloaders/__init__.py b/bulkredditdownloader/downloaders/__init__.py similarity index 100% rename from src/downloaders/__init__.py rename to bulkredditdownloader/downloaders/__init__.py diff --git a/src/downloaders/downloaderUtils.py b/bulkredditdownloader/downloaders/downloaderUtils.py similarity index 94% rename from src/downloaders/downloaderUtils.py rename to bulkredditdownloader/downloaders/downloaderUtils.py index c94f1d4..e5d1043 100644 --- a/src/downloaders/downloaderUtils.py +++ b/bulkredditdownloader/downloaders/downloaderUtils.py @@ -4,9 +4,9 @@ import sys import urllib.request from pathlib import Path -from src.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print def dlProgress(count: int, block_size: int, total_size: int): diff --git a/src/downloaders/gallery.py b/bulkredditdownloader/downloaders/gallery.py similarity index 91% rename from src/downloaders/gallery.py rename to bulkredditdownloader/downloaders/gallery.py index 4f9a1c4..18b220d 100644 --- a/src/downloaders/gallery.py +++ b/bulkredditdownloader/downloaders/gallery.py @@ -5,11 +5,11 @@ import urllib import requests import pathlib -from src.downloaders.downloaderUtils import getFile -from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, - TypeInSkip) -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, + TypeInSkip) +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print class Gallery: diff --git a/src/downloaders/gifDeliveryNetwork.py b/bulkredditdownloader/downloaders/gifDeliveryNetwork.py similarity index 88% rename from src/downloaders/gifDeliveryNetwork.py rename to bulkredditdownloader/downloaders/gifDeliveryNetwork.py index 666fd9f..86fd4a9 100644 --- a/src/downloaders/gifDeliveryNetwork.py +++ b/bulkredditdownloader/downloaders/gifDeliveryNetwork.py @@ -4,9 +4,9 @@ import urllib.request from bs4 import BeautifulSoup -from src.downloaders.downloaderUtils import getExtension, getFile -from src.errors import NotADownloadableLinkError -from src.utils import GLOBAL +from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.utils import GLOBAL class GifDeliveryNetwork: diff --git a/src/downloaders/redgifs.py b/bulkredditdownloader/downloaders/redgifs.py similarity index 89% rename from src/downloaders/redgifs.py rename to bulkredditdownloader/downloaders/redgifs.py index a0af7b7..257f25b 100644 --- a/src/downloaders/redgifs.py +++ b/bulkredditdownloader/downloaders/redgifs.py @@ -5,9 +5,9 @@ import urllib.request from bs4 import BeautifulSoup -from src.downloaders.downloaderUtils import getExtension, getFile -from src.errors import NotADownloadableLinkError -from src.utils import GLOBAL +from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.errors import 
NotADownloadableLinkError +from bulkredditdownloader.utils import GLOBAL class Redgifs: diff --git a/src/downloaders/selfPost.py b/bulkredditdownloader/downloaders/selfPost.py similarity index 90% rename from src/downloaders/selfPost.py rename to bulkredditdownloader/downloaders/selfPost.py index 32801c9..fa03e7b 100644 --- a/src/downloaders/selfPost.py +++ b/bulkredditdownloader/downloaders/selfPost.py @@ -4,9 +4,9 @@ import os import pathlib from pathlib import Path -from src.errors import FileAlreadyExistsError, TypeInSkip -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print VanillaPrint = print diff --git a/src/downloaders/vreddit.py b/bulkredditdownloader/downloaders/vreddit.py similarity index 92% rename from src/downloaders/vreddit.py rename to bulkredditdownloader/downloaders/vreddit.py index bf5e98a..967c467 100644 --- a/src/downloaders/vreddit.py +++ b/bulkredditdownloader/downloaders/vreddit.py @@ -2,9 +2,9 @@ import os import pathlib import subprocess -from src.downloaders.downloaderUtils import getFile -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print class VReddit: diff --git a/src/downloaders/youtube.py b/bulkredditdownloader/downloaders/youtube.py similarity index 86% rename from src/downloaders/youtube.py rename to bulkredditdownloader/downloaders/youtube.py index 9df78b1..f482d79 100644 --- a/src/downloaders/youtube.py +++ b/bulkredditdownloader/downloaders/youtube.py @@ -4,10 +4,10 @@ import sys import youtube_dl -from src.downloaders.downloaderUtils import createHash -from src.errors import FileAlreadyExistsError -from src.utils import GLOBAL -from src.utils import printToFile as print +from bulkredditdownloader.downloaders.downloaderUtils import createHash +from bulkredditdownloader.errors import FileAlreadyExistsError +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print diff --git a/src/errors.py b/bulkredditdownloader/errors.py similarity index 100% rename from src/errors.py rename to bulkredditdownloader/errors.py diff --git a/src/jsonHelper.py b/bulkredditdownloader/jsonHelper.py similarity index 96% rename from src/jsonHelper.py rename to bulkredditdownloader/jsonHelper.py index aee4398..5f3f7bb 100644 --- a/src/jsonHelper.py +++ b/bulkredditdownloader/jsonHelper.py @@ -1,7 +1,7 @@ import json import os -from src.errors import InvalidJSONFile +from bulkredditdownloader.errors import InvalidJSONFile class JsonFile: diff --git a/src/parser.py b/bulkredditdownloader/parser.py similarity index 99% rename from src/parser.py rename to bulkredditdownloader/parser.py index f319cb6..e8a38f7 100644 --- a/src/parser.py +++ b/bulkredditdownloader/parser.py @@ -1,7 +1,7 @@ from pprint import pprint try: - from src.errors import InvalidRedditLink + from bulkredditdownloader.errors import InvalidRedditLink except ModuleNotFoundError: from errors import InvalidRedditLink diff --git a/src/programMode.py b/bulkredditdownloader/programMode.py similarity index 98% rename from src/programMode.py rename to bulkredditdownloader/programMode.py index b094458..f2361ac 100644 --- a/src/programMode.py +++ 
b/bulkredditdownloader/programMode.py @@ -1,8 +1,8 @@ import sys from pathlib import Path -from src.errors import InvalidSortingType, ProgramModeError, RedditorNameError, SearchModeError -from src.parser import LinkDesigner +from bulkredditdownloader.errors import InvalidSortingType, ProgramModeError, RedditorNameError, SearchModeError +from bulkredditdownloader.parser import LinkDesigner import argparse diff --git a/src/reddit.py b/bulkredditdownloader/reddit.py similarity index 95% rename from src/reddit.py rename to bulkredditdownloader/reddit.py index 87a81f0..1bfb261 100644 --- a/src/reddit.py +++ b/bulkredditdownloader/reddit.py @@ -5,9 +5,9 @@ import webbrowser import praw from prawcore.exceptions import ResponseException -from src.errors import RedditLoginFailed -from src.jsonHelper import JsonFile -from src.utils import GLOBAL +from bulkredditdownloader.errors import RedditLoginFailed +from bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.utils import GLOBAL diff --git a/src/searcher.py b/bulkredditdownloader/searcher.py similarity index 97% rename from src/searcher.py rename to bulkredditdownloader/searcher.py index 2a2c3d0..19bf1d3 100644 --- a/src/searcher.py +++ b/bulkredditdownloader/searcher.py @@ -5,11 +5,11 @@ from urllib.error import HTTPError from prawcore.exceptions import Forbidden, NotFound -from src.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, - NoPrawSupport) -from src.reddit import Reddit +from bulkredditdownloader.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, + NoPrawSupport) +from bulkredditdownloader.reddit import Reddit from praw.models.listing.generator import ListingGenerator -from src.utils import GLOBAL, createLogFile, printToFile +from bulkredditdownloader.utils import GLOBAL, createLogFile, printToFile from praw.models import Submission print = printToFile diff --git a/src/store.py b/bulkredditdownloader/store.py similarity index 100% rename from src/store.py rename to bulkredditdownloader/store.py diff --git a/src/utils.py b/bulkredditdownloader/utils.py similarity index 97% rename from src/utils.py rename to bulkredditdownloader/utils.py index 731307b..ec39a29 100644 --- a/src/utils.py +++ b/bulkredditdownloader/utils.py @@ -4,7 +4,7 @@ from os import makedirs, path from pathlib import Path from typing import Optional -from src.jsonHelper import JsonFile +from bulkredditdownloader.jsonHelper import JsonFile class GLOBAL: diff --git a/setup.py b/setup.py index cb687bd..ab78f46 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ ## python setup.py build import sys from cx_Freeze import setup, Executable -from script import __version__ +from bulkredditdownloader.__main__ import __version__ options = { "build_exe": { From 12b5fd351e173a5e72bb225566e7e91cf96fe297 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 11:06:05 +1000 Subject: [PATCH 004/276] Update gitignore to Python standard --- .gitignore | 147 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 06bac55..a81c8ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,138 @@ -.DS_Store -build/ -dist/ -MANIFEST -__pycache__/ -bulkredditdownloader/__pycache__/ -config.json -env/ -.vscode/ \ No newline at end of file +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging 
+.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ From be613949fe48bfd8637a6be0b0a3c2ec20526961 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 11:09:31 +1000 Subject: [PATCH 005/276] Rename files to conform to PEP8 --- bulkredditdownloader/__main__.py | 16 ++++++++-------- bulkredditdownloader/config.py | 2 +- .../downloaders/{Direct.py => direct.py} | 2 +- .../{downloaderUtils.py => downloader_utils.py} | 0 .../downloaders/{Erome.py => erome.py} | 2 +- bulkredditdownloader/downloaders/gallery.py | 2 +- .../downloaders/{Gfycat.py => gfycat.py} | 4 ++-- ...eliveryNetwork.py => gif_delivery_network.py} | 2 +- .../downloaders/{Imgur.py => imgur.py} | 4 ++-- bulkredditdownloader/downloaders/redgifs.py | 2 +- .../downloaders/{selfPost.py => self_post.py} | 0 bulkredditdownloader/downloaders/vreddit.py | 2 +- bulkredditdownloader/downloaders/youtube.py | 2 +- .../{jsonHelper.py => json_helper.py} | 0 .../{programMode.py => program_mode.py} | 0 bulkredditdownloader/reddit.py | 2 +- bulkredditdownloader/utils.py | 2 +- 17 files changed, 22 insertions(+), 22 deletions(-) rename bulkredditdownloader/downloaders/{Direct.py => direct.py} (85%) rename bulkredditdownloader/downloaders/{downloaderUtils.py => downloader_utils.py} (100%) rename bulkredditdownloader/downloaders/{Erome.py => erome.py} (98%) rename bulkredditdownloader/downloaders/{Gfycat.py => gfycat.py} (90%) rename bulkredditdownloader/downloaders/{gifDeliveryNetwork.py => gif_delivery_network.py} (95%) rename bulkredditdownloader/downloaders/{Imgur.py => imgur.py} (97%) rename 
bulkredditdownloader/downloaders/{selfPost.py => self_post.py} (100%) rename bulkredditdownloader/{jsonHelper.py => json_helper.py} (100%) rename bulkredditdownloader/{programMode.py => program_mode.py} (100%) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index bdc053f..773f719 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -14,21 +14,21 @@ from prawcore.exceptions import InsufficientScope from bulkredditdownloader.arguments import Arguments from bulkredditdownloader.config import Config -from bulkredditdownloader.downloaders.Direct import Direct -from bulkredditdownloader.downloaders.Erome import Erome +from bulkredditdownloader.downloaders.direct import Direct +from bulkredditdownloader.downloaders.erome import Erome from bulkredditdownloader.downloaders.gallery import Gallery -from bulkredditdownloader.downloaders.Gfycat import Gfycat -from bulkredditdownloader.downloaders.gifDeliveryNetwork import GifDeliveryNetwork -from bulkredditdownloader.downloaders.Imgur import Imgur +from bulkredditdownloader.downloaders.gfycat import Gfycat +from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork +from bulkredditdownloader.downloaders.imgur import Imgur from bulkredditdownloader.downloaders.redgifs import Redgifs -from bulkredditdownloader.downloaders.selfPost import SelfPost +from bulkredditdownloader.downloaders.self_post import SelfPost from bulkredditdownloader.downloaders.vreddit import VReddit from bulkredditdownloader.downloaders.youtube import Youtube from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError, ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError, TypeInSkip, full_exc_info) -from bulkredditdownloader.jsonHelper import JsonFile -from bulkredditdownloader.programMode import ProgramMode +from bulkredditdownloader.json_helper import JsonFile +from bulkredditdownloader.program_mode import ProgramMode from bulkredditdownloader.reddit import Reddit from bulkredditdownloader.searcher import getPosts from bulkredditdownloader.store import Store diff --git a/bulkredditdownloader/config.py b/bulkredditdownloader/config.py index eacfe97..36dec10 100644 --- a/bulkredditdownloader/config.py +++ b/bulkredditdownloader/config.py @@ -1,5 +1,5 @@ from bulkredditdownloader.reddit import Reddit -from bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.json_helper import JsonFile from bulkredditdownloader.utils import nameCorrector diff --git a/bulkredditdownloader/downloaders/Direct.py b/bulkredditdownloader/downloaders/direct.py similarity index 85% rename from bulkredditdownloader/downloaders/Direct.py rename to bulkredditdownloader/downloaders/direct.py index 20aa09e..9dd2c67 100644 --- a/bulkredditdownloader/downloaders/Direct.py +++ b/bulkredditdownloader/downloaders/direct.py @@ -1,7 +1,7 @@ import os import pathlib -from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/downloaderUtils.py b/bulkredditdownloader/downloaders/downloader_utils.py similarity index 100% rename from bulkredditdownloader/downloaders/downloaderUtils.py rename to bulkredditdownloader/downloaders/downloader_utils.py diff --git a/bulkredditdownloader/downloaders/Erome.py 
b/bulkredditdownloader/downloaders/erome.py similarity index 98% rename from bulkredditdownloader/downloaders/Erome.py rename to bulkredditdownloader/downloaders/erome.py index 40fc625..2710453 100644 --- a/bulkredditdownloader/downloaders/Erome.py +++ b/bulkredditdownloader/downloaders/erome.py @@ -4,7 +4,7 @@ import urllib.error import urllib.request from html.parser import HTMLParser -from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print diff --git a/bulkredditdownloader/downloaders/gallery.py b/bulkredditdownloader/downloaders/gallery.py index 18b220d..e3ec461 100644 --- a/bulkredditdownloader/downloaders/gallery.py +++ b/bulkredditdownloader/downloaders/gallery.py @@ -5,7 +5,7 @@ import urllib import requests import pathlib -from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.downloaders.downloader_utils import getFile from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/Gfycat.py b/bulkredditdownloader/downloaders/gfycat.py similarity index 90% rename from bulkredditdownloader/downloaders/Gfycat.py rename to bulkredditdownloader/downloaders/gfycat.py index 1e6b09a..7ab93b4 100644 --- a/bulkredditdownloader/downloaders/Gfycat.py +++ b/bulkredditdownloader/downloaders/gfycat.py @@ -4,8 +4,8 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile -from bulkredditdownloader.downloaders.gifDeliveryNetwork import GifDeliveryNetwork +from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL import pathlib diff --git a/bulkredditdownloader/downloaders/gifDeliveryNetwork.py b/bulkredditdownloader/downloaders/gif_delivery_network.py similarity index 95% rename from bulkredditdownloader/downloaders/gifDeliveryNetwork.py rename to bulkredditdownloader/downloaders/gif_delivery_network.py index 86fd4a9..486bc9a 100644 --- a/bulkredditdownloader/downloaders/gifDeliveryNetwork.py +++ b/bulkredditdownloader/downloaders/gif_delivery_network.py @@ -4,7 +4,7 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/Imgur.py b/bulkredditdownloader/downloaders/imgur.py similarity index 97% rename from bulkredditdownloader/downloaders/Imgur.py rename to bulkredditdownloader/downloaders/imgur.py index f4e6ff4..9b444d0 100644 --- a/bulkredditdownloader/downloaders/Imgur.py +++ b/bulkredditdownloader/downloaders/imgur.py @@ -4,8 +4,8 @@ import pathlib import requests -from bulkredditdownloader.downloaders.Direct import Direct 
-from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.downloaders.direct import Direct +from bulkredditdownloader.downloaders.downloader_utils import getFile from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL, nameCorrector diff --git a/bulkredditdownloader/downloaders/redgifs.py b/bulkredditdownloader/downloaders/redgifs.py index 257f25b..6d504c5 100644 --- a/bulkredditdownloader/downloaders/redgifs.py +++ b/bulkredditdownloader/downloaders/redgifs.py @@ -5,7 +5,7 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile +from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/selfPost.py b/bulkredditdownloader/downloaders/self_post.py similarity index 100% rename from bulkredditdownloader/downloaders/selfPost.py rename to bulkredditdownloader/downloaders/self_post.py diff --git a/bulkredditdownloader/downloaders/vreddit.py b/bulkredditdownloader/downloaders/vreddit.py index 967c467..74b776c 100644 --- a/bulkredditdownloader/downloaders/vreddit.py +++ b/bulkredditdownloader/downloaders/vreddit.py @@ -2,7 +2,7 @@ import os import pathlib import subprocess -from bulkredditdownloader.downloaders.downloaderUtils import getFile +from bulkredditdownloader.downloaders.downloader_utils import getFile from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print diff --git a/bulkredditdownloader/downloaders/youtube.py b/bulkredditdownloader/downloaders/youtube.py index f482d79..c5abc81 100644 --- a/bulkredditdownloader/downloaders/youtube.py +++ b/bulkredditdownloader/downloaders/youtube.py @@ -4,7 +4,7 @@ import sys import youtube_dl -from bulkredditdownloader.downloaders.downloaderUtils import createHash +from bulkredditdownloader.downloaders.downloader_utils import createHash from bulkredditdownloader.errors import FileAlreadyExistsError from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print diff --git a/bulkredditdownloader/jsonHelper.py b/bulkredditdownloader/json_helper.py similarity index 100% rename from bulkredditdownloader/jsonHelper.py rename to bulkredditdownloader/json_helper.py diff --git a/bulkredditdownloader/programMode.py b/bulkredditdownloader/program_mode.py similarity index 100% rename from bulkredditdownloader/programMode.py rename to bulkredditdownloader/program_mode.py diff --git a/bulkredditdownloader/reddit.py b/bulkredditdownloader/reddit.py index 1bfb261..392b860 100644 --- a/bulkredditdownloader/reddit.py +++ b/bulkredditdownloader/reddit.py @@ -6,7 +6,7 @@ import praw from prawcore.exceptions import ResponseException from bulkredditdownloader.errors import RedditLoginFailed -from bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.json_helper import JsonFile from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/utils.py b/bulkredditdownloader/utils.py index ec39a29..f63f159 100644 --- a/bulkredditdownloader/utils.py +++ b/bulkredditdownloader/utils.py @@ -4,7 +4,7 @@ from os import makedirs, path from pathlib import Path from typing import Optional -from 
bulkredditdownloader.jsonHelper import JsonFile +from bulkredditdownloader.json_helper import JsonFile class GLOBAL: From 0d839329e5fe94411fd84b9b5731698da7f3b575 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 11:33:19 +1000 Subject: [PATCH 006/276] Remove utils module for downloaders --- .../downloaders/base_downloader.py | 121 ++++++++++++++++++ bulkredditdownloader/downloaders/direct.py | 9 +- .../downloaders/downloader_utils.py | 109 ---------------- bulkredditdownloader/downloaders/erome.py | 13 +- bulkredditdownloader/downloaders/gallery.py | 16 +-- bulkredditdownloader/downloaders/gfycat.py | 10 +- .../downloaders/gif_delivery_network.py | 9 +- bulkredditdownloader/downloaders/imgur.py | 16 +-- bulkredditdownloader/downloaders/redgifs.py | 9 +- bulkredditdownloader/downloaders/self_post.py | 4 +- bulkredditdownloader/downloaders/vreddit.py | 11 +- bulkredditdownloader/downloaders/youtube.py | 8 +- 12 files changed, 175 insertions(+), 160 deletions(-) create mode 100644 bulkredditdownloader/downloaders/base_downloader.py delete mode 100644 bulkredditdownloader/downloaders/downloader_utils.py diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/downloaders/base_downloader.py new file mode 100644 index 0000000..297e31c --- /dev/null +++ b/bulkredditdownloader/downloaders/base_downloader.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# coding=utf-8 +import hashlib +import os +import sys +import urllib.request +from abc import ABC +from pathlib import Path + +from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip +from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.utils import printToFile as print + + +class BaseDownloader(ABC): + def __init__(self, directory: Path, post: dict): + self.directory = directory + self.post = post + + @staticmethod + def createHash(filename: str) -> str: + hash_md5 = hashlib.md5() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + @staticmethod + def dlProgress(count: int, block_size: int, total_size: int): + """Function for writing download progress to console """ + download_mbs = int(count * block_size * (10 ** (-6))) + file_size = int(total_size * (10 ** (-6))) + sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) + sys.stdout.flush() + + @staticmethod + def getFile( + filename: str, + short_filename: str, + folder_dir: Path, + image_url: str, + indent: int = 0, + silent: bool = False): + formats = { + "videos": [".mp4", ".webm"], + "images": [".jpg", ".jpeg", ".png", ".bmp"], + "gifs": [".gif"], + "self": [] + } + + for file_type in GLOBAL.arguments.skip: + for extension in formats[file_type]: + if extension in filename: + raise TypeInSkip + + if any(domain in image_url for domain in GLOBAL.arguments.skip_domain): + raise DomainInSkip + + headers = [ + ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " + "Safari/537.36 OPR/54.0.2952.64"), + ("Accept", "text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/webp,image/apng,*/*;q=0.8"), + ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), + ("Accept-Encoding", "none"), + ("Accept-Language", "en-US,en;q=0.8"), + ("Connection", "keep-alive") + ] + + if not os.path.exists(folder_dir): + os.makedirs(folder_dir) + + opener = urllib.request.build_opener() + if "imgur" not in image_url: + opener.addheaders 
= headers + urllib.request.install_opener(opener) + + if not silent: + print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") + + for i in range(3): + file_dir = Path(folder_dir) / filename + temp_dir = Path(folder_dir) / (filename + ".tmp") + + if not (os.path.isfile(file_dir)): + try: + urllib.request.urlretrieve(image_url, temp_dir, reporthook=BaseDownloader.dlProgress) + + file_hash = BaseDownloader.createHash(temp_dir) + if GLOBAL.arguments.no_dupes: + if file_hash in GLOBAL.downloadedPosts(): + os.remove(temp_dir) + raise FileAlreadyExistsError + GLOBAL.downloadedPosts.add(file_hash) + + os.rename(temp_dir, file_dir) + if not silent: + print(" " * indent + "Downloaded" + " " * 10) + return None + except ConnectionResetError: + raise FailedToDownload + except FileNotFoundError: + filename = short_filename + else: + raise FileAlreadyExistsError + raise FailedToDownload + + @staticmethod + def getExtension(link: str): + """Extract file extension from image link. If didn't find any, return '.jpg' """ + image_types = ['jpg', 'png', 'mp4', 'webm', 'gif'] + parsed = link.split('.') + for fileType in image_types: + if fileType in parsed: + return "." + parsed[-1] + else: + if "v.redd.it" not in link: + return '.jpg' + else: + return '.mp4' diff --git a/bulkredditdownloader/downloaders/direct.py b/bulkredditdownloader/downloaders/direct.py index 9dd2c67..23a5c2f 100644 --- a/bulkredditdownloader/downloaders/direct.py +++ b/bulkredditdownloader/downloaders/direct.py @@ -1,17 +1,18 @@ import os import pathlib -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.utils import GLOBAL -class Direct: +class Direct(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): - post['EXTENSION'] = getExtension(post['CONTENTURL']) + super().__init__(directory, post) + post['EXTENSION'] = self.getExtension(post['CONTENTURL']) if not os.path.exists(directory): os.makedirs(directory) filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, short_filename, directory, post['CONTENTURL']) + self.getFile(filename, short_filename, directory, post['CONTENTURL']) diff --git a/bulkredditdownloader/downloaders/downloader_utils.py b/bulkredditdownloader/downloaders/downloader_utils.py deleted file mode 100644 index e5d1043..0000000 --- a/bulkredditdownloader/downloaders/downloader_utils.py +++ /dev/null @@ -1,109 +0,0 @@ -import hashlib -import os -import sys -import urllib.request -from pathlib import Path - -from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip -from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print - - -def dlProgress(count: int, block_size: int, total_size: int): - """Function for writing download progress to console - """ - download_mbs = int(count * block_size * (10 ** (-6))) - file_size = int(total_size * (10 ** (-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) - sys.stdout.flush() - - -def getExtension(link: str): - """Extract file extension from image link. - If didn't find any, return '.jpg' - """ - image_types = ['jpg', 'png', 'mp4', 'webm', 'gif'] - parsed = link.split('.') - for fileType in image_types: - if fileType in parsed: - return "." 
+ parsed[-1] - else: - if "v.redd.it" not in link: - return '.jpg' - else: - return '.mp4' - - -def getFile(filename: str, short_filename: str, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False): - formats = { - "videos": [".mp4", ".webm"], - "images": [".jpg", ".jpeg", ".png", ".bmp"], - "gifs": [".gif"], - "self": [] - } - - for file_type in GLOBAL.arguments.skip: - for extension in formats[file_type]: - if extension in filename: - raise TypeInSkip - - if any(domain in image_url for domain in GLOBAL.arguments.skip_domain): - raise DomainInSkip - - headers = [ - ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " - "Safari/537.36 OPR/54.0.2952.64"), - ("Accept", "text/html,application/xhtml+xml,application/xml;" - "q=0.9,image/webp,image/apng,*/*;q=0.8"), - ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), - ("Accept-Encoding", "none"), - ("Accept-Language", "en-US,en;q=0.8"), - ("Connection", "keep-alive") - ] - - if not os.path.exists(folder_dir): - os.makedirs(folder_dir) - - opener = urllib.request.build_opener() - if "imgur" not in image_url: - opener.addheaders = headers - urllib.request.install_opener(opener) - - if not silent: - print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") - - for i in range(3): - file_dir = Path(folder_dir) / filename - temp_dir = Path(folder_dir) / (filename + ".tmp") - - if not (os.path.isfile(file_dir)): - try: - urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress) - - file_hash = createHash(temp_dir) - if GLOBAL.arguments.no_dupes: - if file_hash in GLOBAL.downloadedPosts(): - os.remove(temp_dir) - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - os.rename(temp_dir, file_dir) - if not silent: - print(" " * indent + "Downloaded" + " " * 10) - return None - except ConnectionResetError: - raise FailedToDownload - except FileNotFoundError: - filename = short_filename - else: - raise FileAlreadyExistsError - raise FailedToDownload - - -def createHash(filename: str) -> str: - hash_md5 = hashlib.md5() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() diff --git a/bulkredditdownloader/downloaders/erome.py b/bulkredditdownloader/downloaders/erome.py index 2710453..f54e6f5 100644 --- a/bulkredditdownloader/downloaders/erome.py +++ b/bulkredditdownloader/downloaders/erome.py @@ -4,14 +4,15 @@ import urllib.error import urllib.request from html.parser import HTMLParser -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print -class Erome: +class Erome(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) try: images = self.getLinks(post['CONTENTURL']) except urllib.error.HTTPError: @@ -22,7 +23,7 @@ class Erome: duplicates = 0 if images_length == 1: - extension = getExtension(images[0]) + extension = self.getExtension(images[0]) """Filenames are declared here""" filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] @@ -32,7 +33,7 @@ class Erome: if 'https://' not in image_url or 'http://' not in image_url: image_url = 
"https://" + image_url - getFile(filename, short_filename, directory, image_url) + self.getFile(filename, short_filename, directory, image_url) else: filename = GLOBAL.config['filename'].format(**post) @@ -48,7 +49,7 @@ class Erome: os.makedirs(folder_dir) for i in range(images_length): - extension = getExtension(images[i]) + extension = self.getExtension(images[i]) filename = str(i + 1) + extension image_url = images[i] @@ -59,7 +60,7 @@ class Erome: print(" {}".format(filename)) try: - getFile(filename, filename, folder_dir, image_url, indent=2) + self.getFile(filename, filename, folder_dir, image_url, indent=2) print() except FileAlreadyExistsError: print(" The file already exists" + " " * 10, end="\n\n") diff --git a/bulkredditdownloader/downloaders/gallery.py b/bulkredditdownloader/downloaders/gallery.py index e3ec461..597d653 100644 --- a/bulkredditdownloader/downloaders/gallery.py +++ b/bulkredditdownloader/downloaders/gallery.py @@ -1,25 +1,23 @@ import json import os +import pathlib import urllib import requests -import pathlib -from bulkredditdownloader.downloaders.downloader_utils import getFile -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, - TypeInSkip) +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, + NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print -class Gallery: +class Gallery(BaseDownloader): def __init__(self, directory: pathlib.Path, post): + super().__init__(directory, post) link = post['CONTENTURL'] self.raw_data = self.getData(link) - self.directory = directory - self.post = post - images = {} count = 0 for model in self.raw_data['posts']['models']: @@ -86,7 +84,7 @@ class Gallery: print("\n ({}/{})".format(i + 1, count)) try: - getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) + self.getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) how_many_downloaded += 1 print() diff --git a/bulkredditdownloader/downloaders/gfycat.py b/bulkredditdownloader/downloaders/gfycat.py index 7ab93b4..1bedeb4 100644 --- a/bulkredditdownloader/downloaders/gfycat.py +++ b/bulkredditdownloader/downloaders/gfycat.py @@ -4,22 +4,22 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL import pathlib - -class Gfycat: +class Gfycat(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - post['EXTENSION'] = getExtension(post['MEDIAURL']) + post['EXTENSION'] = self.getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) @@ -27,7 +27,7 @@ class Gfycat: filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, short_filename, directory, post['MEDIAURL']) + 
self.getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url: str) -> str: diff --git a/bulkredditdownloader/downloaders/gif_delivery_network.py b/bulkredditdownloader/downloaders/gif_delivery_network.py index 486bc9a..2c66e1b 100644 --- a/bulkredditdownloader/downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/downloaders/gif_delivery_network.py @@ -4,19 +4,20 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL -class GifDeliveryNetwork: +class GifDeliveryNetwork(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - post['EXTENSION'] = getExtension(post['MEDIAURL']) + post['EXTENSION'] = self.getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) @@ -24,7 +25,7 @@ class GifDeliveryNetwork: filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, short_filename, directory, post['MEDIAURL']) + self.getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url: str) -> str: diff --git a/bulkredditdownloader/downloaders/imgur.py b/bulkredditdownloader/downloaders/imgur.py index 9b444d0..3b816ff 100644 --- a/bulkredditdownloader/downloaders/imgur.py +++ b/bulkredditdownloader/downloaders/imgur.py @@ -4,19 +4,20 @@ import pathlib import requests +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.direct import Direct -from bulkredditdownloader.downloaders.downloader_utils import getFile -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, - NotADownloadableLinkError, TypeInSkip) +from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, + ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL, nameCorrector from bulkredditdownloader.utils import printToFile as print -class Imgur: +class Imgur(BaseDownloader): imgur_image_domain = "https://i.imgur.com/" def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) link = post['CONTENTURL'] if link.endswith(".gifv"): @@ -26,9 +27,6 @@ class Imgur: self.raw_data = self.getData(link) - self.directory = directory - self.post = post - if self.isAlbum: if self.raw_data["album_images"]["count"] != 1: self.downloadAlbum(self.raw_data["album_images"]) @@ -65,7 +63,7 @@ class Imgur: print("\n ({}/{})".format(i + 1, images_length)) try: - getFile(filename, short_filename, folder_dir, image_url, indent=2) + self.getFile(filename, short_filename, folder_dir, image_url, indent=2) how_many_downloaded += 1 print() @@ -101,7 +99,7 @@ class Imgur: filename = GLOBAL.config['filename'].format(**self.post) + extension short_filename = self.post['POSTID'] + extension - getFile(filename, short_filename, self.directory, image_url) + self.getFile(filename, short_filename, self.directory, image_url) @property def isAlbum(self) -> bool: diff --git 
a/bulkredditdownloader/downloaders/redgifs.py b/bulkredditdownloader/downloaders/redgifs.py index 6d504c5..ff63780 100644 --- a/bulkredditdownloader/downloaders/redgifs.py +++ b/bulkredditdownloader/downloaders/redgifs.py @@ -5,19 +5,20 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.downloader_utils import getExtension, getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL -class Redgifs: +class Redgifs(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) try: post['MEDIAURL'] = self.getLink(post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - post['EXTENSION'] = getExtension(post['MEDIAURL']) + post['EXTENSION'] = self.getExtension(post['MEDIAURL']) if not os.path.exists(directory): os.makedirs(directory) @@ -25,7 +26,7 @@ class Redgifs: filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] short_filename = post['POSTID'] + post['EXTENSION'] - getFile(filename, short_filename, directory, post['MEDIAURL']) + self.getFile(filename, short_filename, directory, post['MEDIAURL']) @staticmethod def getLink(url: str) -> str: diff --git a/bulkredditdownloader/downloaders/self_post.py b/bulkredditdownloader/downloaders/self_post.py index fa03e7b..05a7249 100644 --- a/bulkredditdownloader/downloaders/self_post.py +++ b/bulkredditdownloader/downloaders/self_post.py @@ -4,6 +4,7 @@ import os import pathlib from pathlib import Path +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print @@ -11,8 +12,9 @@ from bulkredditdownloader.utils import printToFile as print VanillaPrint = print -class SelfPost: +class SelfPost(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) if "self" in GLOBAL.arguments.skip: raise TypeInSkip diff --git a/bulkredditdownloader/downloaders/vreddit.py b/bulkredditdownloader/downloaders/vreddit.py index 74b776c..b530d0c 100644 --- a/bulkredditdownloader/downloaders/vreddit.py +++ b/bulkredditdownloader/downloaders/vreddit.py @@ -2,13 +2,14 @@ import os import pathlib import subprocess -from bulkredditdownloader.downloaders.downloader_utils import getFile +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print -class VReddit: +class VReddit(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) extension = ".mp4" if not os.path.exists(directory): os.makedirs(directory) @@ -20,7 +21,7 @@ class VReddit: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) except Exception: - getFile(filename, short_filename, directory, post['CONTENTURL']) + self.getFile(filename, short_filename, directory, post['CONTENTURL']) print("FFMPEG library not found, skipping merging video and audio") else: video_name = post['POSTID'] + "_video" @@ -30,8 +31,8 @@ class VReddit: print(directory, filename, sep="\n") - getFile(video_name, video_name, directory, video_url, silent=True) - getFile(audio_name, audio_name, 
directory, audio_url, silent=True) + self.getFile(video_name, video_name, directory, video_url, silent=True) + self.getFile(audio_name, audio_name, directory, audio_url, silent=True) try: self._mergeAudio(video_name, audio_name, filename, short_filename, directory) except KeyboardInterrupt: diff --git a/bulkredditdownloader/downloaders/youtube.py b/bulkredditdownloader/downloaders/youtube.py index c5abc81..bcc0c2f 100644 --- a/bulkredditdownloader/downloaders/youtube.py +++ b/bulkredditdownloader/downloaders/youtube.py @@ -4,15 +4,15 @@ import sys import youtube_dl -from bulkredditdownloader.downloaders.downloader_utils import createHash +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError from bulkredditdownloader.utils import GLOBAL from bulkredditdownloader.utils import printToFile as print - -class Youtube: +class Youtube(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): + super().__init__(directory, post) if not os.path.exists(directory): os.makedirs(directory) @@ -37,7 +37,7 @@ class Youtube: if GLOBAL.arguments.no_dupes: try: - file_hash = createHash(str(location)) + file_hash = self.createHash(str(location)) except FileNotFoundError: return None if file_hash in GLOBAL.downloadedPosts(): From 69e21e46a2e4f8cf5b404e3efddb2df9ac14f373 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 12:18:46 +1000 Subject: [PATCH 007/276] Embed function --- .../downloaders/base_downloader.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/downloaders/base_downloader.py index 297e31c..5580d70 100644 --- a/bulkredditdownloader/downloaders/base_downloader.py +++ b/bulkredditdownloader/downloaders/base_downloader.py @@ -25,14 +25,6 @@ class BaseDownloader(ABC): hash_md5.update(chunk) return hash_md5.hexdigest() - @staticmethod - def dlProgress(count: int, block_size: int, total_size: int): - """Function for writing download progress to console """ - download_mbs = int(count * block_size * (10 ** (-6))) - file_size = int(total_size * (10 ** (-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) - sys.stdout.flush() - @staticmethod def getFile( filename: str, @@ -79,13 +71,20 @@ class BaseDownloader(ABC): if not silent: print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") + def dlProgress(count: int, block_size: int, total_size: int): + """Function for writing download progress to console """ + download_mbs = int(count * block_size * (10 ** (-6))) + file_size = int(total_size * (10 ** (-6))) + sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) + sys.stdout.flush() + for i in range(3): file_dir = Path(folder_dir) / filename temp_dir = Path(folder_dir) / (filename + ".tmp") if not (os.path.isfile(file_dir)): try: - urllib.request.urlretrieve(image_url, temp_dir, reporthook=BaseDownloader.dlProgress) + urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress) file_hash = BaseDownloader.createHash(temp_dir) if GLOBAL.arguments.no_dupes: From f573038a21129448aba5df969d4d7d71de79a70f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 14:46:20 +1000 Subject: [PATCH 008/276] Move to inheritance system for downloaders --- .../downloaders/base_downloader.py | 110 ++++++++---------- bulkredditdownloader/downloaders/direct.py | 15 +-- bulkredditdownloader/downloaders/erome.py | 69 ++++++----- 
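# PATCH 007 above folds the dlProgress static method into getFile as a local
# closure, which is exactly the shape urllib expects for its reporthook
# callback. A self-contained sketch of that contract (fetch() and its
# arguments are illustrative, not the project's API):

import sys
import urllib.request


def fetch(url: str, destination: str):
    def dl_progress(count: int, block_size: int, total_size: int):
        # urlretrieve invokes the hook after each block with these three ints
        downloaded_mb = int(count * block_size * 1e-6)
        total_mb = int(total_size * 1e-6)
        sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mb, total_mb))
        sys.stdout.flush()

    urllib.request.urlretrieve(url, destination, reporthook=dl_progress)
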
bulkredditdownloader/downloaders/gallery.py | 59 +++++----- bulkredditdownloader/downloaders/gfycat.py | 35 ++---- .../downloaders/gif_delivery_network.py | 21 ++-- bulkredditdownloader/downloaders/imgur.py | 85 +++++++------- bulkredditdownloader/downloaders/redgifs.py | 26 ++--- bulkredditdownloader/downloaders/self_post.py | 37 +++--- bulkredditdownloader/downloaders/vreddit.py | 49 ++++---- bulkredditdownloader/downloaders/youtube.py | 27 +++-- 11 files changed, 253 insertions(+), 280 deletions(-) diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/downloaders/base_downloader.py index 5580d70..eb30431 100644 --- a/bulkredditdownloader/downloaders/base_downloader.py +++ b/bulkredditdownloader/downloaders/base_downloader.py @@ -1,15 +1,18 @@ #!/usr/bin/env python3 # coding=utf-8 + import hashlib -import os -import sys -import urllib.request -from abc import ABC +import logging +import re +from abc import ABC, abstractmethod from pathlib import Path +import requests + from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class BaseDownloader(ABC): @@ -17,22 +20,17 @@ class BaseDownloader(ABC): self.directory = directory self.post = post + @abstractmethod + def download(self): + raise NotImplementedError + @staticmethod - def createHash(filename: str) -> str: - hash_md5 = hashlib.md5() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) + def _create_hash(content: bytes) -> str: + hash_md5 = hashlib.md5(content) return hash_md5.hexdigest() @staticmethod - def getFile( - filename: str, - short_filename: str, - folder_dir: Path, - image_url: str, - indent: int = 0, - silent: bool = False): + def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False): formats = { "videos": [".mp4", ".webm"], "images": [".jpg", ".jpeg", ".png", ".bmp"], @@ -52,69 +50,55 @@ class BaseDownloader(ABC): ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " "Safari/537.36 OPR/54.0.2952.64"), - ("Accept", "text/html,application/xhtml+xml,application/xml;" - "q=0.9,image/webp,image/apng,*/*;q=0.8"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"), ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), ("Accept-Encoding", "none"), ("Accept-Language", "en-US,en;q=0.8"), ("Connection", "keep-alive") ] - if not os.path.exists(folder_dir): - os.makedirs(folder_dir) + folder_dir.mkdir(exist_ok=True) - opener = urllib.request.build_opener() if "imgur" not in image_url: - opener.addheaders = headers - urllib.request.install_opener(opener) + addheaders = headers + else: + addheaders = None if not silent: - print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") - - def dlProgress(count: int, block_size: int, total_size: int): - """Function for writing download progress to console """ - download_mbs = int(count * block_size * (10 ** (-6))) - file_size = int(total_size * (10 ** (-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size)) - sys.stdout.flush() + logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") + # Loop to attempt download 3 times for i in range(3): - file_dir = Path(folder_dir) / filename 
- temp_dir = Path(folder_dir) / (filename + ".tmp") + file_path = Path(folder_dir) / filename - if not (os.path.isfile(file_dir)): + if file_path.is_file(): + raise FileAlreadyExistsError + else: try: - urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress) - - file_hash = BaseDownloader.createHash(temp_dir) - if GLOBAL.arguments.no_dupes: - if file_hash in GLOBAL.downloadedPosts(): - os.remove(temp_dir) - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - os.rename(temp_dir, file_dir) - if not silent: - print(" " * indent + "Downloaded" + " " * 10) - return None + download_content = requests.get(image_url, headers=addheaders).content except ConnectionResetError: raise FailedToDownload - except FileNotFoundError: - filename = short_filename - else: - raise FileAlreadyExistsError + + file_hash = BaseDownloader._create_hash(download_content) + if GLOBAL.arguments.no_dupes: + if file_hash in GLOBAL.downloadedPosts(): + raise FileAlreadyExistsError + GLOBAL.downloadedPosts.add(file_hash) + + with open(file_path, 'wb') as file: + file.write(download_content) + if not silent: + logger.info(" " * indent + "Downloaded" + " " * 10) + return + raise FailedToDownload @staticmethod - def getExtension(link: str): - """Extract file extension from image link. If didn't find any, return '.jpg' """ - image_types = ['jpg', 'png', 'mp4', 'webm', 'gif'] - parsed = link.split('.') - for fileType in image_types: - if fileType in parsed: - return "." + parsed[-1] + def _get_extension(url: str) -> str: + pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') + if len(results := re.search(pattern, url).groups()) > 1: + return results[1] + if "v.redd.it" not in url: + return '.jpg' else: - if "v.redd.it" not in link: - return '.jpg' - else: - return '.mp4' + return '.mp4' diff --git a/bulkredditdownloader/downloaders/direct.py b/bulkredditdownloader/downloaders/direct.py index 23a5c2f..5fe97cd 100644 --- a/bulkredditdownloader/downloaders/direct.py +++ b/bulkredditdownloader/downloaders/direct.py @@ -1,4 +1,5 @@ -import os +#!/usr/bin/env python3 + import pathlib from bulkredditdownloader.downloaders.base_downloader import BaseDownloader @@ -8,11 +9,11 @@ from bulkredditdownloader.utils import GLOBAL class Direct(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - post['EXTENSION'] = self.getExtension(post['CONTENTURL']) - if not os.path.exists(directory): - os.makedirs(directory) + self.download() - filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] - short_filename = post['POSTID'] + post['EXTENSION'] + def download(self): + self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL']) + self.directory.mkdir(exist_ok=True) - self.getFile(filename, short_filename, directory, post['CONTENTURL']) + filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] + self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL']) diff --git a/bulkredditdownloader/downloaders/erome.py b/bulkredditdownloader/downloaders/erome.py index f54e6f5..2df5937 100644 --- a/bulkredditdownloader/downloaders/erome.py +++ b/bulkredditdownloader/downloaders/erome.py @@ -1,5 +1,8 @@ -import os +#!/usr/bin/env python3 + +import logging import pathlib +import re import urllib.error import urllib.request from html.parser import HTMLParser @@ -7,70 +10,64 @@ from html.parser import HTMLParser from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from 
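# The rewritten _download_resource above swaps urllib temp files for an
# in-memory flow: requests.get the bytes, hash them, check for duplicates,
# then write once. A standalone sketch of that flow, assuming simplified
# stand-ins (FileExistsError and a module-level set) for the project's error
# classes and GLOBAL.downloadedPosts state:

import hashlib
from pathlib import Path

import requests

seen_hashes: set[str] = set()


def download_resource(url: str, destination: Path, attempts: int = 3) -> None:
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.is_file():
        raise FileExistsError(destination)
    for _ in range(attempts):
        try:
            content = requests.get(url, timeout=30).content
        except requests.ConnectionError:
            continue  # retry, mirroring the patch's three-attempt loop
        digest = hashlib.md5(content).hexdigest()
        if digest in seen_hashes:
            raise FileExistsError("duplicate content: " + digest)
        seen_hashes.add(digest)
        destination.write_bytes(content)
        return
    raise RuntimeError("failed to download " + url)
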
bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class Erome(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) + self.download() + + def download(self): try: - images = self.getLinks(post['CONTENTURL']) + images = self._get_links(self.post['CONTENTURL']) except urllib.error.HTTPError: raise NotADownloadableLinkError("Not a downloadable link") images_length = len(images) - how_many_downloaded = images_length + how_many_downloaded = len(images) duplicates = 0 if images_length == 1: - extension = self.getExtension(images[0]) - """Filenames are declared here""" - filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] - short_filename = post['POSTID'] + extension + filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - image_url = images[0] - if 'https://' not in image_url or 'http://' not in image_url: - image_url = "https://" + image_url + image = images[0] + if not re.match(r'https?://.*', image): + image = "https://" + image - self.getFile(filename, short_filename, directory, image_url) + self._download_resource(filename, self.directory, image) else: - filename = GLOBAL.config['filename'].format(**post) - print(filename) + filename = GLOBAL.config['filename'].format(**self.post) + logger.info(filename) - folder_dir = directory / filename + folder_dir = self.directory / filename - try: - if not os.path.exists(folder_dir): - os.makedirs(folder_dir) - except FileNotFoundError: - folder_dir = directory / post['POSTID'] - os.makedirs(folder_dir) - - for i in range(images_length): - extension = self.getExtension(images[i]) + folder_dir.mkdir(exist_ok=True) + for i, image in enumerate(images): + extension = self._get_extension(image) filename = str(i + 1) + extension - image_url = images[i] - if 'https://' not in image_url and 'http://' not in image_url: - image_url = "https://" + image_url - print(" ({}/{})".format(i + 1, images_length)) - print(" {}".format(filename)) + if not re.match(r'https?://.*', image): + image = "https://" + image + + logger.info(" ({}/{})".format(i + 1, images_length)) + logger.info(" {}".format(filename)) try: - self.getFile(filename, filename, folder_dir, image_url, indent=2) - print() + self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2) except FileAlreadyExistsError: - print(" The file already exists" + " " * 10, end="\n\n") + logger.info(" The file already exists" + " " * 10, end="\n\n") duplicates += 1 how_many_downloaded -= 1 except Exception as exception: # raise exception - print("\n Could not get the file") - print( + logger.error("\n Could not get the file") + logger.error( " " + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)) + "\n" @@ -82,10 +79,12 @@ class Erome(BaseDownloader): elif how_many_downloaded + duplicates < images_length: raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") - def getLinks(self, url: str) -> list[str]: + @staticmethod + def _get_links(url: str) -> list[str]: content = [] line_number = None + # TODO: move to bs4 and requests class EromeParser(HTMLParser): tag = None diff --git a/bulkredditdownloader/downloaders/gallery.py b/bulkredditdownloader/downloaders/gallery.py index 597d653..d877e4e 100644 --- 
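# The Erome hunks above replace the substring tests on 'https://' and
# 'http://' (the single-image branch joined them with `or`, which is truthy
# for nearly every URL, so a scheme could be prepended even when one was
# already present) with a single anchored regex. An illustration:

import re


def ensure_scheme(url: str) -> str:
    # re.match anchors at the start, so only a genuine scheme prefix passes
    if not re.match(r'https?://', url):
        url = "https://" + url
    return url


assert ensure_scheme("example.com/a.jpg") == "https://example.com/a.jpg"
assert ensure_scheme("http://example.com/a.jpg") == "http://example.com/a.jpg"
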
a/bulkredditdownloader/downloaders/gallery.py +++ b/bulkredditdownloader/downloaders/gallery.py @@ -1,7 +1,9 @@ +#!/usr/bin/env python3 + import json -import os import pathlib -import urllib +import logging +import urllib.parse import requests @@ -9,15 +11,18 @@ from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class Gallery(BaseDownloader): def __init__(self, directory: pathlib.Path, post): super().__init__(directory, post) - link = post['CONTENTURL'] - self.raw_data = self.getData(link) + link = self.post['CONTENTURL'] + self.raw_data = self._get_data(link) + self.download() + def download(self): images = {} count = 0 for model in self.raw_data['posts']['models']: @@ -27,15 +32,15 @@ class Gallery(BaseDownloader): images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts'] ['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']} count += 1 - except Exception: + except KeyError: continue - except Exception: + except KeyError: continue - self.downloadAlbum(images, count) + self._download_album(images, count) @staticmethod - def getData(link: str) -> dict: + def _get_data(link: str) -> dict: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", @@ -58,50 +63,42 @@ class Gallery(BaseDownloader): data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) return data - def downloadAlbum(self, images: dict, count: int): + def _download_album(self, images: dict, count: int): folder_name = GLOBAL.config['filename'].format(**self.post) folder_dir = self.directory / folder_name how_many_downloaded = 0 duplicates = 0 - try: - if not os.path.exists(folder_dir): - os.makedirs(folder_dir) - except FileNotFoundError: - folder_dir = self.directory / self.post['POSTID'] - os.makedirs(folder_dir) + folder_dir.mkdir(exist_ok=True) + logger.info(folder_name) - print(folder_name) + for i, image in enumerate(images): + path = urllib.parse.urlparse(image['url']).path + extension = pathlib.Path(path).suffix - for i in range(count): - path = urllib.parse.urlparse(images[i]['url']).path - extension = os.path.splitext(path)[1] + filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension) - filename = "_".join([str(i + 1), images[i]['id']]) + extension - short_filename = str(i + 1) + "_" + images[i]['id'] - - print("\n ({}/{})".format(i + 1, count)) + logger.info("\n ({}/{})".format(i + 1, count)) try: - self.getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2) + self._download_resource(filename, folder_dir, image['url'], indent=2) how_many_downloaded += 1 - print() except FileAlreadyExistsError: - print(" The file already exists" + " " * 10, end="\n\n") + logger.info(" The file already exists" + " " * 10, end="\n\n") duplicates += 1 except TypeInSkip: - print(" Skipping...") + logger.info(" Skipping...") how_many_downloaded += 1 except Exception as exception: - print("\n Could not get the file") - print(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( + logger.info("\n Could not get the file") 
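# Gallery._get_data above digs a JSON blob out of the reddit page source by
# index arithmetic before handing it to json.loads. The core idea, sketched
# standalone with illustrative marker strings (not the exact ones the patch
# searches for):

import json


def extract_embedded_json(page_source: str, start_marker: str, end_marker: str) -> dict:
    start = page_source.index(start_marker) + len(start_marker)
    end = page_source.index(end_marker, start)
    return json.loads(page_source[start:end])


page = '<script>window.___r = {"posts": {"models": {}}}; </script>'
assert extract_embedded_json(page, 'window.___r = ', '; </script>') == {"posts": {"models": {}}}
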
+ logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( class_name=exception.__class__.__name__, info=str(exception)) + "\n" ) - print(GLOBAL.log_stream.getvalue(), no_print=True) + logger.info(GLOBAL.log_stream.getvalue(), no_print=True) if duplicates == count: raise FileAlreadyExistsError diff --git a/bulkredditdownloader/downloaders/gfycat.py b/bulkredditdownloader/downloaders/gfycat.py index 1bedeb4..9d2b3bb 100644 --- a/bulkredditdownloader/downloaders/gfycat.py +++ b/bulkredditdownloader/downloaders/gfycat.py @@ -1,43 +1,32 @@ +#!/usr/bin/env python3 + import json -import os +import pathlib +import re import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork -from bulkredditdownloader.errors import NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL -import pathlib -class Gfycat(BaseDownloader): +class Gfycat(GifDeliveryNetwork): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - try: - post['MEDIAURL'] = self.getLink(post['CONTENTURL']) - except IndexError: - raise NotADownloadableLinkError("Could not read the page source") + self.download() - post['EXTENSION'] = self.getExtension(post['MEDIAURL']) - - if not os.path.exists(directory): - os.makedirs(directory) - - filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] - short_filename = post['POSTID'] + post['EXTENSION'] - - self.getFile(filename, short_filename, directory, post['MEDIAURL']) + def download(self): + super().download() @staticmethod - def getLink(url: str) -> str: + def _get_link(url: str) -> str: """Extract direct link to the video from page's source and return it """ - if '.webm' in url or '.mp4' in url or '.gif' in url: + if re.match(r'\.(webm|mp4|gif)$', url): return url - if url[-1:] == '/': + if url.endswith('/'): url = url[:-1] url = "https://gfycat.com/" + url.split('/')[-1] @@ -49,6 +38,6 @@ class Gfycat(BaseDownloader): content = soup.find("script", attrs=attributes) if content is None: - return GifDeliveryNetwork.getLink(url) + return super()._get_link(url) return json.loads(content.contents[0])["video"]["contentUrl"] diff --git a/bulkredditdownloader/downloaders/gif_delivery_network.py b/bulkredditdownloader/downloaders/gif_delivery_network.py index 2c66e1b..52caf4c 100644 --- a/bulkredditdownloader/downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/downloaders/gif_delivery_network.py @@ -1,4 +1,5 @@ -import os +#!/usr/bin/env python3 + import pathlib import urllib.request @@ -12,23 +13,23 @@ from bulkredditdownloader.utils import GLOBAL class GifDeliveryNetwork(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) + self.download() + + def download(self): try: - post['MEDIAURL'] = self.getLink(post['CONTENTURL']) + self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL']) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - post['EXTENSION'] = self.getExtension(post['MEDIAURL']) + self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL']) + self.directory.mkdir(exist_ok=True) - if not os.path.exists(directory): - os.makedirs(directory) + filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] - short_filename = post['POSTID'] 
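# One subtlety in the Gfycat hunk above: its early return tests
# re.match(r'\.(webm|mp4|gif)$', url). re.match anchors at position 0, so a
# pattern starting with a literal dot can only match URLs that begin with
# ".", which full links never do; re.search is the call that scans the whole
# string. A quick demonstration of the difference:

import re

pattern = r'\.(webm|mp4|gif)$'
url = "https://thumbs.gfycat.com/Example.mp4"

assert re.match(pattern, url) is None       # anchored at the start: no match
assert re.search(pattern, url) is not None  # scans the string: finds ".mp4"
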
+ post['EXTENSION'] - - self.getFile(filename, short_filename, directory, post['MEDIAURL']) + self._download_resource(filename, self.directory, self.post['MEDIAURL']) @staticmethod - def getLink(url: str) -> str: + def _get_link(url: str) -> str: """Extract direct link to the video from page's source and return it """ diff --git a/bulkredditdownloader/downloaders/imgur.py b/bulkredditdownloader/downloaders/imgur.py index 3b816ff..6f05b26 100644 --- a/bulkredditdownloader/downloaders/imgur.py +++ b/bulkredditdownloader/downloaders/imgur.py @@ -1,6 +1,8 @@ +#!/usr/bin/env python3 + import json -import os import pathlib +import logging import requests @@ -9,7 +11,8 @@ from bulkredditdownloader.downloaders.direct import Direct from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL, nameCorrector -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class Imgur(BaseDownloader): @@ -18,24 +21,28 @@ class Imgur(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - link = post['CONTENTURL'] + self.raw_data = {} + self.download() + + def download(self): + link = self.post['CONTENTURL'] if link.endswith(".gifv"): link = link.replace(".gifv", ".mp4") - Direct(directory, {**post, 'CONTENTURL': link}) + Direct(self.directory, {**self.post, 'CONTENTURL': link}) return - self.raw_data = self.getData(link) + self.raw_data = self._get_data(link) - if self.isAlbum: + if self._is_album: if self.raw_data["album_images"]["count"] != 1: - self.downloadAlbum(self.raw_data["album_images"]) + self._download_album(self.raw_data["album_images"]) else: - self.download(self.raw_data["album_images"]["images"][0]) + self._download_image(self.raw_data["album_images"]["images"][0]) else: - self.download(self.raw_data) + self._download_image(self.raw_data) - def downloadAlbum(self, images: dict): + def _download_album(self, images: dict): folder_name = GLOBAL.config['filename'].format(**self.post) folder_dir = self.directory / folder_name @@ -43,70 +50,60 @@ class Imgur(BaseDownloader): how_many_downloaded = 0 duplicates = 0 - try: - if not os.path.exists(folder_dir): - os.makedirs(folder_dir) - except FileNotFoundError: - folder_dir = self.directory / self.post['POSTID'] - os.makedirs(folder_dir) - - print(folder_name) + folder_dir.mkdir(exist_ok=True) + logger.info(folder_name) for i in range(images_length): - extension = self.validateExtension(images["images"][i]["ext"]) + extension = self._validate_extension(images["images"][i]["ext"]) image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - filename = "_".join([str(i + 1), - nameCorrector(images["images"][i]['title']), - images["images"][i]['hash']]) + extension - short_filename = str(i + 1) + "_" + images["images"][i]['hash'] + filename = pathlib.Path("_".join([str(i + 1), + nameCorrector(images["images"][i]['title']), + images["images"][i]['hash']]) + extension) - print("\n ({}/{})".format(i + 1, images_length)) + logger.info("\n ({}/{})".format(i + 1, images_length)) try: - self.getFile(filename, short_filename, folder_dir, image_url, indent=2) + self._download_resource(filename, folder_dir, image_url, indent=2) how_many_downloaded += 1 - print() except FileAlreadyExistsError: - print(" The file already exists" + " " * 10, end="\n\n") + logger.info(" The file already exists" + " " * 10, 
end="\n\n") duplicates += 1 except TypeInSkip: - print(" Skipping...") + logger.info(" Skipping...") how_many_downloaded += 1 except Exception as exception: - print("\n Could not get the file") - print( - " " + - "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( + logger.info("\n Could not get the file") + logger.info( + " " + + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( class_name=exception.__class__.__name__, info=str(exception) ) + "\n" ) - print(GLOBAL.log_stream.getvalue(), no_print=True) + logger.info(GLOBAL.log_stream.getvalue(), no_print=True) if duplicates == images_length: raise FileAlreadyExistsError elif how_many_downloaded + duplicates < images_length: raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") - def download(self, image: dict): - extension = self.validateExtension(image["ext"]) + def _download_image(self, image: dict): + extension = self._validate_extension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension filename = GLOBAL.config['filename'].format(**self.post) + extension - short_filename = self.post['POSTID'] + extension - self.getFile(filename, short_filename, self.directory, image_url) + self._download_resource(filename, self.directory, image_url) - @property - def isAlbum(self) -> bool: + def _is_album(self) -> bool: return "album_images" in self.raw_data @staticmethod - def getData(link: str) -> dict: + def _get_data(link: str) -> dict: cookies = {"over18": "1", "postpagebeta": "0"} res = requests.get(link, cookies=cookies) if res.status_code != 200: @@ -128,18 +125,18 @@ class Imgur(BaseDownloader): end_index -= 1 try: data = page_source[start_index:end_index + 2].strip()[:-1] - except Exception: + except IndexError: page_source[end_index + 1] = '}' data = page_source[start_index:end_index + 3].strip()[:-1] return json.loads(data) @staticmethod - def validateExtension(string: str) -> str: + def _validate_extension(extension_suffix: str) -> str: possible_extensions = [".jpg", ".png", ".mp4", ".gif"] for extension in possible_extensions: - if extension in string: + if extension in extension_suffix: return extension else: - raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.") + raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.") diff --git a/bulkredditdownloader/downloaders/redgifs.py b/bulkredditdownloader/downloaders/redgifs.py index ff63780..98224aa 100644 --- a/bulkredditdownloader/downloaders/redgifs.py +++ b/bulkredditdownloader/downloaders/redgifs.py @@ -1,35 +1,25 @@ +#!/usr/bin/env python3 + import json -import os import pathlib import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL -class Redgifs(BaseDownloader): +class Redgifs(GifDeliveryNetwork): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - try: - post['MEDIAURL'] = self.getLink(post['CONTENTURL']) - except IndexError: - raise NotADownloadableLinkError("Could not read the page source") + self.download() - post['EXTENSION'] = self.getExtension(post['MEDIAURL']) - - if not os.path.exists(directory): - os.makedirs(directory) - - filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"] - short_filename = 
post['POSTID'] + post['EXTENSION'] - - self.getFile(filename, short_filename, directory, post['MEDIAURL']) + def download(self): + super().download() @staticmethod - def getLink(url: str) -> str: + def _get_link(url: str) -> str: """Extract direct link to the video from page's source and return it """ diff --git a/bulkredditdownloader/downloaders/self_post.py b/bulkredditdownloader/downloaders/self_post.py index 05a7249..2325711 100644 --- a/bulkredditdownloader/downloaders/self_post.py +++ b/bulkredditdownloader/downloaders/self_post.py @@ -1,45 +1,46 @@ -from src.utils import printToFile as print +#!/usr/bin/env python3 + import io -import os +import logging import pathlib from pathlib import Path from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print -VanillaPrint = print +logger = logging.getLogger(__name__) class SelfPost(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) + self.download() + + def download(self): if "self" in GLOBAL.arguments.skip: raise TypeInSkip - if not os.path.exists(directory): - os.makedirs(directory) + self.directory.mkdir(exist_ok=True) + filename = GLOBAL.config['filename'].format(**self.post) - filename = GLOBAL.config['filename'].format(**post) - - file_dir = directory / (filename + ".md") - print(file_dir) - print(filename + ".md") + file_dir = self.directory / (filename + ".md") + logger.info(file_dir) + logger.info(filename + ".md") if Path.is_file(file_dir): raise FileAlreadyExistsError try: - self.writeToFile(file_dir, post) + self._write_to_file(file_dir, self.post) except FileNotFoundError: - file_dir = post['POSTID'] + ".md" - file_dir = directory / file_dir + file_dir = self.post['POSTID'] + ".md" + file_dir = self.directory / file_dir - self.writeToFile(file_dir, post) + self._write_to_file(file_dir, self.post) @staticmethod - def writeToFile(directory: pathlib.Path, post: dict): + def _write_to_file(directory: pathlib.Path, post: dict): """Self posts are formatted here""" content = ("## [" + post["TITLE"] @@ -59,5 +60,5 @@ class SelfPost(BaseDownloader): + ")") with io.open(directory, "w", encoding="utf-8") as FILE: - VanillaPrint(content, file=FILE) - print("Downloaded") + print(content, file=FILE) + logger.info("Downloaded") diff --git a/bulkredditdownloader/downloaders/vreddit.py b/bulkredditdownloader/downloaders/vreddit.py index b530d0c..3cce613 100644 --- a/bulkredditdownloader/downloaders/vreddit.py +++ b/bulkredditdownloader/downloaders/vreddit.py @@ -1,51 +1,56 @@ +#!/usr/bin/env python3 + +import logging import os import pathlib import subprocess from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class VReddit(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - extension = ".mp4" - if not os.path.exists(directory): - os.makedirs(directory) + self.download() - filename = GLOBAL.config['filename'].format(**post) + extension - short_filename = post['POSTID'] + extension + def download(self): + extension = ".mp4" + self.directory.mkdir(exist_ok=True) + + filename = GLOBAL.config['filename'].format(**self.post) + extension try: fnull = open(os.devnull, 'w') 
subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) except Exception: - self.getFile(filename, short_filename, directory, post['CONTENTURL']) - print("FFMPEG library not found, skipping merging video and audio") + self._download_resource(filename, self.directory, self.post['CONTENTURL']) + logger.info("FFMPEG library not found, skipping merging video and audio") else: - video_name = post['POSTID'] + "_video" - video_url = post['CONTENTURL'] - audio_name = post['POSTID'] + "_audio" + video_name = self.post['POSTID'] + "_video" + video_url = self.post['CONTENTURL'] + audio_name = self.post['POSTID'] + "_audio" audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' - print(directory, filename, sep="\n") + logger.info(self.directory, filename, sep="\n") - self.getFile(video_name, video_name, directory, video_url, silent=True) - self.getFile(audio_name, audio_name, directory, audio_url, silent=True) + self._download_resource(video_name, self.directory, video_url, silent=True) + self._download_resource(audio_name, self.directory, audio_url, silent=True) try: - self._mergeAudio(video_name, audio_name, filename, short_filename, directory) + self._merge_audio(video_name, audio_name, filename, self.directory) except KeyboardInterrupt: - os.remove(directory / filename) - os.remove(directory / audio_name) - os.rename(directory / video_name, directory / filename) + (self.directory / filename).unlink() + (self.directory / audio_name).unlink() + (self.directory / video_name).unlink() + (self.directory / filename).unlink() @staticmethod - def _mergeAudio( + def _merge_audio( video: pathlib.Path, audio: pathlib.Path, filename: pathlib.Path, - short_filename, directory: pathlib.Path): input_video = str(directory / video) input_audio = str(directory / audio) @@ -55,5 +60,5 @@ class VReddit(BaseDownloader): input_audio, input_video, str(directory / filename)) subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) - os.remove(directory / video) - os.remove(directory / audio) + (directory / video).unlink() + (directory / audio).unlink() diff --git a/bulkredditdownloader/downloaders/youtube.py b/bulkredditdownloader/downloaders/youtube.py index bcc0c2f..abde54a 100644 --- a/bulkredditdownloader/downloaders/youtube.py +++ b/bulkredditdownloader/downloaders/youtube.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 + +import logging import os import pathlib import sys @@ -7,21 +10,24 @@ import youtube_dl from bulkredditdownloader.downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError from bulkredditdownloader.utils import GLOBAL -from bulkredditdownloader.utils import printToFile as print + +logger = logging.getLogger(__name__) class Youtube(BaseDownloader): def __init__(self, directory: pathlib.Path, post: dict): super().__init__(directory, post) - if not os.path.exists(directory): - os.makedirs(directory) + self.download() - filename = GLOBAL.config['filename'].format(**post) - print(filename) + def download(self): + self.directory.mkdir(exist_ok=True) - self.download(filename, directory, post['CONTENTURL']) + filename = GLOBAL.config['filename'].format(**self.post) + logger.info(filename) - def download(self, filename: str, directory: pathlib.Path, url: str): + self._download_video(filename, self.directory, self.post['CONTENTURL']) + + def _download_video(self, filename: str, directory: pathlib.Path, url: str): ydl_opts = { "format": "best", "outtmpl": str(directory / (filename + ".%(ext)s")), @@ -35,9 +41,12 @@ class 
Youtube(BaseDownloader): location = directory / (filename + ".mp4") + with open(location, 'rb') as file: + content = file.read() + if GLOBAL.arguments.no_dupes: try: - file_hash = self.createHash(str(location)) + file_hash = self._create_hash(content) except FileNotFoundError: return None if file_hash in GLOBAL.downloadedPosts(): @@ -48,7 +57,7 @@ class Youtube(BaseDownloader): @staticmethod def _hook(d): if d['status'] == 'finished': - return print("Downloaded") + return logger.info("Downloaded") downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6))) file_size = int(d['total_bytes'] * (10**(-6))) sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size)) From 5ef58f147f40aa81327cde928d780eaa9374ab7b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 16:38:30 +1000 Subject: [PATCH 009/276] Add some tests for base_download --- .../downloaders/base_downloader.py | 5 ++- bulkredditdownloader/tests/__init__.py | 0 .../tests/downloaders/__init__.py | 0 .../tests/downloaders/test_base_downloader.py | 42 +++++++++++++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 bulkredditdownloader/tests/__init__.py create mode 100644 bulkredditdownloader/tests/downloaders/__init__.py create mode 100644 bulkredditdownloader/tests/downloaders/test_base_downloader.py diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/downloaders/base_downloader.py index eb30431..7873db7 100644 --- a/bulkredditdownloader/downloaders/base_downloader.py +++ b/bulkredditdownloader/downloaders/base_downloader.py @@ -96,8 +96,9 @@ class BaseDownloader(ABC): @staticmethod def _get_extension(url: str) -> str: pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') - if len(results := re.search(pattern, url).groups()) > 1: - return results[1] + if results := re.search(pattern, url): + if len(results.groups()) > 1: + return results[0] if "v.redd.it" not in url: return '.jpg' else: diff --git a/bulkredditdownloader/tests/__init__.py b/bulkredditdownloader/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bulkredditdownloader/tests/downloaders/__init__.py b/bulkredditdownloader/tests/downloaders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py new file mode 100644 index 0000000..a87d426 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_base_downloader.py @@ -0,0 +1,42 @@ +#!/uasr/bin/env python3 +# coding=utf-8 + +from pathlib import Path + +import pytest + +from bulkredditdownloader.downloaders.base_downloader import BaseDownloader + + +@pytest.mark.parametrize(('test_bytes', 'expected'), ((b'test', '098f6bcd4621d373cade4e832627b4f6'), + (b'test2', 'ad0234829205b9033196ba818f7a872b'))) +def test_create_hash(test_bytes: bytes, expected: str): + result = BaseDownloader._create_hash(test_bytes) + assert result == expected + + +@pytest.mark.parametrize(('test_url', 'expected'), (('test.png', '.png'), + ('random.jpg', '.jpg'), + ('http://random.com/test.png', '.png'), + ('https://example.net/picture.jpg', '.jpg'), + ('https://v.redd.it/picture', '.mp4'), + ('https://v.redd.it/picture.jpg', '.jpg'), + ('https:/random.url', '.jpg') + )) +def test_get_extension(test_url: str, expected: str): + result = BaseDownloader._get_extension(test_url) + assert result == expected + + +@pytest.mark.skip +@pytest.mark.parametrize(('test_url', 'expected_hash'), 
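# PATCH 009 above guards the walrus-assigned re.search result before calling
# .groups(), so URLs with no recognised extension fall through to the
# defaults instead of raising AttributeError. The resulting behaviour,
# sketched standalone against two of the patch's own test cases:

import re


def get_extension(url: str) -> str:
    if results := re.search(r'(\.(jpg|jpeg|png|mp4|webm|gif))', url):
        return results[0]  # group 0 is the whole matched extension
    # no extension found: v.redd.it links are video, everything else jpg
    return '.mp4' if "v.redd.it" in url else '.jpg'


assert get_extension("https://example.net/picture.jpg") == ".jpg"
assert get_extension("https://v.redd.it/picture") == ".mp4"
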
(('https://www.iana.org/_img/2013.1/iana-logo-header.svg', ''), + ('', '') + )) +def test_download_resource(test_url: str, expected_hash: str, tmp_path: Path): + test_file = tmp_path / 'test' + BaseDownloader._download_resource(test_file, tmp_path, test_url) + assert test_file.exists() + with open(test_file, 'rb') as file: + content = file.read() + hash_result = BaseDownloader._create_hash(content) + assert hash_result == expected_hash From f2415b6bd061fd3cbd3cd9f122341439952af356 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 17:08:24 +1000 Subject: [PATCH 010/276] Rename folder --- bulkredditdownloader/__main__.py | 20 +++++++++---------- .../__init__.py | 0 .../base_downloader.py | 0 .../direct.py | 2 +- .../erome.py | 2 +- .../gallery.py | 2 +- .../gfycat.py | 2 +- .../gif_delivery_network.py | 2 +- .../imgur.py | 4 ++-- .../redgifs.py | 2 +- .../self_post.py | 2 +- .../vreddit.py | 2 +- .../youtube.py | 2 +- .../tests/downloaders/test_base_downloader.py | 2 +- 14 files changed, 22 insertions(+), 22 deletions(-) rename bulkredditdownloader/{downloaders => site_downloaders}/__init__.py (100%) rename bulkredditdownloader/{downloaders => site_downloaders}/base_downloader.py (100%) rename bulkredditdownloader/{downloaders => site_downloaders}/direct.py (87%) rename bulkredditdownloader/{downloaders => site_downloaders}/erome.py (98%) rename bulkredditdownloader/{downloaders => site_downloaders}/gallery.py (98%) rename bulkredditdownloader/{downloaders => site_downloaders}/gfycat.py (92%) rename bulkredditdownloader/{downloaders => site_downloaders}/gif_delivery_network.py (95%) rename bulkredditdownloader/{downloaders => site_downloaders}/imgur.py (97%) rename bulkredditdownloader/{downloaders => site_downloaders}/redgifs.py (94%) rename bulkredditdownloader/{downloaders => site_downloaders}/self_post.py (95%) rename bulkredditdownloader/{downloaders => site_downloaders}/vreddit.py (96%) rename bulkredditdownloader/{downloaders => site_downloaders}/youtube.py (95%) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 773f719..6fd914b 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -14,16 +14,16 @@ from prawcore.exceptions import InsufficientScope from bulkredditdownloader.arguments import Arguments from bulkredditdownloader.config import Config -from bulkredditdownloader.downloaders.direct import Direct -from bulkredditdownloader.downloaders.erome import Erome -from bulkredditdownloader.downloaders.gallery import Gallery -from bulkredditdownloader.downloaders.gfycat import Gfycat -from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork -from bulkredditdownloader.downloaders.imgur import Imgur -from bulkredditdownloader.downloaders.redgifs import Redgifs -from bulkredditdownloader.downloaders.self_post import SelfPost -from bulkredditdownloader.downloaders.vreddit import VReddit -from bulkredditdownloader.downloaders.youtube import Youtube +from bulkredditdownloader.site_downloaders.direct import Direct +from bulkredditdownloader.site_downloaders.erome import Erome +from bulkredditdownloader.site_downloaders.gallery import Gallery +from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bulkredditdownloader.site_downloaders.imgur import Imgur +from bulkredditdownloader.site_downloaders.redgifs import Redgifs +from bulkredditdownloader.site_downloaders.self_post import 
SelfPost +from bulkredditdownloader.site_downloaders.vreddit import VReddit +from bulkredditdownloader.site_downloaders.youtube import Youtube from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError, ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError, TypeInSkip, full_exc_info) diff --git a/bulkredditdownloader/downloaders/__init__.py b/bulkredditdownloader/site_downloaders/__init__.py similarity index 100% rename from bulkredditdownloader/downloaders/__init__.py rename to bulkredditdownloader/site_downloaders/__init__.py diff --git a/bulkredditdownloader/downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py similarity index 100% rename from bulkredditdownloader/downloaders/base_downloader.py rename to bulkredditdownloader/site_downloaders/base_downloader.py diff --git a/bulkredditdownloader/downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py similarity index 87% rename from bulkredditdownloader/downloaders/direct.py rename to bulkredditdownloader/site_downloaders/direct.py index 5fe97cd..95ac00f 100644 --- a/bulkredditdownloader/downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -2,7 +2,7 @@ import pathlib -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py similarity index 98% rename from bulkredditdownloader/downloaders/erome.py rename to bulkredditdownloader/site_downloaders/erome.py index 2df5937..540733f 100644 --- a/bulkredditdownloader/downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -7,7 +7,7 @@ import urllib.error import urllib.request from html.parser import HTMLParser -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py similarity index 98% rename from bulkredditdownloader/downloaders/gallery.py rename to bulkredditdownloader/site_downloaders/gallery.py index d877e4e..59334be 100644 --- a/bulkredditdownloader/downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -7,7 +7,7 @@ import urllib.parse import requests -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py similarity index 92% rename from bulkredditdownloader/downloaders/gfycat.py rename to bulkredditdownloader/site_downloaders/gfycat.py index 9d2b3bb..bd1d694 100644 --- a/bulkredditdownloader/downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -7,7 +7,7 @@ import urllib.request from bs4 import BeautifulSoup -from 
bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Gfycat(GifDeliveryNetwork): diff --git a/bulkredditdownloader/downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py similarity index 95% rename from bulkredditdownloader/downloaders/gif_delivery_network.py rename to bulkredditdownloader/site_downloaders/gif_delivery_network.py index 52caf4c..85252cb 100644 --- a/bulkredditdownloader/downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -5,7 +5,7 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py similarity index 97% rename from bulkredditdownloader/downloaders/imgur.py rename to bulkredditdownloader/site_downloaders/imgur.py index 6f05b26..b1c2016 100644 --- a/bulkredditdownloader/downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -6,8 +6,8 @@ import logging import requests -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.downloaders.direct import Direct +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError, TypeInSkip) from bulkredditdownloader.utils import GLOBAL, nameCorrector diff --git a/bulkredditdownloader/downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py similarity index 94% rename from bulkredditdownloader/downloaders/redgifs.py rename to bulkredditdownloader/site_downloaders/redgifs.py index 98224aa..2f5f520 100644 --- a/bulkredditdownloader/downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -6,7 +6,7 @@ import urllib.request from bs4 import BeautifulSoup -from bulkredditdownloader.downloaders.gif_delivery_network import GifDeliveryNetwork +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError diff --git a/bulkredditdownloader/downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py similarity index 95% rename from bulkredditdownloader/downloaders/self_post.py rename to bulkredditdownloader/site_downloaders/self_post.py index 2325711..c94df7e 100644 --- a/bulkredditdownloader/downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -5,7 +5,7 @@ import logging import pathlib from pathlib import Path -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py similarity index 96% rename from bulkredditdownloader/downloaders/vreddit.py rename to 
bulkredditdownloader/site_downloaders/vreddit.py index 3cce613..2b4ee03 100644 --- a/bulkredditdownloader/downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -5,7 +5,7 @@ import os import pathlib import subprocess -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py similarity index 95% rename from bulkredditdownloader/downloaders/youtube.py rename to bulkredditdownloader/site_downloaders/youtube.py index abde54a..afabf66 100644 --- a/bulkredditdownloader/downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -7,7 +7,7 @@ import sys import youtube_dl -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import FileAlreadyExistsError from bulkredditdownloader.utils import GLOBAL diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py index a87d426..951ef81 100644 --- a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ b/bulkredditdownloader/tests/downloaders/test_base_downloader.py @@ -5,7 +5,7 @@ from pathlib import Path import pytest -from bulkredditdownloader.downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @pytest.mark.parametrize(('test_bytes', 'expected'), ((b'test', '098f6bcd4621d373cade4e832627b4f6'), From b1f0632a80726f0a22a5a8e7daee0b803ae37936 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 7 Feb 2021 20:23:08 +1000 Subject: [PATCH 011/276] Add download filter class --- bulkredditdownloader/download_filter.py | 39 ++++++++++++++ .../tests/test_download_filter.py | 54 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 bulkredditdownloader/download_filter.py create mode 100644 bulkredditdownloader/tests/test_download_filter.py diff --git a/bulkredditdownloader/download_filter.py b/bulkredditdownloader/download_filter.py new file mode 100644 index 0000000..806fd0d --- /dev/null +++ b/bulkredditdownloader/download_filter.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re + + +class DownloadFilter: + def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None): + self.excluded_extensions = excluded_extensions + self.excluded_domains = excluded_domains + + def check_url(self, url: str) -> bool: + """Return whether a URL is allowed or not""" + if not self._check_extension(url): + return False + elif not self._check_domain(url): + return False + else: + return True + + def _check_extension(self, url: str) -> bool: + if not self.excluded_extensions: + return True + combined_extensions = '|'.join(self.excluded_extensions) + pattern = re.compile(r'.*({})$'.format(combined_extensions)) + if re.match(pattern, url): + return False + else: + return True + + def _check_domain(self, url: str) -> bool: + if not self.excluded_domains: + return True + combined_domains = '|'.join(self.excluded_domains) + pattern = re.compile(r'https?://.*({}).*'.format(combined_domains)) + if re.match(pattern, url): + return False + else: + return True diff --git 
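# The DownloadFilter added above joins each exclusion list into a single
# alternation regex at check time. Example usage (the filter values here are
# illustrative):

from bulkredditdownloader.download_filter import DownloadFilter

download_filter = DownloadFilter(excluded_extensions=['mp4', 'mp3'],
                                 excluded_domains=['reddit.com'])

assert download_filter.check_url('https://example.com/test.png') is True
assert download_filter.check_url('https://example.com/test.mp4') is False
assert download_filter.check_url('http://reddit.com/test.gif') is False
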
a/bulkredditdownloader/tests/test_download_filter.py b/bulkredditdownloader/tests/test_download_filter.py new file mode 100644 index 0000000..c8957a5 --- /dev/null +++ b/bulkredditdownloader/tests/test_download_filter.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import pytest + +from bulkredditdownloader.download_filter import DownloadFilter + + +@pytest.fixture() +def download_filter() -> DownloadFilter: + return DownloadFilter(['mp4', 'mp3'], ['test.com', 'reddit.com']) + + +@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', False), + ('test.avi', True), + ('test.random.mp3', False) + )) +def test_filter_extension(test_url: str, expected: bool, download_filter: DownloadFilter): + result = download_filter._check_extension(test_url) + assert result == expected + + +@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', True), + ('http://reddit.com/test.mp4', False), + ('http://reddit.com/test.gif', False), + ('https://www.example.com/test.mp4', True), + ('https://www.example.com/test.png', True), + )) +def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadFilter): + result = download_filter._check_domain(test_url) + assert result == expected + + +@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', False), + ('test.gif', True), + ('https://www.example.com/test.mp4', False), + ('https://www.example.com/test.png', True), + ('http://reddit.com/test.mp4', False), + ('http://reddit.com/test.gif', False), + )) +def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter): + result = download_filter.check_url(test_url) + assert result == expected + + +@pytest.mark.parametrize('test_url', ('test.mp3', + 'test.mp4', + 'http://reddit.com/test.mp4', + 't', + )) +def test_filter_empty_filter(test_url: str): + download_filter = DownloadFilter() + result = download_filter.check_url(test_url) + assert result is True From cd2c511db2dc4a9272a40e20f32697b7c4bdaf90 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:08:22 +1000 Subject: [PATCH 012/276] Add default config file --- default_config.cfg | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 default_config.cfg diff --git a/default_config.cfg b/default_config.cfg new file mode 100644 index 0000000..0b1b606 --- /dev/null +++ b/default_config.cfg @@ -0,0 +1,3 @@ +[DEFAULT] +client_id = U-6gk4ZCh3IeNQ +client_secret = 7CZHY6AmKweZME5s50SfDGylaPg \ No newline at end of file From 358357590fc1cfd5777e94629bb9f4dd2b76af9d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:08:47 +1000 Subject: [PATCH 013/276] Add file name formatter class --- bulkredditdownloader/file_name_formatter.py | 40 ++++++++++ .../tests/test_file_name_formatter.py | 74 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 bulkredditdownloader/file_name_formatter.py create mode 100644 bulkredditdownloader/tests/test_file_name_formatter.py diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py new file mode 100644 index 0000000..461947e --- /dev/null +++ b/bulkredditdownloader/file_name_formatter.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +from pathlib import Path + +import praw.models + +from bulkredditdownloader.resource import Resource + + +class FileNameFormatter: + def __init__(self, file_format_string: str, directory_format_string: str): + self.file_format_string = file_format_string + self.directory_format_string = directory_format_string + + 
@staticmethod + def _format_name(submission: praw.models.Submission, format_string: str) -> str: + submission_attributes = { + 'title': submission.title, + 'subreddit': submission.subreddit.display_name, + 'redditor': submission.author.name, + 'postid': submission.id, + 'upvotes': submission.score, + 'flair': submission.link_flair_text, + 'date': submission.created_utc + } + result = format_string + for key in submission_attributes.keys(): + if re.search(r'(?i).*{{{}}}.*'.format(key), result): + result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result) + + result = result.replace('/', '') + return result + + def format_path(self, resource: Resource, destination_directory: Path) -> Path: + subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) + file_path = subfolder / (str(self._format_name(resource.source_submission, + self.file_format_string)) + resource.extension) + return file_path diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py new file mode 100644 index 0000000..94a6245 --- /dev/null +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from pathlib import Path +from unittest.mock import Mock + +import praw.models +import pytest + +from bulkredditdownloader.file_name_formatter import FileNameFormatter +from bulkredditdownloader.resource import Resource + + +@pytest.fixture() +def submission() -> Mock: + test = Mock() + test.title = 'name' + test.subreddit.display_name = 'randomreddit' + test.author.name = 'person' + test.id = '12345' + test.score = 1000 + test.link_flair_text = 'test_flair' + test.created_utc = 123456789 + return test + + +@pytest.fixture() +def reddit_submission() -> praw.models.Submission: + rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') + return rd.submission(id='lgilgt') + + +@pytest.mark.parametrize(('format_string', 'expected'), (('{SUBREDDIT}', 'randomreddit'), + ('{REDDITOR}', 'person'), + ('{POSTID}', '12345'), + ('{UPVOTES}', '1000'), + ('{FLAIR}', 'test_flair'), + ('{DATE}', '123456789'), + ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345') + )) +def test_format_name_mock(format_string: str, expected: str, submission: Mock): + result = FileNameFormatter._format_name(submission, format_string) + assert result == expected + + +@pytest.mark.parametrize(('format_string', 'expected'), + (('{SUBREDDIT}', 'Mindustry'), + ('{REDDITOR}', 'Gamer_player_boi'), + ('{POSTID}', 'lgilgt'), + ('{FLAIR}', 'Art'), + ('{SUBREDDIT}_{TITLE}', 'Mindustry_Toxopid that is NOT humane >:('), + ('{REDDITOR}_{TITLE}_{POSTID}', 'Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt') + )) +def test_format_name_real(format_string: str, expected: str, reddit_submission: praw.models.Submission): + result = FileNameFormatter._format_name(reddit_submission, format_string) + assert result == expected + + +@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), + (('{SUBREDDIT}', '{POSTID}', 'test/Mindustry/lgilgt.png'), + ('{SUBREDDIT}', '{TITLE}_{POSTID}', + 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt.png'), + ('{SUBREDDIT}', '{REDDITOR}_{TITLE}_{POSTID}', + 'test/Mindustry/Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt.png') + )) +def test_format_full( + format_string_directory: str, + format_string_file: str, + expected: str, + 
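# FileNameFormatter._format_name above substitutes {TOKEN} fields
# case-insensitively via re.sub. A quick round trip with a stand-in
# submission object (a SimpleNamespace mirroring the Mock in the tests, not
# a real praw Submission):

from types import SimpleNamespace

from bulkredditdownloader.file_name_formatter import FileNameFormatter

submission = SimpleNamespace(
    title='name',
    subreddit=SimpleNamespace(display_name='randomreddit'),
    author=SimpleNamespace(name='person'),
    id='12345',
    score=1000,
    link_flair_text='test_flair',
    created_utc=123456789,
)

assert FileNameFormatter._format_name(submission, '{REDDITOR}_{TITLE}_{POSTID}') == 'person_name_12345'
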
reddit_submission: praw.models.Submission): + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', b'') + test_formatter = FileNameFormatter(format_string_file, format_string_directory) + result = test_formatter.format_path(test_resource, Path('test')) + assert str(result) == expected From 81293db8e9050e205bd369189635aa0c9ee1cc5a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:09:37 +1000 Subject: [PATCH 014/276] Add downloader factory --- .../site_downloaders/download_factory.py | 31 +++++++++++++++++++ .../downloaders/test_download_factory.py | 25 +++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 bulkredditdownloader/site_downloaders/download_factory.py create mode 100644 bulkredditdownloader/tests/downloaders/test_download_factory.py diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py new file mode 100644 index 0000000..64ebc1b --- /dev/null +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import re +from typing import Type + +from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.direct import Direct +from bulkredditdownloader.site_downloaders.erome import Erome +from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from bulkredditdownloader.site_downloaders.imgur import Imgur +from bulkredditdownloader.site_downloaders.redgifs import Redgifs + + +class DownloadFactory: + @staticmethod + def pull_lever(url: str) -> Type[BaseDownloader]: + url_beginning = r'\s*(https?://(www.)?)' + if re.match(url_beginning + r'gfycat.com.*', url): + return Gfycat + elif re.match(url_beginning + r'erome.com.*', url): + return Erome + elif re.match(url_beginning + r'imgur.*', url): + return Imgur + elif re.match(url_beginning + r'redgifs.com', url): + return Redgifs + elif re.match(url_beginning + r'[vi].redd\.it.*', url): + return Direct + else: + raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py new file mode 100644 index 0000000..613296a --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import pytest + +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory +from bulkredditdownloader.site_downloaders.erome import Erome +from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from bulkredditdownloader.site_downloaders.imgur import Imgur +from bulkredditdownloader.site_downloaders.redgifs import Redgifs + + +@pytest.mark.parametrize('test_url', ('https://gfycat.com/joyfulpitifulirishterrier', + 'https://gfycat.com/blaringaridjellyfish-jensen-ackles-supernatural')) +def test_factory_gfycat(test_url: str): + result = DownloadFactory.pull_lever(test_url) + assert result is Gfycat + + +@pytest.mark.parametrize('test_url', ('https://www.erome.com/a/bbezvaBn', + 'https://www.erome.com/a/p14JFlnm')) +def test_factory_erome(test_url): + result = DownloadFactory.pull_lever(test_url) + assert result is Erome From c20fab25947176c9a456cfac9ba09ad90f5b7cfe Mon Sep 17 00:00:00 
2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:09:49 +1000 Subject: [PATCH 015/276] Add resource class --- bulkredditdownloader/resource.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 bulkredditdownloader/resource.py diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py new file mode 100644 index 0000000..cf0ed90 --- /dev/null +++ b/bulkredditdownloader/resource.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import hashlib +import re + +from praw.models import Submission + + +class Resource: + def __init__(self, source_submission: Submission, url: str, content: bytes): + self.source_submission = source_submission + self.content = content + self.url = url + self.hash = hashlib.md5(content) + self.extension = self._get_extension(url) + + @staticmethod + def _get_extension(url: str) -> str: + pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') + if results := re.search(pattern, url): + if len(results.groups()) > 1: + return results[0] + if "v.redd.it" not in url: + return '.jpg' + else: + return '.mp4' From a72abd6603d443d3b6af47c3a6bc95bbd84ef70c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:10:13 +1000 Subject: [PATCH 016/276] Update requirements with appdirs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1a68e57..440a0a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +appdirs bs4 requests praw From a7f1db14e5058f9c46e5fa15197a6bcf684ef46e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 09:10:40 +1000 Subject: [PATCH 017/276] Move to different program structure --- bulkredditdownloader/__main__.py | 468 ++++++------------ bulkredditdownloader/arguments.py | 153 ------ bulkredditdownloader/config.py | 109 ---- bulkredditdownloader/downloader.py | 184 +++++++ bulkredditdownloader/errors.py | 125 +---- bulkredditdownloader/parser.py | 234 --------- bulkredditdownloader/program_mode.py | 241 --------- bulkredditdownloader/reddit.py | 91 ---- bulkredditdownloader/searcher.py | 341 ------------- .../site_downloaders/base_downloader.py | 101 +--- .../site_downloaders/direct.py | 12 +- .../site_downloaders/erome.py | 58 +-- .../site_downloaders/gallery.py | 58 +-- .../site_downloaders/gfycat.py | 4 +- .../site_downloaders/gif_delivery_network.py | 16 +- .../site_downloaders/imgur.py | 76 +-- .../site_downloaders/redgifs.py | 10 +- .../site_downloaders/self_post.py | 53 +- .../site_downloaders/vreddit.py | 60 +-- .../site_downloaders/youtube.py | 65 +-- bulkredditdownloader/store.py | 2 + .../tests/downloaders/test_base_downloader.py | 50 +- bulkredditdownloader/utils.py | 90 ---- setup.py | 36 +- 24 files changed, 504 insertions(+), 2133 deletions(-) delete mode 100644 bulkredditdownloader/arguments.py delete mode 100644 bulkredditdownloader/config.py create mode 100644 bulkredditdownloader/downloader.py delete mode 100644 bulkredditdownloader/parser.py delete mode 100644 bulkredditdownloader/program_mode.py delete mode 100644 bulkredditdownloader/reddit.py delete mode 100644 bulkredditdownloader/searcher.py delete mode 100644 bulkredditdownloader/utils.py diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 6fd914b..77c0088 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -4,338 +4,158 @@ This program downloads imgur, gfycat and direct image and video links of saved posts from a reddit account. 
It is written in Python 3.
 """
+
+import argparse
 import logging
-import os
 import sys
-import time
-from io import StringIO
-from pathlib import Path
-from prawcore.exceptions import InsufficientScope
-from bulkredditdownloader.arguments import Arguments
-from bulkredditdownloader.config import Config
-from bulkredditdownloader.site_downloaders.direct import Direct
-from bulkredditdownloader.site_downloaders.erome import Erome
-from bulkredditdownloader.site_downloaders.gallery import Gallery
-from bulkredditdownloader.site_downloaders.gfycat import Gfycat
-from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
-from bulkredditdownloader.site_downloaders.imgur import Imgur
-from bulkredditdownloader.site_downloaders.redgifs import Redgifs
-from bulkredditdownloader.site_downloaders.self_post import SelfPost
-from bulkredditdownloader.site_downloaders.vreddit import VReddit
-from bulkredditdownloader.site_downloaders.youtube import Youtube
-from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, DomainInSkip, FailedToDownload, FileAlreadyExistsError,
-                                         ImgurLimitError, ImgurLoginError, InvalidJSONFile, NoSuitablePost, NotADownloadableLinkError,
-                                         TypeInSkip, full_exc_info)
-from bulkredditdownloader.json_helper import JsonFile
-from bulkredditdownloader.program_mode import ProgramMode
-from bulkredditdownloader.reddit import Reddit
-from bulkredditdownloader.searcher import getPosts
-from bulkredditdownloader.store import Store
-from bulkredditdownloader.utils import GLOBAL, createLogFile, nameCorrector, printToFile
+from bulkredditdownloader.downloader import RedditDownloader
+from bulkredditdownloader.errors import BulkDownloaderException
-from time import sleep
-
-__author__ = "Ali Parlakci"
-__license__ = "GPL"
-__version__ = "1.10.0"
-__maintainer__ = "Ali Parlakci"
-__email__ = "parlakciali@gmail.com"
+logger = logging.getLogger()
+
+parser = argparse.ArgumentParser(allow_abbrev=False,
+                                 description="This program downloads media from reddit posts")
+
+
+def _add_options():
+    parser.add_argument("directory",
+                        help="Specifies the directory where posts will be downloaded to",
+                        metavar="DIRECTORY")
+    parser.add_argument("--verbose", "-v",
+                        help="Verbose Mode",
+                        action="store_true",
+                        default=False)
+    parser.add_argument("--quit", "-q",
+                        help="Auto quit after the process finishes",
+                        action="store_true",
+                        default=False)
+    parser.add_argument("--link", "-l",
+                        help="Get posts from link",
+                        metavar="link")
+    parser.add_argument("--saved",
+                        action="store_true",
+                        required="--unsave" in sys.argv,
+                        help="Triggers saved mode")
+    parser.add_argument("--unsave",
+                        action="store_true",
+                        help="Unsaves downloaded posts")
+    parser.add_argument("--submitted",
+                        action="store_true",
+                        help="Gets posts of --user")
+    parser.add_argument("--upvoted",
+                        action="store_true",
+                        help="Gets upvoted posts of --user")
+    parser.add_argument("--log",
+                        help="Takes a log file (JSON) created by this program, reads the posts, and tries "
+                             "downloading them again.",
+                        metavar="LOG FILE")
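+    # A hypothetical invocation combining the options above (the directory and
+    # subreddit are made-up examples, not defaults):
+    #     python3 -m bulkredditdownloader ./downloads --subreddit pics --sort top --limit 10
+    parser.add_argument("--subreddit",
+                        nargs="+",
+                        help="Triggers subreddit mode and takes subreddit's name without r/. 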
use \"frontpage\" " + "for frontpage", + metavar="SUBREDDIT", + type=str) + parser.add_argument("--multireddit", + help="Triggers multireddit mode and takes multireddit's name without m", + metavar="MULTIREDDIT", + type=str) + parser.add_argument("--user", + help="reddit username if needed. use \"me\" for current user", + required="--multireddit" in sys.argv or "--submitted" in sys.argv, + metavar="redditor", + type=str) + parser.add_argument("--search", + help="Searches for given query in given subreddits", + metavar="query", + type=str) + parser.add_argument("--sort", + help="Either hot, top, new, controversial, rising or relevance default: hot", + choices=["hot", "top", "new", "controversial", "rising", "relevance"], + metavar="SORT TYPE", + type=str) + parser.add_argument("--limit", + help="default: unlimited", + metavar="Limit", + type=int) + parser.add_argument("--time", + help="Either hour, day, week, month, year or all. default: all", + choices=["all", "hour", "day", "week", "month", "year"], + metavar="TIME_LIMIT", + type=str) + parser.add_argument("--skip", + nargs="+", + help="Skip posts with given type", + type=str, + choices=["images", "videos", "gifs", "self"], + default=[]) + parser.add_argument("--skip-domain", + nargs="+", + help="Skip posts with given domain", + type=str, + default=[]) + parser.add_argument("--set-folderpath", + action="store_true", + help="Set custom folderpath", + default='{SUBREDDIT}' + ) + parser.add_argument("--set-filename", + action="store_true", + help="Set custom filename", + default='{REDDITOR}_{TITLE}_{POSTID}' + ) + parser.add_argument("--set-default-directory", + action="store_true", + help="Set a default directory to be used in case no directory is given", + ) + parser.add_argument("--set-default-options", + action="store_true", + help="Set default options to use everytime program runs", + ) + parser.add_argument("--use-local-config", + action="store_true", + help="Creates a config file in the program's directory" + " and uses it. 
Useful for having multiple configs",
+                        )
+    parser.add_argument("--no-dupes",
+                        action="store_true",
+                        help="Do not download duplicate posts on different subreddits",
+                        )
+    parser.add_argument("--downloaded-posts",
+                        help="Use a hash file to keep track of downloaded files",
+                        type=str
+                        )
+    parser.add_argument("--no-download",
+                        action="store_true",
+                        help="Just save posts into the POSTS.json file without downloading"
+                        )
+
+
+def _setup_logging(verbosity: int):
+    logger.setLevel(1)
+    stream = logging.StreamHandler(sys.stdout)
+    formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
+    stream.setFormatter(formatter)
+    logger.addHandler(stream)
+    if verbosity <= 0:
+        stream.setLevel(logging.INFO)
     else:
-        print("File not found")
-        sys.exit()
+        stream.setLevel(logging.DEBUG)
+    logging.getLogger('praw').setLevel(logging.CRITICAL)
+    logging.getLogger('prawcore').setLevel(logging.CRITICAL)
+    logging.getLogger('urllib3').setLevel(logging.CRITICAL)
+
+
+def main(args: argparse.Namespace):
+    _setup_logging(args.verbose)
     try:
-        del content["HEADER"]
-    except KeyError:
-        pass
+        reddit_downloader = RedditDownloader(args)
+        reddit_downloader.download()
+    except BulkDownloaderException as e:
+        logger.critical(f'An error occurred: {e}')
-    posts = []
-    for post in content:
-        if not content[post][-1]['TYPE'] is None:
-            posts.append(content[post][-1])
-
-    return posts
-
-
-def isPostExists(post, directory):
-    """Figure out a file's name and checks if the file already exists"""
-
-    filename = GLOBAL.config['filename'].format(**post)
-
-    possible_extensions = [".jpg", ".png", ".mp4", ".gif", ".webm", ".md", ".mkv", ".flv"]
-
-    for extension in possible_extensions:
-
-        path = directory / Path(filename + extension)
-
-        if path.exists():
-            return True
-
-    return False
-
-
-
-def downloadPost(submission, directory):
-    downloaders = {
-        "imgur": Imgur, "gfycat": Gfycat, "erome": Erome, "direct": Direct, "self": SelfPost,
-        "redgifs": Redgifs, "gifdeliverynetwork": GifDeliveryNetwork,
-        "v.redd.it": VReddit, "youtube": Youtube, "gallery": Gallery
-    }
-
-    print()
-    if submission['TYPE'] in downloaders:
-        downloaders[submission['TYPE']](directory, submission)
-    else:
-        raise NoSuitablePost
-
-
-def download(submissions):
-    """Analyze list of submissions and call the right function
-    to download each one, catch errors, update the log files
-    """
-
-    downloaded_count = 0
-    duplicates = 0
-
-    failed_file = createLogFile("FAILED")
-
-    if GLOBAL.arguments.unsave:
-        reddit = Reddit(GLOBAL.config['credentials']['reddit']).begin()
-
-    subs_length = len(submissions)
-
-    for i in range(len(submissions)):
-        print(f"\n({i+1}/{subs_length})", end=" — ")
-        print(submissions[i]['POSTID'],
-              f"r/{submissions[i]['SUBREDDIT']}",
-              f"u/{submissions[i]['REDDITOR']}",
-              submissions[i]['FLAIR'] if submissions[i]['FLAIR'] else "",
-              sep=" — ",
-              end="")
-        print(f" – {submissions[i]['TYPE'].upper()}", end="", no_print=True)
-
-        directory = GLOBAL.directory / \
-            GLOBAL.config["folderpath"].format(**submissions[i])
-        details = {
-            **submissions[i],
-            **{"TITLE": nameCorrector(
-                submissions[i]['TITLE'],
-                reference=str(directory)
-                + GLOBAL.config['filename'].format(**submissions[i])
-                + ".ext")}
-        }
-        filename = GLOBAL.config['filename'].format(**details)
-
-        if isPostExists(details, directory):
-            print()
-            print(directory)
-            print(filename)
-            print("It already exists")
-            duplicates += 1
-            continue
-
-        if any(domain in submissions[i]['CONTENTURL'] for domain in GLOBAL.arguments.skip):
-            print()
-            
print(submissions[i]['CONTENTURL']) - print("Domain found in skip domains, skipping post...") - continue - - try: - downloadPost(details, directory) - GLOBAL.downloadedPosts.add(details['POSTID']) - - try: - if GLOBAL.arguments.unsave: - reddit.submission(id=details['POSTID']).unsave() - except InsufficientScope: - reddit = Reddit().begin() - reddit.submission(id=details['POSTID']).unsave() - - downloaded_count += 1 - - except FileAlreadyExistsError: - print("It already exists") - GLOBAL.downloadedPosts.add(details['POSTID']) - duplicates += 1 - - except ImgurLoginError: - print("Imgur login failed. \nQuitting the program as unexpected errors might occur.") - sys.exit() - - except ImgurLimitError as exception: - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), details - ]}) - - except NotADownloadableLinkError as exception: - print("{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))) - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)), - submissions[i] - ]}) - - except TypeInSkip: - print() - print(submissions[i]['CONTENTURL']) - print("Skipping post...") - - except DomainInSkip: - print() - print(submissions[i]['CONTENTURL']) - print("Skipping post...") - - except NoSuitablePost: - print("No match found, skipping...") - - except FailedToDownload: - print("Failed to download the posts, skipping...") - except AlbumNotDownloadedCompletely: - print("Album did not downloaded completely.") - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), - submissions[i] - ]}) - - except Exception as exc: - print("{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exc.__class__.__name__, info=str(exc)) - ) - - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue(), no_print=True) - - failed_file.add({int(i + 1): [ - "{class_name}: {info}".format(class_name=exc.__class__.__name__, info=str(exc)), - submissions[i] - ]}) - - if duplicates: - print(f"\nThere {'were' if duplicates > 1 else 'was'} {duplicates} duplicate{'s' if duplicates > 1 else ''}") - - if downloaded_count == 0: - print("Nothing is downloaded :(") - - else: - print(f"Total of {downloaded_count} link{'s' if downloaded_count > 1 else ''} downloaded!") - - -def printLogo(): - VanillaPrint(f"\nBulk Downloader for Reddit v{__version__}\n" - f"Written by Ali PARLAKCI – parlakciali@gmail.com\n\n" - f"https://github.com/aliparlakci/bulk-downloader-for-reddit/\n" - ) - - -def main(): - if Path("config.json").exists(): - GLOBAL.configDirectory = Path("config.json") - else: - if not Path(GLOBAL.defaultConfigDirectory).is_dir(): - os.makedirs(GLOBAL.defaultConfigDirectory) - GLOBAL.configDirectory = GLOBAL.defaultConfigDirectory / "config.json" - try: - GLOBAL.config = Config(GLOBAL.configDirectory).generate() - except InvalidJSONFile as exception: - VanillaPrint(str(exception.__class__.__name__), ">>", str(exception)) - VanillaPrint("Resolve it or remove it to proceed") - sys.exit() - - sys.argv = sys.argv + GLOBAL.config["options"].split() - - arguments = Arguments.parse() - GLOBAL.arguments = arguments - - if arguments.set_filename: - Config(GLOBAL.configDirectory).setCustomFileName() - sys.exit() - - if arguments.set_folderpath: - Config(GLOBAL.configDirectory).setCustomFolderPath() - sys.exit() - - if 
arguments.set_default_directory: - Config(GLOBAL.configDirectory).setDefaultDirectory() - sys.exit() - - if arguments.set_default_options: - Config(GLOBAL.configDirectory).setDefaultOptions() - sys.exit() - - if arguments.use_local_config: - JsonFile("config.json").add(GLOBAL.config) - sys.exit() - - if arguments.directory: - GLOBAL.directory = Path(arguments.directory.strip()) - elif "default_directory" in GLOBAL.config and GLOBAL.config["default_directory"] != "": - GLOBAL.directory = Path( - GLOBAL.config["default_directory"].format(time=GLOBAL.RUN_TIME)) - else: - GLOBAL.directory = Path(input("\ndownload directory: ").strip()) - - if arguments.downloaded_posts: - GLOBAL.downloadedPosts = Store(arguments.downloaded_posts) - else: - GLOBAL.downloadedPosts = Store() - - printLogo() - print("\n", " ".join(sys.argv), "\n", no_print=True) - - if arguments.log is not None: - log_dir = Path(arguments.log) - download(postFromLog(log_dir)) - sys.exit() - - program_mode = ProgramMode(arguments).generate() - - try: - posts = getPosts(program_mode) - except Exception as exc: - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue(), no_print=True) - print(exc) - sys.exit() - - if posts is None: - print("I could not find any posts in that URL") - sys.exit() - - if GLOBAL.arguments.no_download: - pass - else: - download(posts) - - -if __name__ == "__main__": - - GLOBAL.log_stream = StringIO() - logging.basicConfig(stream=GLOBAL.log_stream, level=logging.INFO) - - try: - VanillaPrint = print - print = printToFile - GLOBAL.RUN_TIME = str(time.strftime("%d-%m-%Y_%H-%M-%S", time.localtime(time.time()))) - main() - - except KeyboardInterrupt: - if GLOBAL.directory is None: - GLOBAL.directory = Path("../..\\") - - except Exception as exception: - if GLOBAL.directory is None: - GLOBAL.directory = Path("../..\\") - logging.error(sys.exc_info()[0].__name__, exc_info=full_exc_info(sys.exc_info())) - print(GLOBAL.log_stream.getvalue()) - - if not GLOBAL.arguments.quit: - input("\nPress enter to quit\n") +if __name__ == '__main__': + _add_options() + args = parser.parse_args() + main(args) diff --git a/bulkredditdownloader/arguments.py b/bulkredditdownloader/arguments.py deleted file mode 100644 index cbf72c7..0000000 --- a/bulkredditdownloader/arguments.py +++ /dev/null @@ -1,153 +0,0 @@ -import argparse -import sys - - -class Arguments: - @staticmethod - def parse(arguments=None): - """Initialize argparse and add arguments""" - if arguments is None: - arguments = [] - - parser = argparse.ArgumentParser(allow_abbrev=False, - description="This program downloads media from reddit posts") - parser.add_argument("--directory", "-d", - help="Specifies the directory where posts will be downloaded to", - metavar="DIRECTORY") - - parser.add_argument("--verbose", "-v", - help="Verbose Mode", - action="store_true", - default=False) - - parser.add_argument("--quit", "-q", - help="Auto quit afer the process finishes", - action="store_true", - default=False) - - parser.add_argument("--link", "-l", - help="Get posts from link", - metavar="link") - - parser.add_argument("--saved", - action="store_true", - required="--unsave" in sys.argv, - help="Triggers saved mode") - - parser.add_argument("--unsave", - action="store_true", - help="Unsaves downloaded posts") - - parser.add_argument("--submitted", - action="store_true", - help="Gets posts of --user") - - parser.add_argument("--upvoted", - action="store_true", - help="Gets upvoted posts of --user") - - 
parser.add_argument("--log", - help="Takes a log file which created by itself (json files),reads posts and tries " - "downloading them again.", - # type=argparse.FileType('r'), - metavar="LOG FILE") - - parser.add_argument("--subreddit", - nargs="+", - help="Triggers subreddit mode and takes subreddit's name without r/. use \"frontpage\" " - "for frontpage", - metavar="SUBREDDIT", - type=str) - - parser.add_argument("--multireddit", - help="Triggers multireddit mode and takes multireddit's name without m", - metavar="MULTIREDDIT", - type=str) - - parser.add_argument("--user", - help="reddit username if needed. use \"me\" for current user", - required="--multireddit" in sys.argv or "--submitted" in sys.argv, - metavar="redditor", - type=str) - - parser.add_argument( - "--search", - help="Searches for given query in given subreddits", - metavar="query", - type=str) - - parser.add_argument("--sort", - help="Either hot, top, new, controversial, rising or relevance default: hot", - choices=["hot", "top", "new", "controversial", "rising", "relevance"], - metavar="SORT TYPE", - type=str) - - parser.add_argument("--limit", - help="default: unlimited", - metavar="Limit", - type=int) - - parser.add_argument("--time", - help="Either hour, day, week, month, year or all. default: all", - choices=["all", "hour", "day", "week", "month", "year"], - metavar="TIME_LIMIT", - type=str) - - parser.add_argument("--skip", - nargs="+", - help="Skip posts with given type", - type=str, - choices=["images", "videos", "gifs", "self"], - default=[]) - - parser.add_argument("--skip-domain", - nargs="+", - help="Skip posts with given domain", - type=str, - default=[]) - - parser.add_argument("--set-folderpath", - action="store_true", - help="Set custom folderpath" - ) - - parser.add_argument("--set-filename", - action="store_true", - help="Set custom filename", - ) - - parser.add_argument("--set-default-directory", - action="store_true", - help="Set a default directory to be used in case no directory is given", - ) - - parser.add_argument("--set-default-options", - action="store_true", - help="Set default options to use everytime program runs", - ) - - parser.add_argument("--use-local-config", - action="store_true", - help="Creates a config file in the program's directory" - " and uses it. 
Useful for having multiple configs", - ) - - parser.add_argument("--no-dupes", - action="store_true", - help="Do not download duplicate posts on different subreddits", - ) - - parser.add_argument("--downloaded-posts", - help="Use a hash file to keep track of downloaded files", - type=str - ) - - parser.add_argument("--no-download", - action="store_true", - help="Just saved posts into a the POSTS.json file without downloading" - ) - - if not arguments: - return parser.parse_args() - else: - return parser.parse_args(arguments) diff --git a/bulkredditdownloader/config.py b/bulkredditdownloader/config.py deleted file mode 100644 index 36dec10..0000000 --- a/bulkredditdownloader/config.py +++ /dev/null @@ -1,109 +0,0 @@ -from bulkredditdownloader.reddit import Reddit -from bulkredditdownloader.json_helper import JsonFile -from bulkredditdownloader.utils import nameCorrector - - -class Config: - - def __init__(self, filename: str): - self.filename = filename - self.file = JsonFile(self.filename) - - def generate(self) -> dict: - self._validateCredentials() - self._readCustomFileName() - self._readCustomFolderPath() - self._readDefaultOptions() - return self.file.read() - - def setCustomFileName(self): - print(""" -IMPORTANT: Do not change the filename structure frequently. - If you did, the program could not find duplicates and - would download the already downloaded files again. - This would not create any duplicates in the directory but - the program would not be as snappy as it should be. - -Type a template file name for each post. - -You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces -The text in curly braces will be replaced with the corresponding property of an each post - -For example: {FLAIR}_{SUBREDDIT}_{REDDITOR} - -Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"]) - - filename = nameCorrector(input(">> ").upper()) - self.file.add({"filename": filename}) - - def _readCustomFileName(self): - content = self.file.read() - - if "filename" not in content: - self.file.add({"filename": "{REDDITOR}_{TITLE}_{POSTID}"}) - content = self.file.read() - - if "{POSTID}" not in content["filename"]: - self.file.add({"filename": content["filename"] + "_{POSTID}"}) - - def setCustomFolderPath(self): - print(""" -Type a folder structure (generic folder path) - -Use slash or DOUBLE backslash to separate folders - -You can use SUBREDDIT, REDDITOR, POSTID, TITLE, UPVOTES, FLAIR, DATE in curly braces -The text in curly braces will be replaced with the corresponding property of an each post - -For example: {REDDITOR}/{SUBREDDIT}/{FLAIR} - -Existing folder structure""", None if "folderpath" not in self.file.read() else self.file.read()["folderpath"]) - - folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper()) - - self.file.add({"folderpath": folderpath}) - - def _readCustomFolderPath(self, path=None): - content = self.file.read() - if "folderpath" not in content: - self.file.add({"folderpath": "{SUBREDDIT}"}) - - def setDefaultOptions(self): - print(""" -Type options to be used everytime script runs - -For example: --no-dupes --quit --limit 100 --skip youtube.com - -Existing default options:""", None if "options" not in self.file.read() else self.file.read()["options"]) - - options = input(">> ").strip("") - - self.file.add({"options": options}) - - def _readDefaultOptions(self): - content = self.file.read() - if "options" not in content: - self.file.add({"options": ""}) - - def 
_validateCredentials(self):
-        """Read credentials from config.json file"""
-        try:
-            content = self.file.read()["credentials"]
-        except BaseException:
-            self.file.add({"credentials": {}})
-            content = self.file.read()["credentials"]
-
-        if "reddit" in content and len(content["reddit"]) != 0:
-            pass
-        else:
-            Reddit().begin()
-            print()
-
-    def setDefaultDirectory(self):
-        print("""Set a default directory to use in case no directory is given
-Leave blank to reset it. You can use {time} in foler names to use to timestamp it
-For example: D:/archive/BDFR_{time}
-""")
-        print("Current default directory:", self.file.read()[
-              "default_directory"] if "default_directory" in self.file.read() else "")
-        self.file.add({"default_directory": input(">> ")})
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
new file mode 100644
index 0000000..fc10ee0
--- /dev/null
+++ b/bulkredditdownloader/downloader.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import argparse
+import configparser
+import logging
+import socket
+from datetime import datetime
+from enum import Enum, auto
+from pathlib import Path
+
+import appdirs
+import praw
+import praw.models
+
+from bulkredditdownloader.download_filter import DownloadFilter
+from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError
+from bulkredditdownloader.file_name_formatter import FileNameFormatter
+from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
+
+logger = logging.getLogger(__name__)
+
+
+class RedditTypes:
+    class SortType(Enum):
+        HOT = auto()
+        RISING = auto()
+        CONTROVERSIAL = auto()
+        NEW = auto()
+        RELEVANCE = auto()
+
+    class TimeType(Enum):
+        HOUR = auto()
+        DAY = auto()
+        WEEK = auto()
+        MONTH = auto()
+        YEAR = auto()
+        ALL = auto()
+
+
+class RedditDownloader:
+    def __init__(self, args: argparse.Namespace):
+        self.config_directories = appdirs.AppDirs('bulk_reddit_downloader')
+        self.run_time = datetime.now().isoformat()
+        self._setup_internal_objects(args)
+
+        self.reddit_lists = self._retrieve_reddit_lists(args)
+
+    def _setup_internal_objects(self, args: argparse.Namespace):
+        self.download_filter = RedditDownloader._create_download_filter(args)
+        self.time_filter = RedditDownloader._create_time_filter(args)
+        self.sort_filter = RedditDownloader._create_sort_filter(args)
+        self.file_name_formatter = RedditDownloader._create_file_name_formatter(args)
+        self._determine_directories(args)
+        self.master_hash_list = []
+        self._load_config(args)
+        if self.cfg_parser.has_option('DEFAULT', 'username') and self.cfg_parser.has_option('DEFAULT', 'password'):
+            self.authenticated = True
+
+            self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
+                                               client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
+                                               user_agent=socket.gethostname(),
+                                               username=self.cfg_parser.get('DEFAULT', 'username'),
+                                               password=self.cfg_parser.get('DEFAULT', 'password'))
+        else:
+            self.authenticated = False
+            self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
+                                               client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
+                                               user_agent=socket.gethostname())
+
+    def _retrieve_reddit_lists(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        master_list = []
+        master_list.extend(self._get_subreddits(args))
+        master_list.extend(self._get_multireddits(args))
+        master_list.extend(self._get_user_data(args))
+        return master_list
+
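+    # A worked example (hypothetical flags, as defined in __main__): running with
+    # `--subreddit pics funny --sort new` makes _retrieve_reddit_lists return one
+    # ListingGenerator per source, roughly [subreddit('pics').new(), subreddit('funny').new()],
+    # which download() then iterates submission by submission.
+    def _determine_directories(self, args: 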
argparse.Namespace):
+        self.download_directory = Path(args.directory)
+        self.logfile_directory = self.download_directory / 'LOG_FILES'
+        self.config_directory = self.config_directories.user_config_dir
+
+    def _load_config(self, args: argparse.Namespace):
+        self.cfg_parser = configparser.ConfigParser()
+        if args.use_local_config and Path('./config.cfg').exists():
+            self.cfg_parser.read(Path('./config.cfg'))
+        else:
+            self.cfg_parser.read(Path('./default_config.cfg').resolve())
+
+    def _get_subreddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if args.subreddit:
+            subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in args.subreddit]
+            if self.sort_filter is RedditTypes.SortType.NEW:
+                sort_function = praw.models.Subreddit.new
+            elif self.sort_filter is RedditTypes.SortType.RISING:
+                sort_function = praw.models.Subreddit.rising
+            elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL:
+                sort_function = praw.models.Subreddit.controversial
+            else:
+                sort_function = praw.models.Subreddit.hot
+            return [sort_function(reddit) for reddit in subreddits]
+        else:
+            return []
+
+    def _get_multireddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if args.multireddit:
+            if self.authenticated:
+                return [self.reddit_instance.multireddit(m_reddit_choice) for m_reddit_choice in args.multireddit]
+            else:
+                raise RedditAuthenticationError('Accessing multireddits requires authentication')
+        else:
+            return []
+
+    def _get_user_data(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]:
+        if any((args.upvoted, args.submitted, args.saved)):
+            if self.authenticated:
+                generators = []
+                if args.upvoted:
+                    generators.append(self.reddit_instance.redditor(args.user).upvoted)
+                if args.submitted:
+                    generators.append(self.reddit_instance.redditor(args.user).submissions)
+                if args.saved:
+                    generators.append(self.reddit_instance.redditor(args.user).saved)
+
+                return generators
+            else:
+                raise RedditAuthenticationError('Accessing user lists requires authentication')
+        else:
+            return []
+
+    @staticmethod
+    def _create_file_name_formatter(args: argparse.Namespace) -> FileNameFormatter:
+        return FileNameFormatter(args.set_filename, args.set_folderpath)
+
+    @staticmethod
+    def _create_time_filter(args: argparse.Namespace) -> RedditTypes.TimeType:
+        try:
+            return RedditTypes.TimeType[args.time.upper()]
+        except (KeyError, AttributeError):
+            return RedditTypes.TimeType.ALL
+
+    @staticmethod
+    def _create_sort_filter(args: argparse.Namespace) -> RedditTypes.SortType:
+        try:
+            return RedditTypes.SortType[args.sort.upper()]
+        except (KeyError, AttributeError):
+            return RedditTypes.SortType.HOT
+
+    @staticmethod
+    def _create_download_filter(args: argparse.Namespace) -> DownloadFilter:
+        formats = {
+            "videos": [".mp4", ".webm"],
+            "images": [".jpg", ".jpeg", ".png", ".bmp"],
+            "gifs": [".gif"],
+            "self": []
+        }
+        excluded_extensions = [extension for ext_type in args.skip for extension in formats.get(ext_type, ())]
+        return DownloadFilter(excluded_extensions, args.skip_domain)
+
+    def download(self):
+        for generator in self.reddit_lists:
+            for submission in generator:
+                self._download_submission(submission)
+
+    def _download_submission(self, submission: praw.models.Submission):
+        # TODO: check existence here
+        if self.download_filter.check_url(submission.url):
+            try:
+                downloader_class = DownloadFactory.pull_lever(submission.url)
+                downloader = downloader_class(self.download_directory, submission)
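+                # Each downloader returns a list of Resource objects; a Resource is
+                # written out only when its MD5 digest is absent from master_hash_list,
+                # so identical files reached through different posts are stored once per run.
+                content = 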
downloader.download() + for res in content: + destination = self.file_name_formatter.format_path(res, self.download_directory) + if res.hash.hexdigest() not in self.master_hash_list: + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug('Written file to {}'.format(destination)) + self.master_hash_list.append(res.hash.hexdigest()) + logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) + + logger.info('Downloaded submission {}'.format(submission.name)) + except NotADownloadableLinkError as e: + logger.error('Could not download submission {}: {}'.format(submission.name, e)) diff --git a/bulkredditdownloader/errors.py b/bulkredditdownloader/errors.py index 7bf47b9..d7c5041 100644 --- a/bulkredditdownloader/errors.py +++ b/bulkredditdownloader/errors.py @@ -1,137 +1,28 @@ -import sys +#!/usr/bin/env - -def full_exc_info(exc_info): - - def current_stack(skip=0): - try: - 1 / 0 - except ZeroDivisionError: - f = sys.exc_info()[2].tb_frame - for i in range(skip + 2): - f = f.f_back - lst = [] - while f is not None: - lst.append((f, f.f_lineno)) - f = f.f_back - return lst - - def extend_traceback(tb, stack): - - class FauxTb(): - def __init__(self, tb_frame, tb_lineno, tb_next): - self.tb_frame = tb_frame - self.tb_lineno = tb_lineno - self.tb_next = tb_next - - """Extend traceback with stack info.""" - head = tb - for tb_frame, tb_lineno in stack: - head = FauxTb(tb_frame, tb_lineno, head) - return head - - """Like sys.exc_info, but includes the full traceback.""" - t, v, tb = exc_info - full_tb = extend_traceback(tb, current_stack(1)) - return t, v, full_tb - - -class RedditLoginFailed(Exception): +class BulkDownloaderException(Exception): pass -class ImgurLoginError(Exception): +class NotADownloadableLinkError(BulkDownloaderException): pass -class FileAlreadyExistsError(Exception): +class RedditAuthenticationError(BulkDownloaderException): pass -class NotADownloadableLinkError(Exception): +class InvalidJSONFile(BulkDownloaderException): pass -class AlbumNotDownloadedCompletely(Exception): +class FailedToDownload(BulkDownloaderException): pass -class FileNameTooLong(Exception): +class ImageNotFound(BulkDownloaderException): pass -class InvalidRedditLink(Exception): - pass - - -class ProgramModeError(Exception): - pass - - -class SearchModeError(Exception): - pass - - -class RedditorNameError(Exception): - pass - - -class NoMatchingSubmissionFound(Exception): - pass - - -class NoPrawSupport(Exception): - pass - - -class NoRedditSupport(Exception): - pass - - -class MultiredditNotFound(Exception): - pass - - -class InsufficientPermission(Exception): - pass - - -class InvalidSortingType(Exception): - pass - - - -class NoSuitablePost(Exception): - pass - - -class ImgurLimitError(Exception): - pass - - -class DirectLinkNotFound(Exception): - pass - - -class InvalidJSONFile(Exception): - pass - - -class FailedToDownload(Exception): - pass - - -class TypeInSkip(Exception): - pass - - -class DomainInSkip(Exception): - pass - - -class ImageNotFound(Exception): - pass - - -class ExtensionError(Exception): +class ExtensionError(BulkDownloaderException): pass diff --git a/bulkredditdownloader/parser.py b/bulkredditdownloader/parser.py deleted file mode 100644 index e8a38f7..0000000 --- a/bulkredditdownloader/parser.py +++ /dev/null @@ -1,234 +0,0 @@ -from pprint import pprint - -try: - from bulkredditdownloader.errors import InvalidRedditLink -except ModuleNotFoundError: - from errors import 
InvalidRedditLink - - -def QueryParser(passed_queries: str) -> dict: - extracted_queries = {} - - question_mark_index = passed_queries.index("?") - header = passed_queries[:question_mark_index] - extracted_queries["HEADER"] = header - queries = passed_queries[question_mark_index + 1:] - - parsed_queries = queries.split("&") - - for query in parsed_queries: - query = query.split("=") - extracted_queries[query[0]] = query[1] - - if extracted_queries["HEADER"] == "search": - extracted_queries["q"] = extracted_queries["q"].replace("%20", " ") - - return extracted_queries - - -def LinkParser(link: str) -> dict: - result = {} - short_link = False - - if "reddit.com" not in link: - raise InvalidRedditLink("Invalid reddit link") - - splitted_link = link.split("/") - - if splitted_link[0] == "https:" or splitted_link[0] == "http:": - splitted_link = splitted_link[2:] - - try: - if (splitted_link[-2].endswith("reddit.com") and - splitted_link[-1] == "") or splitted_link[-1].endswith("reddit.com"): - - result["sort"] = "best" - return result - except IndexError: - if splitted_link[0].endswith("reddit.com"): - result["sort"] = "best" - return result - - if "redd.it" in splitted_link: - short_link = True - - if splitted_link[0].endswith("reddit.com"): - splitted_link = splitted_link[1:] - - if "comments" in splitted_link: - result = {"post": link} - return result - - elif "me" in splitted_link or \ - "u" in splitted_link or \ - "user" in splitted_link or \ - "r" in splitted_link or \ - "m" in splitted_link: - - if "r" in splitted_link: - result["subreddit"] = splitted_link[splitted_link.index("r") + 1] - - elif "m" in splitted_link: - result["multireddit"] = splitted_link[splitted_link.index("m") + 1] - result["user"] = splitted_link[splitted_link.index("m") - 1] - - else: - for index in range(len(splitted_link)): - if splitted_link[index] == "u" or splitted_link[index] == "user": - result["user"] = splitted_link[index + 1] - - elif splitted_link[index] == "me": - result["user"] = "me" - - for index in range(len(splitted_link)): - if splitted_link[index] in ["hot", "top", "new", "controversial", "rising"]: - - result["sort"] = splitted_link[index] - - if index == 0: - result["subreddit"] = "frontpage" - - elif splitted_link[index] in ["submitted", "saved", "posts", "upvoted"]: - if splitted_link[index] == "submitted" or splitted_link[index] == "posts": - result["submitted"] = {} - - elif splitted_link[index] == "saved": - result["saved"] = True - - elif splitted_link[index] == "upvoted": - result["upvoted"] = True - - elif "?" 
in splitted_link[index]: - parsed_query = QueryParser(splitted_link[index]) - if parsed_query["HEADER"] == "search": - del parsed_query["HEADER"] - result["search"] = parsed_query - - elif parsed_query["HEADER"] == "submitted" or \ - parsed_query["HEADER"] == "posts": - del parsed_query["HEADER"] - result["submitted"] = parsed_query - - else: - del parsed_query["HEADER"] - result["queries"] = parsed_query - - if not ("upvoted" in result or - "saved" in result or - "submitted" in result or - "multireddit" in result) and "user" in result: - result["submitted"] = {} - - return result - - -def LinkDesigner(link) -> dict: - attributes = LinkParser(link) - mode = {} - - if "post" in attributes: - mode["post"] = attributes["post"] - mode["sort"] = "" - mode["time"] = "" - return mode - - elif "search" in attributes: - mode["search"] = attributes["search"]["q"] - - if "restrict_sr" in attributes["search"]: - - if not (attributes["search"]["restrict_sr"] == 0 or - attributes["search"]["restrict_sr"] == "off" or - attributes["search"]["restrict_sr"] == ""): - - if "subreddit" in attributes: - mode["subreddit"] = attributes["subreddit"] - elif "multireddit" in attributes: - mode["multreddit"] = attributes["multireddit"] - mode["user"] = attributes["user"] - else: - mode["subreddit"] = "all" - else: - mode["subreddit"] = "all" - - if "t" in attributes["search"]: - mode["time"] = attributes["search"]["t"] - else: - mode["time"] = "all" - - if "sort" in attributes["search"]: - mode["sort"] = attributes["search"]["sort"] - else: - mode["sort"] = "relevance" - - if "include_over_18" in attributes["search"]: - if attributes["search"]["include_over_18"] == 1 or attributes["search"]["include_over_18"] == "on": - mode["nsfw"] = True - else: - mode["nsfw"] = False - - else: - if "queries" in attributes: - if not ("submitted" in attributes or "posts" in attributes): - - if "t" in attributes["queries"]: - mode["time"] = attributes["queries"]["t"] - else: - mode["time"] = "day" - else: - if "t" in attributes["queries"]: - mode["time"] = attributes["queries"]["t"] - else: - mode["time"] = "all" - - if "sort" in attributes["queries"]: - mode["sort"] = attributes["queries"]["sort"] - else: - mode["sort"] = "new" - else: - mode["time"] = "day" - - if "subreddit" in attributes and "search" not in attributes: - mode["subreddit"] = attributes["subreddit"] - - elif "user" in attributes and "search" not in attributes: - mode["user"] = attributes["user"] - - if "submitted" in attributes: - mode["submitted"] = True - if "sort" in attributes["submitted"]: - mode["sort"] = attributes["submitted"]["sort"] - elif "sort" in mode: - pass - else: - mode["sort"] = "new" - - if "t" in attributes["submitted"]: - mode["time"] = attributes["submitted"]["t"] - else: - mode["time"] = "all" - - elif "saved" in attributes: - mode["saved"] = True - - elif "upvoted" in attributes: - mode["upvoted"] = True - - elif "multireddit" in attributes: - mode["multireddit"] = attributes["multireddit"] - - if "sort" in attributes: - mode["sort"] = attributes["sort"] - elif "sort" in mode: - pass - else: - mode["sort"] = "hot" - - return mode - - - -if __name__ == "__main__": - while True: - link = input("> ") - pprint(LinkDesigner(link)) diff --git a/bulkredditdownloader/program_mode.py b/bulkredditdownloader/program_mode.py deleted file mode 100644 index f2361ac..0000000 --- a/bulkredditdownloader/program_mode.py +++ /dev/null @@ -1,241 +0,0 @@ -import sys -from pathlib import Path - -from bulkredditdownloader.errors import InvalidSortingType, 
ProgramModeError, RedditorNameError, SearchModeError -from bulkredditdownloader.parser import LinkDesigner -import argparse - - - -class ProgramMode: - - def __init__(self, arguments: argparse.Namespace): - self.arguments = arguments - - def generate(self) -> dict: - try: - self._validateProgramMode() - except ProgramModeError: - self._promptUser() - - program_mode = {} - - if self.arguments.user is not None: - program_mode["user"] = self.arguments.user - - if self.arguments.search is not None: - program_mode["search"] = self.arguments.search - if self.arguments.sort == "hot" or \ - self.arguments.sort == "controversial" or \ - self.arguments.sort == "rising": - self.arguments.sort = "relevance" - - if self.arguments.sort is not None: - program_mode["sort"] = self.arguments.sort - else: - if self.arguments.submitted: - program_mode["sort"] = "new" - else: - program_mode["sort"] = "hot" - - if self.arguments.time is not None: - program_mode["time"] = self.arguments.time - else: - program_mode["time"] = "all" - - if self.arguments.link is not None: - self.arguments.link = self.arguments.link.strip("\"") - - program_mode = LinkDesigner(self.arguments.link) - - if self.arguments.search is not None: - program_mode["search"] = self.arguments.search - - if self.arguments.sort is not None: - program_mode["sort"] = self.arguments.sort - - if self.arguments.time is not None: - program_mode["time"] = self.arguments.time - - elif self.arguments.subreddit is not None: - if isinstance(self.arguments.subreddit, list): - self.arguments.subreddit = "+".join(self.arguments.subreddit) - - program_mode["subreddit"] = self.arguments.subreddit - - elif self.arguments.multireddit is not None: - program_mode["multireddit"] = self.arguments.multireddit - - elif self.arguments.saved is True: - program_mode["saved"] = True - - elif self.arguments.upvoted is True: - program_mode["upvoted"] = True - - elif self.arguments.submitted is not None: - program_mode["submitted"] = True - - if self.arguments.sort == "rising": - raise InvalidSortingType("Invalid sorting type has given") - - program_mode["limit"] = self.arguments.limit - - return program_mode - - @staticmethod - def _chooseFrom(choices: list[str]): - print() - choices_by_index = list(str(x) for x in range(len(choices) + 1)) - for i in range(len(choices)): - print("{indent}[{order}] {mode}".format(indent=" " * 4, order=i + 1, mode=choices[i])) - print(" " * 4 + "[0] exit\n") - choice = input("> ") - while not choice.lower() in choices + choices_by_index + ["exit"]: - print("Invalid input\n") - input("> ") - - if choice == "0" or choice == "exit": - sys.exit() - elif choice in choices_by_index: - return choices[int(choice) - 1] - else: - return choice - - def _promptUser(self): - print("select program mode:") - program_modes = ["search", "subreddit", "multireddit", "submitted", "upvoted", "saved", "log"] - program_mode = self._chooseFrom(program_modes) - - if program_mode == "search": - self.arguments.search = input("\nquery: ") - self.arguments.subreddit = input("\nsubreddit: ") - - print("\nselect sort type:") - sort_types = ["relevance", "top", "new"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - - if program_mode == "subreddit": - subreddit_input = input("(type frontpage for all subscribed subreddits,\n" - " use plus to seperate multi subreddits:" 
- " pics+funny+me_irl etc.)\n\n" - "subreddit: ") - self.arguments.subreddit = subreddit_input - - if " " in self.arguments.subreddit: - self.arguments.subreddit = "+".join( - self.arguments.subreddit.split()) - - # DELETE THE PLUS (+) AT THE END - if not subreddit_input.lower() == "frontpage" and self.arguments.subreddit[-1] == "+": - self.arguments.subreddit = self.arguments.subreddit[:-1] - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "rising", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type in ["top", "controversial"]: - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "multireddit": - self.arguments.user = input("\nmultireddit owner: ") - self.arguments.multireddit = input("\nmultireddit: ") - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "rising", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type in ["top", "controversial"]: - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "submitted": - self.arguments.submitted = True - self.arguments.user = input("\nredditor: ") - - print("\nselect sort type:") - sort_types = ["hot", "top", "new", "controversial"] - sort_type = self._chooseFrom(sort_types) - self.arguments.sort = sort_type - - if sort_type == "top": - print("\nselect time filter:") - time_filters = ["hour", "day", "week", "month", "year", "all"] - time_filter = self._chooseFrom(time_filters) - self.arguments.time = time_filter - else: - self.arguments.time = "all" - - elif program_mode == "upvoted": - self.arguments.upvoted = True - self.arguments.user = input("\nredditor: ") - - elif program_mode == "saved": - self.arguments.saved = True - - elif program_mode == "log": - while True: - self.arguments.log = input("\nlog file directory:") - if Path(self.arguments.log).is_file(): - break - while True: - try: - self.arguments.limit = int(input("\nlimit (0 for none): ")) - if self.arguments.limit == 0: - self.arguments.limit = None - break - except ValueError: - pass - - def _validateProgramMode(self): - """Check if command-line self.arguments are given correcly, - if not, raise errors - """ - if self.arguments.user is None: - user = 0 - else: - user = 1 - - search = 1 if self.arguments.search else 0 - - modes = ["saved", "subreddit", "submitted", "log", "link", "upvoted", "multireddit"] - - values = {x: 0 if getattr(self.arguments, x) is None or - getattr(self.arguments, x) is False - else 1 - for x in modes - } - - if not sum(values[x] for x in values) == 1: - raise ProgramModeError("Invalid program mode") - - if search + values["saved"] == 2: - raise SearchModeError("You cannot search in your saved posts") - - if search + values["submitted"] == 2: - raise SearchModeError("You cannot search in submitted posts") - - if search + values["upvoted"] == 2: - raise SearchModeError("You cannot search in upvoted posts") - - if search + values["log"] == 2: - raise SearchModeError("You cannot search in log files") - - if values["upvoted"] + values["submitted"] == 1 and user == 0: - raise RedditorNameError("No redditor name given") diff --git 
a/bulkredditdownloader/reddit.py b/bulkredditdownloader/reddit.py deleted file mode 100644 index 392b860..0000000 --- a/bulkredditdownloader/reddit.py +++ /dev/null @@ -1,91 +0,0 @@ -import random -import socket -import webbrowser - -import praw -from prawcore.exceptions import ResponseException - -from bulkredditdownloader.errors import RedditLoginFailed -from bulkredditdownloader.json_helper import JsonFile -from bulkredditdownloader.utils import GLOBAL - - - -class Reddit: - - def __init__(self, refresh_token: str = None): - self.SCOPES = ['identity', 'history', 'read', 'save'] - self.PORT = 7634 - self.refresh_token = refresh_token - self.redditInstance = None - self.arguments = { - "client_id": GLOBAL.reddit_client_id, - "client_secret": GLOBAL.reddit_client_secret, - "user_agent": str(socket.gethostname()) - } - - def begin(self) -> praw.Reddit: - if self.refresh_token: - self.arguments["refresh_token"] = self.refresh_token - self.redditInstance = praw.Reddit(**self.arguments) - try: - self.redditInstance.auth.scopes() - return self.redditInstance - except ResponseException: - self.arguments["redirect_uri"] = "http://localhost:" + \ - str(self.PORT) - self.redditInstance = praw.Reddit(**self.arguments) - reddit, refresh_token = self.getRefreshToken(*self.SCOPES) - else: - self.arguments["redirect_uri"] = "http://localhost:" + \ - str(self.PORT) - self.redditInstance = praw.Reddit(**self.arguments) - reddit, refresh_token = self.getRefreshToken(*self.SCOPES) - - JsonFile(GLOBAL.configDirectory).add({"reddit_username": str( - reddit.user.me()), "reddit": refresh_token}, "credentials") - return self.redditInstance - - def recieve_connection(self) -> socket: - """Wait for and then return a connected socket.. - Opens a TCP connection on port 8080, and waits for a single client. - """ - server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(('0.0.0.0', self.PORT)) - server.listen(1) - client = server.accept()[0] - server.close() - return client - - def send_message(self, client: socket, message: str): - """Send message to client and close the connection.""" - client.send('HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')) - client.close() - - def getRefreshToken(self, scopes: list[str]) -> tuple[praw.Reddit, str]: - state = str(random.randint(0, 65000)) - url = self.redditInstance.auth.url(scopes, state, 'permanent') - print("---Setting up the Reddit API---\n") - print("Go to this URL and login to reddit:\n", url, sep="\n", end="\n\n") - webbrowser.open(url, new=2) - - client = self.recieve_connection() - data = client.recv(1024).decode('utf-8') - str(data) - param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&') - params = {key: value for (key, value) in [token.split('=') for token in param_tokens]} - if state != params['state']: - self.send_message(client, 'State mismatch. 
Expected: {} Received: {}'.format(state, params['state'])) - raise RedditLoginFailed - if 'error' in params: - self.send_message(client, params['error']) - raise RedditLoginFailed - - refresh_token = self.redditInstance.auth.authorize(params['code']) - self.send_message(client, - "" - ) - return self.redditInstance, refresh_token diff --git a/bulkredditdownloader/searcher.py b/bulkredditdownloader/searcher.py deleted file mode 100644 index 19bf1d3..0000000 --- a/bulkredditdownloader/searcher.py +++ /dev/null @@ -1,341 +0,0 @@ -import sys -import time -import urllib.request -from urllib.error import HTTPError - -from prawcore.exceptions import Forbidden, NotFound - -from bulkredditdownloader.errors import (InsufficientPermission, InvalidSortingType, MultiredditNotFound, NoMatchingSubmissionFound, - NoPrawSupport) -from bulkredditdownloader.reddit import Reddit -from praw.models.listing.generator import ListingGenerator -from bulkredditdownloader.utils import GLOBAL, createLogFile, printToFile -from praw.models import Submission - -print = printToFile - - -def getPosts(program_mode: dict) -> list[dict]: - """Call PRAW regarding to arguments and pass it to extractDetails. - Return what extractDetails has returned. - """ - reddit = Reddit(GLOBAL.config["credentials"]["reddit"]).begin() - - if program_mode["sort"] == "best": - raise NoPrawSupport("PRAW does not support that") - - if "subreddit" in program_mode: - if "search" in program_mode: - if program_mode["subreddit"] == "frontpage": - program_mode["subreddit"] = "all" - - if "user" in program_mode: - if program_mode["user"] == "me": - program_mode["user"] = str(reddit.user.me()) - - if "search" not in program_mode: - if program_mode["sort"] == "top" or program_mode["sort"] == "controversial": - keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} - # OTHER SORT TYPES DON'T TAKE TIME_FILTER - else: - keyword_params = {"limit": program_mode["limit"]} - else: - keyword_params = {"time_filter": program_mode["time"], "limit": program_mode["limit"]} - - if "search" in program_mode: - if program_mode["sort"] in ["hot", "rising", "controversial"]: - raise InvalidSortingType("Invalid sorting type has given") - - if "subreddit" in program_mode: - print( - "search for \"{search}\" in\n" - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - search=program_mode["search"], - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"] - ).upper(), no_print=True - ) - return extractDetails( - reddit.subreddit(program_mode["subreddit"]).search( - program_mode["search"], - limit=program_mode["limit"], - sort=program_mode["sort"], - time_filter=program_mode["time"] - ) - ) - - elif "multireddit" in program_mode: - raise NoPrawSupport("PRAW does not support that") - - elif "user" in program_mode: - raise NoPrawSupport("PRAW does not support that") - - elif "saved" in program_mode: - raise ("Reddit does not support that") - - if program_mode["sort"] == "relevance": - raise InvalidSortingType("Invalid sorting type has given") - - if "saved" in program_mode: - print("saved posts\nuser:{username}\nlimit={limit}\n".format( - username=reddit.user.me(), - limit=program_mode["limit"]).upper(), - no_print=True - ) - return extractDetails(reddit.user.me().saved(limit=program_mode["limit"])) - - if "subreddit" in program_mode: - - if program_mode["subreddit"] == "frontpage": - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: 
{time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails(getattr(reddit.front, program_mode["sort"])(**keyword_params)) - - else: - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - subreddit=program_mode["subreddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails( - getattr(reddit.subreddit(program_mode["subreddit"]), program_mode["sort"])(**keyword_params) - ) - print( - "subreddit: {subreddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=programMode["limit"], - sort=programMode["sort"], - subreddit=programMode["subreddit"], - time=programMode["time"] - ).upper(), noPrint=True - ) - return extractDetails( - getattr( - reddit.subreddit(programMode["subreddit"]), programMode["sort"] - )(**keyword_params) - ) - - elif "multireddit" in program_mode: - print( - "user: {user}\n" - "multireddit: {multireddit}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - user=program_mode["user"], - limit=program_mode["limit"], - sort=program_mode["sort"], - multireddit=program_mode["multireddit"], - time=program_mode["time"]).upper(), - no_print=True - ) - try: - return extractDetails( - getattr(reddit.multireddit(program_mode["user"], program_mode["multireddit"]), - program_mode["sort"] - )(**keyword_params) - ) - except NotFound: - raise MultiredditNotFound("Multireddit not found") - - elif "submitted" in program_mode: - print( - "submitted posts of {user}\nsort: {sort}\n" - "time: {time}\nlimit: {limit}\n".format( - limit=program_mode["limit"], - sort=program_mode["sort"], - user=program_mode["user"], - time=program_mode["time"]).upper(), - no_print=True - ) - return extractDetails( - getattr(reddit.redditor(program_mode["user"]).submissions, program_mode["sort"])(**keyword_params) - ) - - elif "upvoted" in program_mode: - print( - "upvoted posts of {user}\nlimit: {limit}\n".format( - user=program_mode["user"], - limit=program_mode["limit"]).upper(), - no_print=True - ) - try: - return extractDetails(reddit.redditor(program_mode["user"]).upvoted(limit=program_mode["limit"])) - except Forbidden: - raise InsufficientPermission( - "You do not have permission to do that") - - elif "post" in program_mode: - print("post: {post}\n".format(post=program_mode["post"]).upper(), no_print=True) - return extractDetails(reddit.submission(url=program_mode["post"]), single_post=True) - - -def extractDetails(posts: (ListingGenerator, Submission), single_post=False) -> list[dict]: - """Check posts and decide if it can be downloaded. - If so, create a dictionary with post details and append them to a list. - Write all of posts to file. 
Return the list - """ - post_list = [] - post_count = 1 - - all_posts = {} - - print("\nGETTING POSTS") - posts_file = createLogFile("POSTS") - - if single_post: - submission = posts - post_count += 1 - try: - details = {'POSTID': submission.id, - 'TITLE': submission.title, - 'REDDITOR': str(submission.author), - 'TYPE': None, - 'CONTENTURL': submission.url, - 'SUBREDDIT': submission.subreddit.display_name, - 'UPVOTES': submission.score, - 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) - } - except AttributeError: - pass - - if not any( - domain in submission.domain for domain in GLOBAL.arguments.skip_domain): - result = matchWithDownloader(submission) - - if result is not None: - details = {**details, **result} - post_list.append(details) - posts_file.add({post_count: details}) - - else: - try: - for submission in posts: - if post_count % 100 == 0: - sys.stdout.write("• ") - sys.stdout.flush() - - if post_count % 1000 == 0: - sys.stdout.write("\n" + " " * 14) - sys.stdout.flush() - - try: - details = {'POSTID': submission.id, - 'TITLE': submission.title, - 'REDDITOR': str(submission.author), - 'TYPE': None, - 'CONTENTURL': submission.url, - 'SUBREDDIT': submission.subreddit.display_name, - 'UPVOTES': submission.score, - 'FLAIR': submission.link_flair_text, - 'DATE': str(time.strftime("%Y-%m-%d_%H-%M", time.localtime(submission.created_utc))) - } - except AttributeError: - continue - - if details['POSTID'] in GLOBAL.downloadedPosts(): - continue - - if not any( - domain in submission.domain for domain in GLOBAL.arguments.skip_domain): - result = matchWithDownloader(submission) - - if result is not None: - details = {**details, **result} - post_list.append(details) - - all_posts[post_count] = details - post_count += 1 - - except KeyboardInterrupt: - print("\nKeyboardInterrupt", no_print=True) - - posts_file.add(all_posts) - - if not len(post_list) == 0: - print() - return post_list - else: - raise NoMatchingSubmissionFound("No matching submission was found") - - -def matchWithDownloader(submission: Submission) -> dict[str, str]: - direct_link = extractDirectLink(submission.url) - if direct_link: - return {'TYPE': 'direct', 'CONTENTURL': direct_link} - - if 'v.redd.it' in submission.domain: - bitrates = ["DASH_1080", "DASH_720", "DASH_600", "DASH_480", "DASH_360", "DASH_240"] - - for bitrate in bitrates: - video_url = submission.url + "/" + bitrate + ".mp4" - - try: - response_code = urllib.request.urlopen(video_url).getcode() - except urllib.error.HTTPError: - response_code = 0 - - if response_code == 200: - return {'TYPE': 'v.redd.it', 'CONTENTURL': video_url} - - if 'gfycat' in submission.domain: - return {'TYPE': 'gfycat'} - - if 'youtube' in submission.domain and 'watch' in submission.url: - return {'TYPE': 'youtube'} - - if 'youtu.be' in submission.domain: - url = urllib.request.urlopen(submission.url).geturl() - if 'watch' in url: - return {'TYPE': 'youtube'} - - elif 'imgur' in submission.domain: - return {'TYPE': 'imgur'} - - elif 'erome' in submission.domain: - return {'TYPE': 'erome'} - - elif 'redgifs' in submission.domain: - return {'TYPE': 'redgifs'} - - elif 'gifdeliverynetwork' in submission.domain: - return {'TYPE': 'gifdeliverynetwork'} - - if 'reddit.com/gallery' in submission.url: - return {'TYPE': 'gallery'} - - elif submission.is_self and 'self' not in GLOBAL.arguments.skip: - return {'TYPE': 'self', - 'CONTENT': submission.selftext} - - -def extractDirectLink(url: str) -> (bool, str): - 
"""Check if link is a direct image link. - If so, return URL, - if not, return False - """ - image_types = ['jpg', 'jpeg', 'png', 'mp4', 'webm', 'gif'] - if url[-1] == "/": - url = url[:-1] - - if "i.reddituploads.com" in url: - return url - - for extension in image_types: - if extension == url.split(".")[-1]: - return url - else: - return None diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 7873db7..b3fb7e6 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -1,105 +1,46 @@ #!/usr/bin/env python3 # coding=utf-8 -import hashlib import logging -import re from abc import ABC, abstractmethod from pathlib import Path import requests +from praw.models import Submission -from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip -from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.errors import FailedToDownload +from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) class BaseDownloader(ABC): - def __init__(self, directory: Path, post: dict): + def __init__(self, directory: Path, post: Submission): self.directory = directory self.post = post + self.hashes = [] @abstractmethod - def download(self): + def download(self) -> list[Resource]: raise NotImplementedError - @staticmethod - def _create_hash(content: bytes) -> str: - hash_md5 = hashlib.md5(content) - return hash_md5.hexdigest() - - @staticmethod - def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False): - formats = { - "videos": [".mp4", ".webm"], - "images": [".jpg", ".jpeg", ".png", ".bmp"], - "gifs": [".gif"], - "self": [] + def _download_resource(self, resource_url: str): + headers = { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " + "Safari/537.36 OPR/54.0.2952.64", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3", + "Accept-Encoding": "none", + "Accept-Language": "en-US,en;q=0.8", + "Connection": "keep-alive", } - - for file_type in GLOBAL.arguments.skip: - for extension in formats[file_type]: - if extension in filename: - raise TypeInSkip - - if any(domain in image_url for domain in GLOBAL.arguments.skip_domain): - raise DomainInSkip - - headers = [ - ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " - "Safari/537.36 OPR/54.0.2952.64"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"), - ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"), - ("Accept-Encoding", "none"), - ("Accept-Language", "en-US,en;q=0.8"), - ("Connection", "keep-alive") - ] - - folder_dir.mkdir(exist_ok=True) - - if "imgur" not in image_url: - addheaders = headers - else: - addheaders = None - - if not silent: - logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n") - # Loop to attempt download 3 times for i in range(3): - file_path = Path(folder_dir) / filename - - if file_path.is_file(): - raise FileAlreadyExistsError - else: - try: - download_content = requests.get(image_url, headers=addheaders).content - except ConnectionResetError: - raise FailedToDownload - - file_hash = 
BaseDownloader._create_hash(download_content) - if GLOBAL.arguments.no_dupes: - if file_hash in GLOBAL.downloadedPosts(): - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - with open(file_path, 'wb') as file: - file.write(download_content) - if not silent: - logger.info(" " * indent + "Downloaded" + " " * 10) - return + try: + download_content = requests.get(resource_url, headers=headers).content + except ConnectionResetError: + raise FailedToDownload + return Resource(self.post, resource_url, download_content) raise FailedToDownload - - @staticmethod - def _get_extension(url: str) -> str: - pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') - if results := re.search(pattern, url): - if len(results.groups()) > 1: - return results[0] - if "v.redd.it" not in url: - return '.jpg' - else: - return '.mp4' diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index 95ac00f..cb90752 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -2,18 +2,14 @@ import pathlib +from praw.models import Submission + from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.utils import GLOBAL class Direct(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL']) - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL']) + return [self._download_resource(self.post.url)] diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 540733f..84ee3c9 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -7,77 +7,39 @@ import urllib.error import urllib.request from html.parser import HTMLParser +from praw.models import Submission + +from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Erome(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): try: - images = self._get_links(self.post['CONTENTURL']) + images = self._get_links(self.post.url) except urllib.error.HTTPError: raise NotADownloadableLinkError("Not a downloadable link") - images_length = len(images) - how_many_downloaded = len(images) - duplicates = 0 - - if images_length == 1: - """Filenames are declared here""" - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] + if len(images) == 1: image = images[0] if not re.match(r'https?://.*', image): image = "https://" + image - - self._download_resource(filename, self.directory, image) + return [self._download_resource(image)] else: - filename = GLOBAL.config['filename'].format(**self.post) - logger.info(filename) - - folder_dir = 
self.directory / filename - - folder_dir.mkdir(exist_ok=True) - + out = [] for i, image in enumerate(images): - extension = self._get_extension(image) - filename = str(i + 1) + extension - if not re.match(r'https?://.*', image): image = "https://" + image - logger.info(" ({}/{})".format(i + 1, images_length)) - logger.info(" {}".format(filename)) - - try: - self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2) - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - how_many_downloaded -= 1 - - except Exception as exception: - # raise exception - logger.error("\n Could not get the file") - logger.error( - " " - + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception)) - + "\n" - ) - how_many_downloaded -= 1 - - if duplicates == images_length: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < images_length: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + out.append(self._download_resource(image)) + return out @staticmethod def _get_links(url: str) -> list[str]: diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 59334be..7a4c732 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -1,26 +1,23 @@ #!/usr/bin/env python3 import json -import pathlib import logging -import urllib.parse +import pathlib import requests +from praw.models import Submission +from bulkredditdownloader.errors import ImageNotFound, NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, - NotADownloadableLinkError, TypeInSkip) -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Gallery(BaseDownloader): - def __init__(self, directory: pathlib.Path, post): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - link = self.post['CONTENTURL'] + link = self.post.url self.raw_data = self._get_data(link) - self.download() def download(self): images = {} @@ -37,7 +34,7 @@ class Gallery(BaseDownloader): except KeyError: continue - self._download_album(images, count) + return [self._download_album(images)] @staticmethod def _get_data(link: str) -> dict: @@ -63,44 +60,9 @@ class Gallery(BaseDownloader): data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) return data - def _download_album(self, images: dict, count: int): - folder_name = GLOBAL.config['filename'].format(**self.post) - folder_dir = self.directory / folder_name - - how_many_downloaded = 0 - duplicates = 0 - - folder_dir.mkdir(exist_ok=True) - logger.info(folder_name) - + def _download_album(self, images: dict): + out = [] for i, image in enumerate(images): - path = urllib.parse.urlparse(image['url']).path - extension = pathlib.Path(path).suffix + out.append(self._download_resource(image['url'])) - filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension) - - logger.info("\n ({}/{})".format(i + 1, count)) - - try: - self._download_resource(filename, folder_dir, image['url'], indent=2) - how_many_downloaded += 1 - - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - - except TypeInSkip: - logger.info(" Skipping...") - 
how_many_downloaded += 1 - - except Exception as exception: - logger.info("\n Could not get the file") - logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exception.__class__.__name__, info=str(exception)) + "\n" - ) - logger.info(GLOBAL.log_stream.getvalue(), no_print=True) - - if duplicates == count: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < count: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + return out diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index bd1d694..1bc442d 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -6,14 +6,14 @@ import re import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Gfycat(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): super().download() diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 85252cb..ba84695 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -4,29 +4,23 @@ import pathlib import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.errors import NotADownloadableLinkError -from bulkredditdownloader.utils import GLOBAL +from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader class GifDeliveryNetwork(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): try: - self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL']) + media_url = self._get_link(self.post.url) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL']) - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"] - - self._download_resource(filename, self.directory, self.post['MEDIAURL']) + return [self._download_resource(media_url)] @staticmethod def _get_link(url: str) -> str: diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index b1c2016..d821121 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 import json -import pathlib import logging +import pathlib import requests +from praw.models import Submission +from bulkredditdownloader.errors import ExtensionError, ImageNotFound, NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct -from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, - ImageNotFound, NotADownloadableLinkError, TypeInSkip) -from 
bulkredditdownloader.utils import GLOBAL, nameCorrector logger = logging.getLogger(__name__) @@ -19,85 +18,43 @@ class Imgur(BaseDownloader): imgur_image_domain = "https://i.imgur.com/" - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) self.raw_data = {} - self.download() def download(self): - link = self.post['CONTENTURL'] + link = self.post.url if link.endswith(".gifv"): - link = link.replace(".gifv", ".mp4") - Direct(self.directory, {**self.post, 'CONTENTURL': link}) - return + direct_thing = Direct(self.directory, self.post) + return direct_thing.download() self.raw_data = self._get_data(link) if self._is_album: if self.raw_data["album_images"]["count"] != 1: - self._download_album(self.raw_data["album_images"]) + out = self._download_album(self.raw_data["album_images"]) else: - self._download_image(self.raw_data["album_images"]["images"][0]) + out = self._download_image(self.raw_data["album_images"]["images"][0]) else: - self._download_image(self.raw_data) + out = self._download_image(self.raw_data) + return out def _download_album(self, images: dict): - folder_name = GLOBAL.config['filename'].format(**self.post) - folder_dir = self.directory / folder_name - images_length = images["count"] - how_many_downloaded = 0 - duplicates = 0 - folder_dir.mkdir(exist_ok=True) - logger.info(folder_name) + out = [] for i in range(images_length): extension = self._validate_extension(images["images"][i]["ext"]) image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - filename = pathlib.Path("_".join([str(i + 1), - nameCorrector(images["images"][i]['title']), - images["images"][i]['hash']]) + extension) - - logger.info("\n ({}/{})".format(i + 1, images_length)) - - try: - self._download_resource(filename, folder_dir, image_url, indent=2) - how_many_downloaded += 1 - - except FileAlreadyExistsError: - logger.info(" The file already exists" + " " * 10, end="\n\n") - duplicates += 1 - - except TypeInSkip: - logger.info(" Skipping...") - how_many_downloaded += 1 - - except Exception as exception: - logger.info("\n Could not get the file") - logger.info( - " " - + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format( - class_name=exception.__class__.__name__, - info=str(exception) - ) - + "\n" - ) - logger.info(GLOBAL.log_stream.getvalue(), no_print=True) - - if duplicates == images_length: - raise FileAlreadyExistsError - elif how_many_downloaded + duplicates < images_length: - raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely") + out.append(self._download_resource(image_url)) + return out def _download_image(self, image: dict): extension = self._validate_extension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension - - filename = GLOBAL.config['filename'].format(**self.post) + extension - - self._download_resource(filename, self.directory, image_url) + return [self._download_resource(image_url)] def _is_album(self) -> bool: return "album_images" in self.raw_data @@ -134,9 +91,8 @@ class Imgur(BaseDownloader): @staticmethod def _validate_extension(extension_suffix: str) -> str: possible_extensions = [".jpg", ".png", ".mp4", ".gif"] - for extension in possible_extensions: if extension in extension_suffix: return extension else: - raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.") + raise ExtensionError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur') 
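The _validate_extension helper above maps Imgur's "ext" field onto one of the recognised suffixes and raises ExtensionError for anything else. A minimal sketch of that behaviour, assuming only the module layout shown in these diffs:

from bulkredditdownloader.errors import ExtensionError
from bulkredditdownloader.site_downloaders.imgur import Imgur

# Recognised suffixes are returned as-is.
assert Imgur._validate_extension('.jpg') == '.jpg'
# Matching is by substring, so '.gifv' resolves to '.gif'.
assert Imgur._validate_extension('.gifv') == '.gif'
# Anything unrecognised raises ExtensionError.
try:
    Imgur._validate_extension('.tiff')
except ExtensionError:
    pass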
diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 2f5f520..2c109d7 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -5,24 +5,22 @@ import pathlib import urllib.request from bs4 import BeautifulSoup +from praw.models import Submission -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Redgifs(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): super().download() @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it""" if '.webm' in url or '.mp4' in url or '.gif' in url: return url diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index c94df7e..05f576e 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -1,64 +1,39 @@ #!/usr/bin/env python3 -import io import logging import pathlib -from pathlib import Path +from praw.models import Submission + +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class SelfPost(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - if "self" in GLOBAL.arguments.skip: - raise TypeInSkip + return Resource(self.post, self.post.url, bytes(self.export_to_string())) - self.directory.mkdir(exist_ok=True) - filename = GLOBAL.config['filename'].format(**self.post) - - file_dir = self.directory / (filename + ".md") - logger.info(file_dir) - logger.info(filename + ".md") - - if Path.is_file(file_dir): - raise FileAlreadyExistsError - - try: - self._write_to_file(file_dir, self.post) - except FileNotFoundError: - file_dir = self.post['POSTID'] + ".md" - file_dir = self.directory / file_dir - - self._write_to_file(file_dir, self.post) - - @staticmethod - def _write_to_file(directory: pathlib.Path, post: dict): + def export_to_string(self) -> str: """Self posts are formatted here""" content = ("## [" - + post["TITLE"] + + self.post.fullname + "](" - + post["CONTENTURL"] + + self.post.url + ")\n" - + post["CONTENT"] + + self.post.selftext + "\n\n---\n\n" + "submitted to [r/" - + post["SUBREDDIT"] + + self.post.subreddit.title + "](https://www.reddit.com/r/" - + post["SUBREDDIT"] + + self.post.subreddit.title + ") by [u/" - + post["REDDITOR"] + + self.post.author.name + "](https://www.reddit.com/user/" - + post["REDDITOR"] + + self.post.author.name + ")") - - with io.open(directory, "w", encoding="utf-8") as FILE: - print(content, file=FILE) - logger.info("Downloaded") + return content diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index 
2b4ee03..d13bece 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -4,61 +4,49 @@ import logging import os import pathlib import subprocess +import tempfile +import requests +from praw.models import Submission + +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class VReddit(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - extension = ".mp4" - self.directory.mkdir(exist_ok=True) - - filename = GLOBAL.config['filename'].format(**self.post) + extension - try: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) - except Exception: - self._download_resource(filename, self.directory, self.post['CONTENTURL']) - logger.info("FFMPEG library not found, skipping merging video and audio") + except subprocess.SubprocessError: + return self._download_resource(self.post.url) else: - video_name = self.post['POSTID'] + "_video" - video_url = self.post['CONTENTURL'] - audio_name = self.post['POSTID'] + "_audio" + video_url = self.post.url audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' - logger.info(self.directory, filename, sep="\n") - - self._download_resource(video_name, self.directory, video_url, silent=True) - self._download_resource(audio_name, self.directory, audio_url, silent=True) - try: - self._merge_audio(video_name, audio_name, filename, self.directory) - except KeyboardInterrupt: - (self.directory / filename).unlink() - (self.directory / audio_name).unlink() - (self.directory / video_name).unlink() - (self.directory / filename).unlink() + with tempfile.TemporaryDirectory() as temp_dir: + video = requests.get(video_url).content + audio = requests.get(audio_url).content + with open(temp_dir / 'video', 'wb')as file: + file.write(video) + with open(temp_dir / 'audio', 'wb') as file: + file.write(audio) + self._merge_audio(temp_dir) + with open(temp_dir / 'output.mp4', 'rb') as file: + content = file.read() + return Resource(self.post, self.post.url, content) @staticmethod - def _merge_audio( - video: pathlib.Path, - audio: pathlib.Path, - filename: pathlib.Path, - directory: pathlib.Path): - input_video = str(directory / video) - input_audio = str(directory / audio) + def _merge_audio(working_directory: pathlib.Path): + input_video = working_directory / 'video' + input_audio = working_directory / 'audio' fnull = open(os.devnull, 'w') cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format( - input_audio, input_video, str(directory / filename)) + input_audio, input_video, str(working_directory / 'output.mp4')) subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) - - (directory / video).unlink() - (directory / audio).unlink() diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index afabf66..b99b2a1 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -1,64 +1,37 @@ #!/usr/bin/env python3 import logging -import os import pathlib -import sys +import tempfile import youtube_dl +from praw.models import Submission +from bulkredditdownloader.resource import Resource from 
bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.errors import FileAlreadyExistsError -from bulkredditdownloader.utils import GLOBAL logger = logging.getLogger(__name__) class Youtube(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: dict): + def __init__(self, directory: pathlib.Path, post: Submission): super().__init__(directory, post) - self.download() def download(self): - self.directory.mkdir(exist_ok=True) + return self._download_video() - filename = GLOBAL.config['filename'].format(**self.post) - logger.info(filename) + def _download_video(self) -> Resource: + with tempfile.TemporaryDirectory() as temp_dir: + ydl_opts = { + "format": "best", + "outtmpl": str(temp_dir / "test.%(ext)s"), + "playlistend": 1, + "nooverwrites": True, + "quiet": True + } + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download([self.post.url]) - self._download_video(filename, self.directory, self.post['CONTENTURL']) - - def _download_video(self, filename: str, directory: pathlib.Path, url: str): - ydl_opts = { - "format": "best", - "outtmpl": str(directory / (filename + ".%(ext)s")), - "progress_hooks": [self._hook], - "playlistend": 1, - "nooverwrites": True, - "quiet": True - } - with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - - location = directory / (filename + ".mp4") - - with open(location, 'rb') as file: - content = file.read() - - if GLOBAL.arguments.no_dupes: - try: - file_hash = self._create_hash(content) - except FileNotFoundError: - return None - if file_hash in GLOBAL.downloadedPosts(): - os.remove(location) - raise FileAlreadyExistsError - GLOBAL.downloadedPosts.add(file_hash) - - @staticmethod - def _hook(d): - if d['status'] == 'finished': - return logger.info("Downloaded") - downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6))) - file_size = int(d['total_bytes'] * (10**(-6))) - sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size)) - sys.stdout.flush() + with open(temp_dir / 'test.mp4', 'rb') as file: + content = file.read() + return Resource(self.post, self.post.url, content) diff --git a/bulkredditdownloader/store.py b/bulkredditdownloader/store.py index 79cdf43..5aba94e 100644 --- a/bulkredditdownloader/store.py +++ b/bulkredditdownloader/store.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + from os import path diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py index 951ef81..41c8335 100644 --- a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ b/bulkredditdownloader/tests/downloaders/test_base_downloader.py @@ -1,42 +1,30 @@ -#!/uasr/bin/env python3 +#!/usr/bin/env python3 # coding=utf-8 from pathlib import Path +from unittest.mock import Mock import pytest +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -@pytest.mark.parametrize(('test_bytes', 'expected'), ((b'test', '098f6bcd4621d373cade4e832627b4f6'), - (b'test2', 'ad0234829205b9033196ba818f7a872b'))) -def test_create_hash(test_bytes: bytes, expected: str): - result = BaseDownloader._create_hash(test_bytes) - assert result == expected +class BlankDownloader(BaseDownloader): + def __init__(self, directory, post): + super().__init__(directory, post) + + def download(self) -> list[Resource]: + return [self._download_resource(self.post.url)] -@pytest.mark.parametrize(('test_url', 'expected'), (('test.png', '.png'), - ('random.jpg', '.jpg'), 
- ('http://random.com/test.png', '.png'), - ('https://example.net/picture.jpg', '.jpg'), - ('https://v.redd.it/picture', '.mp4'), - ('https://v.redd.it/picture.jpg', '.jpg'), - ('https:/random.url', '.jpg') - )) -def test_get_extension(test_url: str, expected: str): - result = BaseDownloader._get_extension(test_url) - assert result == expected - - -@pytest.mark.skip -@pytest.mark.parametrize(('test_url', 'expected_hash'), (('https://www.iana.org/_img/2013.1/iana-logo-header.svg', ''), - ('', '') - )) -def test_download_resource(test_url: str, expected_hash: str, tmp_path: Path): - test_file = tmp_path / 'test' - BaseDownloader._download_resource(test_file, tmp_path, test_url) - assert test_file.exists() - with open(test_file, 'rb') as file: - content = file.read() - hash_result = BaseDownloader._create_hash(content) - assert hash_result == expected_hash +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'), +)) +def test_get_resource(test_url: str, expected_hash: str): + mock_submission = Mock + mock_submission.url = test_url + downloader = BlankDownloader(Path('.'), mock_submission) + result = downloader.download() + assert isinstance(result[0], Resource) + assert result[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/utils.py b/bulkredditdownloader/utils.py deleted file mode 100644 index f63f159..0000000 --- a/bulkredditdownloader/utils.py +++ /dev/null @@ -1,90 +0,0 @@ -import io -import sys -from os import makedirs, path -from pathlib import Path -from typing import Optional - -from bulkredditdownloader.json_helper import JsonFile - - -class GLOBAL: - """Declare global variables""" - RUN_TIME = "" - config = {'imgur_client_id': None, 'imgur_client_secret': None} - arguments = None - directory = None - defaultConfigDirectory = Path.home() / "Bulk Downloader for Reddit" - configDirectory = "" - reddit_client_id = "U-6gk4ZCh3IeNQ" - reddit_client_secret = "7CZHY6AmKweZME5s50SfDGylaPg" - printVanilla = print - log_stream = None - - @staticmethod - def downloadedPosts() -> list: - return [] - - -def createLogFile(title: str) -> JsonFile: - """Create a log file with given name - inside a folder time stampt in its name and - put given arguments inside \"HEADER\" key - """ - folder_directory = GLOBAL.directory / "LOG_FILES" / GLOBAL.RUN_TIME - - log_filename = title.upper() + '.json' - - if not path.exists(folder_directory): - makedirs(folder_directory) - - file = JsonFile(folder_directory / Path(log_filename)) - header = " ".join(sys.argv) - file.add({"HEADER": header}) - - return file - - -def printToFile(*args, no_print=False, **kwargs): - """Print to both CONSOLE and - CONSOLE LOG file in a folder time stampt in the name - """ - folder_directory = GLOBAL.directory / Path("LOG_FILES") / Path(GLOBAL.RUN_TIME) - - if not no_print or GLOBAL.arguments.verbose or "file" in kwargs: - print(*args, **kwargs) - - if not path.exists(folder_directory): - makedirs(folder_directory) - - if "file" not in kwargs: - with io.open(folder_directory / "CONSOLE_LOG.txt", "a", encoding="utf-8") as FILE: - print(*args, file=FILE, **kwargs) - - -def nameCorrector(string: str, reference: Optional[str] = None) -> str: - """Swap strange characters from given string - with underscore (_) and shorten it. 
- Return the string - """ - limit = 247 - string_length = len(string) - - if reference: - reference_length = len(reference) - total_lenght = reference_length - else: - total_lenght = string_length - - if total_lenght > limit: - limit -= reference_length - string = string[:limit - 1] - - string = string.replace(" ", "_") - - if len(string.split('\n')) > 1: - string = "".join(string.split('\n')) - - bad_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '#', '.', '@', '“', '’', '\'', '!'] - string = "".join([i if i not in bad_chars else "_" for i in string]) - - return string diff --git a/setup.py b/setup.py index ab78f46..a8c413f 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,15 @@ #!C:\Users\Ali\AppData\Local\Programs\Python\Python36\python.exe -## python setup.py build +# python setup.py build import sys -from cx_Freeze import setup, Executable + +from cx_Freeze import Executable, setup + from bulkredditdownloader.__main__ import __version__ options = { "build_exe": { - "packages":[ + "packages": [ "idna", "praw", "requests", "multiprocessing" ] } @@ -15,7 +17,7 @@ options = { if sys.platform == "win32": executables = [Executable( - "script.py", + "script.py", targetName="bulk-downloader-for-reddit.exe", shortcutName="Bulk Downloader for Reddit", shortcutDir="DesktopFolder" @@ -23,28 +25,26 @@ if sys.platform == "win32": elif sys.platform == "linux": executables = [Executable( - "script.py", + "script.py", targetName="bulk-downloader-for-reddit", shortcutName="Bulk Downloader for Reddit", shortcutDir="DesktopFolder" )] setup( - name = "Bulk Downloader for Reddit", - version = __version__, - description = "Bulk Downloader for Reddit", - author = "Ali Parlakci", + name="Bulk Downloader for Reddit", + version=__version__, + description="Bulk Downloader for Reddit", + author="Ali Parlakci", author_email="parlakciali@gmail.com", url="https://github.com/aliparlakci/bulk-downloader-for-reddit", classifiers=( - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)" - "Natural Language :: English", - "Environment :: Console", - "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)" + "Natural Language :: English", + "Environment :: Console", + "Operating System :: OS Independent", ), - executables = executables, - options = options + executables=executables, + options=options ) - - From 91ae9924c35e4cdbe6cfe338df0bb2770f745fa7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 16:31:28 +1000 Subject: [PATCH 018/276] Add search functionality --- bulkredditdownloader/downloader.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index fc10ee0..274d8d8 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -90,15 +90,18 @@ class RedditDownloader: def _get_subreddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: if args.subreddit: subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in args.subreddit] - if self.sort_filter is RedditTypes.SortType.NEW: - sort_function = praw.models.Subreddit.new - elif self.sort_filter is RedditTypes.SortType.RISING: - sort_function = praw.models.Subreddit.rising - elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: - sort_function = praw.models.Subreddit.controversial + if args.search: + return 
[reddit.search(args.search, sort=self.sort_filter.name.lower()) for reddit in subreddits] else: + if self.sort_filter is RedditTypes.SortType.NEW: + sort_function = praw.models.Subreddit.new + elif self.sort_filter is RedditTypes.SortType.RISING: + sort_function = praw.models.Subreddit.rising + elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: + sort_function = praw.models.Subreddit.controversial else: - sort_function = praw.models.Subreddit.hot - return [sort_function(reddit) for reddit in subreddits] + sort_function = praw.models.Subreddit.hot + return [sort_function(reddit) for reddit in subreddits] else: return [] From 64bc10f6aaeea4ed79a42ec3a6c2686138f9ae81 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Feb 2021 16:31:38 +1000 Subject: [PATCH 019/276] Add ARCHITECTURE file --- ARCHITECTURE.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 ARCHITECTURE.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..8f1bb5e --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,22 @@ +# Architecture + + 1. Arguments are passed to an instance of RedditDownloader + 2. Internal objects are created + + - Formatter created + - Filter created + - Configuration loaded + - Reddit instance created + + 3. Reddit lists scraped +To actually download, the following happens: + + 1. RedditDownloader uses DownloadFactory to find the right module for a submission + 2. Downloader instance created + 3. Downloader returns a list of Resource objects (a list may contain only one object) + 4. RedditDownloader checks if the file already exists + 5. RedditDownloader checks against the DownloadFilter created earlier + 6. RedditDownloader creates a formatted file path based on the Resource with FileNameFormatter + 7. 
Resource content is written to disk + \ No newline at end of file From 722e6cb73a5f83d3e98b0dd6473b6359b7c525f3 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Feb 2021 15:52:11 +1000 Subject: [PATCH 020/276] Add logging to file --- bulkredditdownloader/downloader.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 274d8d8..bc574f8 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -52,6 +52,7 @@ class RedditDownloader: self.sort_filter = RedditDownloader._create_sort_filter(args) self.file_name_formatter = RedditDownloader._create_file_name_formatter(args) self._determine_directories(args) + self._create_file_logger() self.master_hash_list = [] self._load_config(args) if self.cfg_parser.has_option('DEFAULT', 'username') and self.cfg_parser.has_option('DEFAULT', 'password'): @@ -87,6 +88,15 @@ class RedditDownloader: else: self.cfg_parser.read(Path('./default_config.cfg').resolve()) + def _create_file_logger(self): + main_logger = logging.getLogger() + file_handler = logging.FileHandler(self.logfile_directory) + formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') + file_handler.setFormatter(formatter) + file_handler.setLevel(0) + + main_logger.addHandler(file_handler) + def _get_subreddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: if args.subreddit: subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in args.subreddit] From e646ae4a843a160a9d1a064bfe780a054ea7bd1d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Feb 2021 18:52:04 +1000 Subject: [PATCH 021/276] Add existence checking --- bulkredditdownloader/downloader.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index bc574f8..532bd2f 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -176,7 +176,6 @@ class RedditDownloader: self._download_submission(submission) def _download_submission(self, submission: praw.models.Submission): - # TODO: check existence here if self.download_filter.check_url(submission.url): try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(self.download_directory, submission) content = downloader.download() for res in content: destination = self.file_name_formatter.format_path(res, self.download_directory) - if res.hash.hexdigest() not in self.master_hash_list: - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug('Written file to {}'.format(destination)) - self.master_hash_list.append(res.hash.hexdigest()) - logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) + if destination.exists(): + logger.debug('File already exists: {}'.format(destination)) + else: + if res.hash.hexdigest() not in self.master_hash_list: + # TODO: consider making a hard link/symlink here + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug('Written file to {}'.format(destination)) + self.master_hash_list.append(res.hash.hexdigest()) + logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) logger.info('Downloaded submission {}'.format(submission.name)) except NotADownloadableLinkError as e:
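The two patches above read credentials through configparser from config.cfg or default_config.cfg. A minimal sketch of a matching configuration file: the option names (client_id, client_secret, username, password) are exactly the ones these diffs read from the DEFAULT section; the values here are placeholders, not real credentials.

[DEFAULT]
client_id = <your reddit client id>
client_secret = <your reddit client secret>
; optional: supplying both of these enables the authenticated code path
username = <your reddit username>
password = <your reddit password>

The download flow laid out in ARCHITECTURE.md above, combined with the existence check just added, condenses to roughly the following. This is a simplified sketch of RedditDownloader._download_submission, not the implementation itself: the class and method names come from these diffs, but the signature is invented for illustration, and error handling is omitted.

from pathlib import Path

from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory


def download_submission(submission, formatter, download_filter, directory: Path, master_hash_list: list):
    # Skip URLs excluded by the DownloadFilter.
    if not download_filter.check_url(submission.url):
        return
    # 1-2. DownloadFactory picks the downloader module; an instance is created.
    downloader_class = DownloadFactory.pull_lever(submission.url)
    downloader = downloader_class(directory, submission)
    # 3. The downloader returns a list of Resource objects.
    for res in downloader.download():
        # 6. FileNameFormatter builds the destination path for the Resource.
        destination = formatter.format_path(res, directory)
        # 4. Files that already exist on disk are skipped.
        if destination.exists():
            continue
        # Content already written during this run is skipped via its MD5 hash.
        if res.hash.hexdigest() in master_hash_list:
            continue
        # 7. The Resource content is written to disk.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(res.content)
        master_hash_list.append(res.hash.hexdigest())

From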
70a992d299d20224864b37f896a688a49cde292c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Feb 2021 19:04:20 +1000 Subject: [PATCH 022/276] Move args to instance variable --- bulkredditdownloader/downloader.py | 89 +++++++++++++++--------------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 532bd2f..32e3e95 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -40,21 +40,22 @@ class RedditTypes: class RedditDownloader: def __init__(self, args: argparse.Namespace): + self.args = args self.config_directories = appdirs.AppDirs('bulk_reddit_downloader') self.run_time = datetime.now().isoformat() - self._setup_internal_objects(args) + self._setup_internal_objects() - self.reddit_lists = self._retrieve_reddit_lists(args) + self.reddit_lists = self._retrieve_reddit_lists() - def _setup_internal_objects(self, args: argparse.Namespace): - self.download_filter = RedditDownloader._create_download_filter(args) - self.time_filter = RedditDownloader._create_time_filter(args) - self.sort_filter = RedditDownloader._create_sort_filter(args) - self.file_name_formatter = RedditDownloader._create_file_name_formatter(args) - self._determine_directories(args) + def _setup_internal_objects(self): + self.download_filter = self._create_download_filter() + self.time_filter = self._create_time_filter() + self.sort_filter = self._create_sort_filter() + self.file_name_formatter = self._create_file_name_formatter() + self._determine_directories() self._create_file_logger() self.master_hash_list = [] - self._load_config(args) + self._load_config() if self.cfg_parser.has_option('DEFAULT', 'username') and self.cfg_parser.has_option('DEFAULT', 'password'): self.authenticated = True @@ -69,21 +70,21 @@ class RedditDownloader: client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), user_agent=socket.gethostname()) - def _retrieve_reddit_lists(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: + def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: master_list = [] - master_list.extend(self._get_subreddits(args)) - master_list.extend(self._get_multireddits(args)) - master_list.extend(self._get_user_data(args)) + master_list.extend(self._get_subreddits()) + master_list.extend(self._get_multireddits()) + master_list.extend(self._get_user_data()) return master_list - def _determine_directories(self, args: argparse.Namespace): - self.download_directory = Path(args.directory) + def _determine_directories(self): + self.download_directory = Path(self.args.directory) self.logfile_directory = self.download_directory / 'LOG_FILES' self.config_directory = self.config_directories.user_config_dir - def _load_config(self, args: argparse.Namespace): + def _load_config(self): self.cfg_parser = configparser.ConfigParser() - if args.use_local_config and Path('./config.cfg').exists(): + if self.args.use_local_config and Path('./config.cfg').exists(): self.cfg_parser.read(Path('./config.cfg')) else: self.cfg_parser.read(Path('./default_config.cfg').resolve()) @@ -97,11 +98,11 @@ class RedditDownloader: main_logger.addHandler(file_handler) - def _get_subreddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: - if args.subreddit: - subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in args.subreddit] - if args.search: - return [reddit.search(args.search, sort=self.sort_filter.name.lower()) for reddit in 
subreddits] + def _get_subreddits(self) -> list[praw.models.ListingGenerator]: + if self.args.subreddit: + subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in self.args.subreddit] + if self.args.search: + return [reddit.search(self.args.search, sort=self.sort_filter.name.lower()) for reddit in subreddits] else: if self.sort_filter is RedditTypes.SortType.NEW: sort_function = praw.models.Subreddit.new @@ -115,25 +116,25 @@ class RedditDownloader: else: return [] - def _get_multireddits(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: - if args.multireddit: + def _get_multireddits(self) -> list[praw.models.ListingGenerator]: + if self.args.multireddit: if self.authenticated: - return [self.reddit_instance.multireddit(m_reddit_choice) for m_reddit_choice in args.multireddit] + return [self.reddit_instance.multireddit(m_reddit_choice) for m_reddit_choice in self.args.multireddit] else: raise RedditAuthenticationError('Accessing multireddits requires authentication') else: return [] - def _get_user_data(self, args: argparse.Namespace) -> list[praw.models.ListingGenerator]: - if any((args.upvoted, args.submitted, args.saved)): + def _get_user_data(self) -> list[praw.models.ListingGenerator]: + if any((self.args.upvoted, self.args.submitted, self.args.saved)): if self.authenticated: generators = [] - if args.upvoted: - generators.append(self.reddit_instance.redditor(args.user).upvoted) - if args.submitted: - generators.append(self.reddit_instance.redditor(args.user).submissions) - if args.saved: - generators.append(self.reddit_instance.redditor(args.user).saved) + if self.args.upvoted: + generators.append(self.reddit_instance.redditor(self.args.user).upvoted) + if self.args.submitted: + generators.append(self.reddit_instance.redditor(self.args.user).submissions) + if self.args.saved: + generators.append(self.reddit_instance.redditor(self.args.user).saved) return generators else: @@ -141,34 +142,30 @@ class RedditDownloader: else: return [] - @staticmethod - def _create_file_name_formatter(args: argparse.Namespace) -> FileNameFormatter: - return FileNameFormatter(args.set_filename, args.set_folderpath) + def _create_file_name_formatter(self) -> FileNameFormatter: + return FileNameFormatter(self.args.set_filename, self.args.set_folderpath) - @staticmethod - def _create_time_filter(args: argparse.Namespace) -> RedditTypes.TimeType: + def _create_time_filter(self) -> RedditTypes.TimeType: try: - return RedditTypes.TimeType[args.sort.upper()] + return RedditTypes.TimeType[self.args.sort.upper()] except (KeyError, AttributeError): return RedditTypes.TimeType.ALL - @staticmethod - def _create_sort_filter(args: argparse.Namespace) -> RedditTypes.SortType: + def _create_sort_filter(self) -> RedditTypes.SortType: try: - return RedditTypes.SortType[args.time.upper()] + return RedditTypes.SortType[self.args.time.upper()] except (KeyError, AttributeError): return RedditTypes.SortType.HOT - @staticmethod - def _create_download_filter(args: argparse.Namespace) -> DownloadFilter: + def _create_download_filter(self) -> DownloadFilter: formats = { "videos": [".mp4", ".webm"], "images": [".jpg", ".jpeg", ".png", ".bmp"], "gifs": [".gif"], "self": [] } - excluded_extensions = [extension for ext_type in args.skip for extension in formats.get(ext_type, ())] - return DownloadFilter(excluded_extensions, args.skip_domain) + excluded_extensions = [extension for ext_type in self.args.skip for extension in formats.get(ext_type, ())] + return 
DownloadFilter(excluded_extensions, self.args.skip_domain) def download(self): for generator in self.reddit_lists: From eac2381a0aba3d40ddda84377042030212e73295 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Feb 2021 19:09:18 +1000 Subject: [PATCH 023/276] Re-implement --no-download flag --- bulkredditdownloader/downloader.py | 31 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 32e3e95..835ec99 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -177,20 +177,23 @@ class RedditDownloader: try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(self.download_directory, submission) - content = downloader.download() - for res in content: - destination = self.file_name_formatter.format_path(res, self.download_directory) - if destination.exists(): - logger.debug('File already exists: {}'.format(destination)) - else: - if res.hash.hexdigest() not in self.master_hash_list: - # TODO: consider making a hard link/symlink here - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug('Written file to {}'.format(destination)) - self.master_hash_list.append(res.hash.hexdigest()) - logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) + if self.args.no_download: + logger.info('Skipping download for submission {}'.format(submission.id)) + else: + content = downloader.download() + for res in content: + destination = self.file_name_formatter.format_path(res, self.download_directory) + if destination.exists(): + logger.debug('File already exists: {}'.format(destination)) + else: + if res.hash.hexdigest() not in self.master_hash_list: + # TODO: consider making a hard link/symlink here + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug('Written file to {}'.format(destination)) + self.master_hash_list.append(res.hash.hexdigest()) + logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) logger.info('Downloaded submission {}'.format(submission.name)) except NotADownloadableLinkError as e: From 4bbe41a2f844a78460f01073964990025556ca93 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Feb 2021 19:22:31 +1000 Subject: [PATCH 024/276] Re-implement --no-dupes flag --- bulkredditdownloader/downloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 835ec99..9723668 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -186,7 +186,7 @@ class RedditDownloader: if destination.exists(): logger.debug('File already exists: {}'.format(destination)) else: - if res.hash.hexdigest() not in self.master_hash_list: + if res.hash.hexdigest() not in self.master_hash_list and self.args.no_dupes: # TODO: consider making a hard link/symlink here destination.parent.mkdir(parents=True, exist_ok=True) with open(destination, 'wb') as file: @@ -194,6 +194,8 @@ class RedditDownloader: logger.debug('Written file to {}'.format(destination)) self.master_hash_list.append(res.hash.hexdigest()) logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) + else: + logger.debug(f'Resource from {res.url} downloaded elsewhere') logger.info('Downloaded submission 
{}'.format(submission.name)) except NotADownloadableLinkError as e: From 866d94f37ecac74bc1124282475617fa497206a2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 13:53:21 +1000 Subject: [PATCH 025/276] Add gallery to factory --- bulkredditdownloader/site_downloaders/download_factory.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 64ebc1b..dfe2b2d 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -8,6 +8,7 @@ from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.erome import Erome +from bulkredditdownloader.site_downloaders.gallery import Gallery from bulkredditdownloader.site_downloaders.gfycat import Gfycat from bulkredditdownloader.site_downloaders.imgur import Imgur from bulkredditdownloader.site_downloaders.redgifs import Redgifs @@ -27,5 +28,7 @@ class DownloadFactory: return Redgifs elif re.match(url_beginning + r'[vi].redd\.it.*', url): return Direct + elif re.match(url_beginning + r'reddit.com/gallery/.*', url): + return Gallery else: raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) From 8e54986357a76c6a03077ffb42876f9f5f6744c1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 13:55:33 +1000 Subject: [PATCH 026/276] Re-implement --limit option --- bulkredditdownloader/__main__.py | 1 + bulkredditdownloader/downloader.py | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 77c0088..678e4fe 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -76,6 +76,7 @@ def _add_options(): parser.add_argument("--limit", help="default: unlimited", metavar="Limit", + default=None, type=int) parser.add_argument("--time", help="Either hour, day, week, month, year or all. 
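
The download factory extended above is first-match-wins regex routing. A runnable miniature of the same idea follows; the url_beginning value is an assumption, since its definition sits outside this excerpt, and the sketch escapes the dots that the committed patterns leave as regex wildcards:

import re

url_beginning = r'(https?://)?(www\.)?'  # assumed prefix; the real constant is not shown here


def pull_lever_demo(url: str) -> str:
    if re.match(url_beginning + r'reddit\.com/gallery/.*', url):
        return 'Gallery'
    if re.match(url_beginning + r'[vi]\.redd\.it.*', url):
        return 'Direct'
    raise ValueError(f'No downloader module exists for url {url}')


assert pull_lever_demo('https://www.reddit.com/gallery/ljyy27') == 'Gallery'
assert pull_lever_demo('https://v.redd.it/abcdef') == 'Direct'
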
default: all", diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 9723668..b99066d 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -104,18 +104,22 @@ class RedditDownloader: if self.args.search: return [reddit.search(self.args.search, sort=self.sort_filter.name.lower()) for reddit in subreddits] else: - if self.sort_filter is RedditTypes.SortType.NEW: - sort_function = praw.models.Subreddit.new - elif self.sort_filter is RedditTypes.SortType.RISING: - sort_function = praw.models.Subreddit.rising - elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: - sort_function = praw.models.Subreddit.controversial - else: - sort_function = praw.models.Subreddit.hot - return [sort_function(reddit) for reddit in subreddits] + sort_function = self._determine_sort_function() + return [sort_function(reddit, limit=self.args.limit) for reddit in subreddits] else: return [] + def _determine_sort_function(self): + if self.sort_filter is RedditTypes.SortType.NEW: + sort_function = praw.models.Subreddit.new + elif self.sort_filter is RedditTypes.SortType.RISING: + sort_function = praw.models.Subreddit.rising + elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: + sort_function = praw.models.Subreddit.controversial + else: + sort_function = praw.models.Subreddit.hot + return sort_function + def _get_multireddits(self) -> list[praw.models.ListingGenerator]: if self.args.multireddit: if self.authenticated: @@ -129,10 +133,14 @@ class RedditDownloader: if any((self.args.upvoted, self.args.submitted, self.args.saved)): if self.authenticated: generators = [] + sort_function = self._determine_sort_function() if self.args.upvoted: generators.append(self.reddit_instance.redditor(self.args.user).upvoted) if self.args.submitted: - generators.append(self.reddit_instance.redditor(self.args.user).submissions) + generators.append( + sort_function( + self.reddit_instance.redditor(self.args.user).submissions, + limit=self.args.limit)) if self.args.saved: generators.append(self.reddit_instance.redditor(self.args.user).saved) From 289f7e74472618e014220a8b852c788afb0d3c62 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 13:56:02 +1000 Subject: [PATCH 027/276] Fix bug with logging --- bulkredditdownloader/downloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index b99066d..7138af2 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -82,6 +82,9 @@ class RedditDownloader: self.logfile_directory = self.download_directory / 'LOG_FILES' self.config_directory = self.config_directories.user_config_dir + self.download_directory.mkdir(exist_ok=True, parents=True) + self.logfile_directory.mkdir(exist_ok=True, parents=True) + def _load_config(self): self.cfg_parser = configparser.ConfigParser() if self.args.use_local_config and Path('./config.cfg').exists(): @@ -91,7 +94,7 @@ class RedditDownloader: def _create_file_logger(self): main_logger = logging.getLogger() - file_handler = logging.FileHandler(self.logfile_directory) + file_handler = logging.FileHandler(self.logfile_directory / 'log_output.txt') formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') file_handler.setFormatter(formatter) file_handler.setLevel(0) From f71a3c53264696fd8fa3d2145edb21b6510cb3fc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 13:56:14 +1000 Subject: [PATCH 
028/276] Fix wrong logic --- bulkredditdownloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 7138af2..a6cac45 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -197,7 +197,7 @@ class RedditDownloader: if destination.exists(): logger.debug('File already exists: {}'.format(destination)) else: - if res.hash.hexdigest() not in self.master_hash_list and self.args.no_dupes: + if res.hash.hexdigest() not in self.master_hash_list and not self.args.no_dupes: # TODO: consider making a hard link/symlink here destination.parent.mkdir(parents=True, exist_ok=True) with open(destination, 'wb') as file: From 714b6c5b729dfb02656842b5c1ecb82337604bac Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 15:12:27 +1000 Subject: [PATCH 029/276] Remove unused parameter --- bulkredditdownloader/downloader.py | 2 +- bulkredditdownloader/site_downloaders/base_downloader.py | 4 +--- bulkredditdownloader/site_downloaders/direct.py | 6 ++---- bulkredditdownloader/site_downloaders/erome.py | 5 ++--- bulkredditdownloader/site_downloaders/gallery.py | 8 ++++---- bulkredditdownloader/site_downloaders/gfycat.py | 5 ++--- .../site_downloaders/gif_delivery_network.py | 5 ++--- bulkredditdownloader/site_downloaders/imgur.py | 7 +++---- bulkredditdownloader/site_downloaders/redgifs.py | 5 ++--- bulkredditdownloader/site_downloaders/self_post.py | 5 ++--- bulkredditdownloader/site_downloaders/vreddit.py | 4 ++-- bulkredditdownloader/site_downloaders/youtube.py | 5 ++--- .../tests/downloaders/test_base_downloader.py | 6 +++--- 13 files changed, 28 insertions(+), 39 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index a6cac45..57d919b 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -187,7 +187,7 @@ class RedditDownloader: if self.download_filter.check_url(submission.url): try: downloader_class = DownloadFactory.pull_lever(submission.url) - downloader = downloader_class(self.download_directory, submission) + downloader = downloader_class(submission) if self.args.no_download: logger.info('Skipping download for submission {}'.format(submission.id)) else: diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index b3fb7e6..c8cb39a 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -3,7 +3,6 @@ import logging from abc import ABC, abstractmethod -from pathlib import Path import requests from praw.models import Submission @@ -15,8 +14,7 @@ logger = logging.getLogger(__name__) class BaseDownloader(ABC): - def __init__(self, directory: Path, post: Submission): - self.directory = directory + def __init__(self, post: Submission): self.post = post self.hashes = [] diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index cb90752..713eacf 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -1,15 +1,13 @@ #!/usr/bin/env python3 -import pathlib - from praw.models import Submission from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader class Direct(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def 
__init__(self, post: Submission): + super().__init__(post) def download(self): return [self._download_resource(self.post.url)] diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 84ee3c9..39094f6 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging -import pathlib import re import urllib.error import urllib.request @@ -16,8 +15,8 @@ logger = logging.getLogger(__name__) class Erome(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): try: diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 7a4c732..73bb2e9 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -2,7 +2,6 @@ import json import logging -import pathlib import requests from praw.models import Submission @@ -14,8 +13,8 @@ logger = logging.getLogger(__name__) class Gallery(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) link = self.post.url self.raw_data = self._get_data(link) @@ -39,7 +38,8 @@ class Gallery(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" + " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", } res = requests.get(link, headers=headers) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index 1bc442d..af94596 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import json -import pathlib import re import urllib.request @@ -12,8 +11,8 @@ from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDelive class Gfycat(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): super().download() diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index ba84695..b335ed8 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import pathlib import urllib.request from bs4 import BeautifulSoup @@ -11,8 +10,8 @@ from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader class GifDeliveryNetwork(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): try: diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 
d821121..15cefe4 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -2,7 +2,6 @@ import json import logging -import pathlib import requests from praw.models import Submission @@ -18,15 +17,15 @@ class Imgur(BaseDownloader): imgur_image_domain = "https://i.imgur.com/" - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) self.raw_data = {} def download(self): link = self.post.url if link.endswith(".gifv"): - direct_thing = Direct(self.directory, self.post) + direct_thing = Direct(self.post) return direct_thing.download() self.raw_data = self._get_data(link) diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 2c109d7..7cb54fc 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import json -import pathlib import urllib.request from bs4 import BeautifulSoup @@ -12,8 +11,8 @@ from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDelive class Redgifs(GifDeliveryNetwork): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): super().download() diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index 05f576e..cda5c78 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging -import pathlib from praw.models import Submission @@ -12,8 +11,8 @@ logger = logging.getLogger(__name__) class SelfPost(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): return Resource(self.post, self.post.url, bytes(self.export_to_string())) diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index d13bece..40df4b3 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) class VReddit(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): try: diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index b99b2a1..6184d26 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging -import pathlib import tempfile import youtube_dl @@ -14,8 +13,8 @@ logger = logging.getLogger(__name__) class Youtube(BaseDownloader): - def __init__(self, directory: pathlib.Path, post: Submission): - super().__init__(directory, post) + def __init__(self, post: Submission): + super().__init__(post) def download(self): return self._download_video() diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py index 41c8335..3644abf 100644 --- 
a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ b/bulkredditdownloader/tests/downloaders/test_base_downloader.py @@ -11,8 +11,8 @@ from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader class BlankDownloader(BaseDownloader): - def __init__(self, directory, post): - super().__init__(directory, post) + def __init__(self, post): + super().__init__(post) def download(self) -> list[Resource]: return [self._download_resource(self.post.url)] @@ -24,7 +24,7 @@ class BlankDownloader(BaseDownloader): def test_get_resource(test_url: str, expected_hash: str): mock_submission = Mock mock_submission.url = test_url - downloader = BlankDownloader(Path('.'), mock_submission) + downloader = BlankDownloader(mock_submission) result = downloader.download() assert isinstance(result[0], Resource) assert result[0].hash.hexdigest() == expected_hash From dd2804eb9768d91e1ac827233b2f07ff664823d1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 16:27:57 +1000 Subject: [PATCH 030/276] Remove unused class --- bulkredditdownloader/json_helper.py | 57 ----------------------------- 1 file changed, 57 deletions(-) delete mode 100644 bulkredditdownloader/json_helper.py diff --git a/bulkredditdownloader/json_helper.py b/bulkredditdownloader/json_helper.py deleted file mode 100644 index 5f3f7bb..0000000 --- a/bulkredditdownloader/json_helper.py +++ /dev/null @@ -1,57 +0,0 @@ -import json -import os - -from bulkredditdownloader.errors import InvalidJSONFile - - -class JsonFile: - """ Write and read JSON files - Use add(self,toBeAdded) to add to files - Use delete(self,*deletedKeys) to delete keys - """ - - file_dir = "" - - def __init__(self, file_dir: str): - self.file_dir = file_dir - if not os.path.exists(self.file_dir): - self.__writeToFile({}, create=True) - - def read(self) -> dict: - try: - with open(self.file_dir, 'r') as f: - return json.load(f) - except json.decoder.JSONDecodeError: - raise InvalidJSONFile(f"{self.file_dir} cannot be read") - - def add(self, to_be_added: dict, sub=None) -> dict: - """Takes a dictionary and merges it with json file. - It uses new key's value if a key already exists. - Returns the new content as a dictionary. - """ - data = self.read() - if sub: - data[sub] = {**data[sub], **to_be_added} - else: - data = {**data, **to_be_added} - self.__writeToFile(data) - return self.read() - - def delete(self, *delete_keys: str): - """Delete given keys from JSON file. - Returns the new content as a dictionary. 
- """ - data = self.read() - for deleteKey in delete_keys: - if deleteKey in data: - del data[deleteKey] - found = True - if not found: - return False - self.__writeToFile(data) - - def __writeToFile(self, content: (dict, list, tuple), create: bool = False): - if not create: - os.remove(self.file_dir) - with open(self.file_dir, 'w') as f: - json.dump(content, f, indent=4) From efffc3ee3fb6c17fc92169561bdda8d29a4e94d0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 16:30:39 +1000 Subject: [PATCH 031/276] Simplify errors --- bulkredditdownloader/errors.py | 14 +++----------- .../site_downloaders/base_downloader.py | 6 +++--- bulkredditdownloader/site_downloaders/gallery.py | 4 ++-- bulkredditdownloader/site_downloaders/imgur.py | 6 +++--- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/bulkredditdownloader/errors.py b/bulkredditdownloader/errors.py index d7c5041..c677b38 100644 --- a/bulkredditdownloader/errors.py +++ b/bulkredditdownloader/errors.py @@ -4,25 +4,17 @@ class BulkDownloaderException(Exception): pass -class NotADownloadableLinkError(BulkDownloaderException): - pass - - class RedditAuthenticationError(BulkDownloaderException): pass -class InvalidJSONFile(BulkDownloaderException): +class SiteDownloaderError(BulkDownloaderException): pass -class FailedToDownload(BulkDownloaderException): +class NotADownloadableLinkError(SiteDownloaderError): pass -class ImageNotFound(BulkDownloaderException): - pass - - -class ExtensionError(BulkDownloaderException): +class ResourceNotFound(SiteDownloaderError): pass diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index c8cb39a..4a0d871 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -7,7 +7,7 @@ from abc import ABC, abstractmethod import requests from praw.models import Submission -from bulkredditdownloader.errors import FailedToDownload +from bulkredditdownloader.errors import SiteDownloaderError from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) @@ -38,7 +38,7 @@ class BaseDownloader(ABC): try: download_content = requests.get(resource_url, headers=headers).content except ConnectionResetError: - raise FailedToDownload + raise SiteDownloaderError return Resource(self.post, resource_url, download_content) - raise FailedToDownload + raise SiteDownloaderError diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 73bb2e9..16b145f 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -6,7 +6,7 @@ import logging import requests from praw.models import Submission -from bulkredditdownloader.errors import ImageNotFound, NotADownloadableLinkError +from bulkredditdownloader.errors import ResourceNotFound, NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -44,7 +44,7 @@ class Gallery(BaseDownloader): } res = requests.get(link, headers=headers) if res.status_code != 200: - raise ImageNotFound(f"Server responded with {res.status_code} to {link}") + raise ResourceNotFound(f"Server responded with {res.status_code} to {link}") page_source = res.text starting_string = "_r = {" diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 15cefe4..d555800 
100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -6,7 +6,7 @@ import logging import requests from praw.models import Submission -from bulkredditdownloader.errors import ExtensionError, ImageNotFound, NotADownloadableLinkError +from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct @@ -63,7 +63,7 @@ class Imgur(BaseDownloader): cookies = {"over18": "1", "postpagebeta": "0"} res = requests.get(link, cookies=cookies) if res.status_code != 200: - raise ImageNotFound(f"Server responded with {res.status_code} to {link}") + raise ResourceNotFound(f"Server responded with {res.status_code} to {link}") page_source = requests.get(link, cookies=cookies).text starting_string = "image : " @@ -94,4 +94,4 @@ class Imgur(BaseDownloader): if extension in extension_suffix: return extension else: - raise ExtensionError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur') + raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur') From 1b40b16970c4dae86b70f377173bbffed60928bb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 17:45:10 +1000 Subject: [PATCH 032/276] Add logging message --- bulkredditdownloader/downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 57d919b..434968e 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -185,6 +185,7 @@ class RedditDownloader: def _download_submission(self, submission: praw.models.Submission): if self.download_filter.check_url(submission.url): + logger.debug('Attempting to download submission {}'.format(submission.id)) try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) From bb85fb8934aa7a80ecd13831997d8e17a9b6deb2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 17:45:41 +1000 Subject: [PATCH 033/276] Add test for site downloader gallery --- .../site_downloaders/gallery.py | 9 ++++---- .../tests/downloaders/test_gallery.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_gallery.py diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 16b145f..7125674 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -6,7 +6,7 @@ import logging import requests from praw.models import Submission -from bulkredditdownloader.errors import ResourceNotFound, NotADownloadableLinkError +from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -33,7 +33,7 @@ class Gallery(BaseDownloader): except KeyError: continue - return [self._download_album(images)] + return self._download_album(images) @staticmethod def _get_data(link: str) -> dict: @@ -62,7 +62,6 @@ class Gallery(BaseDownloader): def _download_album(self, images: dict): out = [] - for i, image in enumerate(images): - out.append(self._download_resource(image['url'])) - + for image_key in images.keys(): + 
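
Gallery._get_data, shown in context above, slices Reddit's embedded page state out of the HTML starting at the "_r = {" marker. A toy version of that extraction technique follows; the end-of-object terminator below is an assumption, as the real slicing logic falls outside the visible hunks:

import json


def extract_embedded_json(page_source: str) -> dict:
    start = page_source.index('_r = ') + len('_r = ')
    end = page_source.index('};', start) + 1  # assumed terminator
    return json.loads(page_source[start:end])


extract_embedded_json('window._r = {"posts": {"models": {}}}; window.___prefetches = [];')
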
out.append(self._download_resource(images[image_key]['url'])) return out diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py new file mode 100644 index 0000000..f590c60 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import praw.models +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.gallery import Gallery + + +@pytest.fixture() +def reddit_submission() -> praw.models.Submission: + rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') + return rd.submission(id='ljyy27') + + +def test_gallery(reddit_submission: praw.models.Submission): + gallery = Gallery(reddit_submission) + results = gallery.download() + assert len(results) == 4 + assert all([isinstance(result, Resource) for result in results]) From ae5ed7522614baff0cae43bea26e9ae6a847ff1b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 18:05:04 +1000 Subject: [PATCH 034/276] Re-implement --link option --- bulkredditdownloader/__main__.py | 1 + bulkredditdownloader/downloader.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 678e4fe..79abe3a 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -31,6 +31,7 @@ def _add_options(): default=False) parser.add_argument("--link", "-l", help="Get posts from link", + action='append', metavar="link") parser.add_argument("--saved", action="store_true", diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 434968e..bb1207a 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -75,6 +75,7 @@ class RedditDownloader: master_list.extend(self._get_subreddits()) master_list.extend(self._get_multireddits()) master_list.extend(self._get_user_data()) + master_list.extend(self._get_submissions_from_link()) return master_list def _determine_directories(self): @@ -112,6 +113,12 @@ class RedditDownloader: else: return [] + def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: + supplied_submissions = [] + for url in self.args.link: + supplied_submissions.append(self.reddit_instance.submission(url=url)) + return [supplied_submissions] + def _determine_sort_function(self): if self.sort_filter is RedditTypes.SortType.NEW: sort_function = praw.models.Subreddit.new From a75e94e43e549f29a6a8ec5ca2b7743f13947208 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Feb 2021 19:16:51 +1000 Subject: [PATCH 035/276] Restructure test fixtures --- bulkredditdownloader/tests/conftest.py | 11 +++++++++++ .../tests/downloaders/test_gallery.py | 6 ++---- .../tests/test_file_name_formatter.py | 5 ++--- 3 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 bulkredditdownloader/tests/conftest.py diff --git a/bulkredditdownloader/tests/conftest.py b/bulkredditdownloader/tests/conftest.py new file mode 100644 index 0000000..e1de72e --- /dev/null +++ b/bulkredditdownloader/tests/conftest.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + + +@pytest.fixture(scope='session') +def reddit_instance(): + rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') + return rd diff --git 
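
action='append' is what turns the repeatable --link/-l flag into a list for _get_submissions_from_link to iterate over. The argparse behaviour in isolation:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--link', '-l', action='append', metavar='link')

args = parser.parse_args(['-l', 'https://redd.it/abc123', '--link', 'https://redd.it/def456'])
print(args.link)  # ['https://redd.it/abc123', 'https://redd.it/def456']

One caveat: when the flag is never supplied the attribute is None rather than an empty list, so the loop over self.args.link relies on a default or guard being set elsewhere.
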
a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index f590c60..1fd41e9 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import praw import praw.models import pytest @@ -10,9 +9,8 @@ from bulkredditdownloader.site_downloaders.gallery import Gallery @pytest.fixture() -def reddit_submission() -> praw.models.Submission: - rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') - return rd.submission(id='ljyy27') +def reddit_submission(reddit_instance) -> praw.models.Submission: + return reddit_instance.submission(id='ljyy27') def test_gallery(reddit_submission: praw.models.Submission): diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 94a6245..d884cb4 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -25,9 +25,8 @@ def submission() -> Mock: @pytest.fixture() -def reddit_submission() -> praw.models.Submission: - rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test') - return rd.submission(id='lgilgt') +def reddit_submission(reddit_instance) -> praw.models.Submission: + return reddit_instance.submission(id='lgilgt') @pytest.mark.parametrize(('format_string', 'expected'), (('{SUBREDDIT}', 'randomreddit'), From e0d321c78563d69e653f3cbf137a0794f2f9d0ab Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 25 Feb 2021 20:40:08 +1000 Subject: [PATCH 036/276] Integrate new base_downloader class --- .../site_downloaders/base_downloader.py | 32 ++++--------------- .../site_downloaders/direct.py | 8 +++-- .../site_downloaders/erome.py | 10 +++--- .../site_downloaders/gallery.py | 9 +++--- .../site_downloaders/gfycat.py | 11 ++++--- .../site_downloaders/gif_delivery_network.py | 11 ++++--- .../site_downloaders/imgur.py | 12 ++++--- .../site_downloaders/redgifs.py | 10 ++++-- .../site_downloaders/self_post.py | 8 +++-- .../site_downloaders/vreddit.py | 10 ++++-- .../site_downloaders/youtube.py | 10 ++++-- .../tests/downloaders/test_base_downloader.py | 30 ----------------- .../tests/downloaders/test_gallery.py | 2 +- 13 files changed, 70 insertions(+), 93 deletions(-) delete mode 100644 bulkredditdownloader/tests/downloaders/test_base_downloader.py diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 4a0d871..a872953 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -3,42 +3,22 @@ import logging from abc import ABC, abstractmethod +from typing import Optional -import requests from praw.models import Submission -from bulkredditdownloader.errors import SiteDownloaderError +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) class BaseDownloader(ABC): - def __init__(self, post: Submission): + def __init__(self, post: Submission, typical_extension: Optional[str] = None): self.post = post - self.hashes = [] + self.typical_extension = typical_extension @abstractmethod - def download(self) -> list[Resource]: + def find_resources(self, authenticator: Optional[Authenticator] = None) -> 
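
The conftest refactor leans on two pieces of pytest machinery: fixtures defined in conftest.py are injected by parameter name anywhere in the directory tree, and scope='session' builds the praw client once per test run instead of once per test. In isolation (the names below are hypothetical):

import pytest


@pytest.fixture(scope='session')
def expensive_client():
    # constructed exactly once for the whole run, like reddit_instance above
    return {'token': 'xyz'}  # stand-in for a praw.Reddit client


@pytest.fixture()
def submission_id(expensive_client):
    # function-scoped fixtures may depend on session-scoped ones
    return expensive_client['token']


def test_submission_id(submission_id):
    assert submission_id == 'xyz'
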
list[Resource]: + """Return list of all un-downloaded Resources from submission""" raise NotImplementedError - - def _download_resource(self, resource_url: str): - headers = { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 " - "Safari/537.36 OPR/54.0.2952.64", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", - "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3", - "Accept-Encoding": "none", - "Accept-Language": "en-US,en;q=0.8", - "Connection": "keep-alive", - } - # Loop to attempt download 3 times - for i in range(3): - try: - download_content = requests.get(resource_url, headers=headers).content - except ConnectionResetError: - raise SiteDownloaderError - return Resource(self.post, resource_url, download_content) - - raise SiteDownloaderError diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index 713eacf..450d409 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -1,7 +1,11 @@ #!/usr/bin/env python3 +from typing import Optional + from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -9,5 +13,5 @@ class Direct(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return [self._download_resource(self.post.url)] + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return [Resource(self.post, self.post.url)] diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 39094f6..8675cee 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -5,10 +5,13 @@ import re import urllib.error import urllib.request from html.parser import HTMLParser +from typing import Optional from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,7 +21,7 @@ class Erome(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: images = self._get_links(self.post.url) except urllib.error.HTTPError: @@ -29,15 +32,14 @@ class Erome(BaseDownloader): image = images[0] if not re.match(r'https?://.*', image): image = "https://" + image - return [self._download_resource(image)] + return [Resource(self.post, image)] else: out = [] for i, image in enumerate(images): if not re.match(r'https?://.*', image): image = "https://" + image - - out.append(self._download_resource(image)) + out.append(Resource(self.post, image)) return out @staticmethod diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 7125674..8d53056 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -2,11 +2,14 @@ import json import logging +from typing import Optional import requests from 
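
The reworked BaseDownloader uses abc to make the contract explicit: a subclass that forgets to implement find_resources cannot even be instantiated. In miniature:

from abc import ABC, abstractmethod


class Base(ABC):
    @abstractmethod
    def find_resources(self, authenticator=None) -> list:
        raise NotImplementedError


class Concrete(Base):
    def find_resources(self, authenticator=None) -> list:
        return []


Concrete().find_resources()  # fine: returns []
# Base() would raise TypeError: abstract classes cannot be instantiated
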
praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,7 +21,7 @@ class Gallery(BaseDownloader): link = self.post.url self.raw_data = self._get_data(link) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: images = {} count = 0 for model in self.raw_data['posts']['models']: @@ -61,7 +64,5 @@ class Gallery(BaseDownloader): return data def _download_album(self, images: dict): - out = [] - for image_key in images.keys(): - out.append(self._download_resource(images[image_key]['url'])) + out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()] return out diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index af94596..cd33f46 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -3,10 +3,13 @@ import json import re import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -14,14 +17,12 @@ class Gfycat(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def download(self): - super().download() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return super().find_resources(authenticator) @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it """ if re.match(r'\.(webm|mp4|gif)$', url): return url diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index b335ed8..072048e 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -13,19 +16,17 @@ class GifDeliveryNetwork(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: media_url = self._get_link(self.post.url) except IndexError: raise NotADownloadableLinkError("Could not read the page source") - return [self._download_resource(media_url)] + return [Resource(self.post, media_url)] @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source - and return it - """ + """Extract direct link to the video from page's source and return it""" if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or 
'.gif' in url.split('/')[-1]: return url diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index d555800..2111b44 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -2,11 +2,14 @@ import json import logging +from typing import Optional import requests from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct @@ -14,19 +17,18 @@ logger = logging.getLogger(__name__) class Imgur(BaseDownloader): - imgur_image_domain = "https://i.imgur.com/" def __init__(self, post: Submission): super().__init__(post) self.raw_data = {} - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: link = self.post.url if link.endswith(".gifv"): direct_thing = Direct(self.post) - return direct_thing.download() + return direct_thing.find_resources(authenticator) self.raw_data = self._get_data(link) @@ -47,13 +49,13 @@ class Imgur(BaseDownloader): for i in range(images_length): extension = self._validate_extension(images["images"][i]["ext"]) image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - out.append(self._download_resource(image_url)) + out.append(Resource(self.post, image_url)) return out def _download_image(self, image: dict): extension = self._validate_extension(image["ext"]) image_url = self.imgur_image_domain + image["hash"] + extension - return [self._download_resource(image_url)] + return [Resource(self.post, image_url)] def _is_album(self) -> bool: return "album_images" in self.raw_data diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 7cb54fc..3e8ad8e 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -2,11 +2,14 @@ import json import urllib.request +from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -14,8 +17,8 @@ class Redgifs(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def download(self): - super().download() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return super().find_resources(authenticator) @staticmethod def _get_link(url: str) -> str: @@ -31,7 +34,8 @@ class Redgifs(GifDeliveryNetwork): url.add_header( 'User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') page_source = (urllib.request.urlopen(url).read().decode()) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index cda5c78..f01b6f1 100644 --- 
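
Gfycat and Redgifs stay as thin subclasses of GifDeliveryNetwork because find_resources looks _get_link up through self, so each site only swaps in its own extractor; a classic template-method shape. Stripped down (the URL rewrite is illustrative only; the real extractors parse page source):

class SiteHandler:
    def find_resources(self, url: str) -> list[str]:
        # shared workflow; the site-specific hook resolves through self
        return [self._get_link(url)]

    @staticmethod
    def _get_link(url: str) -> str:
        return url


class GfycatLike(SiteHandler):
    @staticmethod
    def _get_link(url: str) -> str:
        return url.rstrip('/') + '.mp4'  # illustrative rewrite only


assert GfycatLike().find_resources('https://example.com/clip') == ['https://example.com/clip.mp4']
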
a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 import logging +from typing import Optional from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -14,8 +16,10 @@ class SelfPost(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return Resource(self.post, self.post.url, bytes(self.export_to_string())) + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + out = Resource(self.post, self.post.url) + out.content = self.export_to_string() + return out def export_to_string(self) -> str: """Self posts are formatted here""" diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index 40df4b3..c92bf8a 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -5,10 +5,12 @@ import os import pathlib import subprocess import tempfile +from typing import Optional import requests from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -19,12 +21,12 @@ class VReddit(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: try: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) except subprocess.SubprocessError: - return self._download_resource(self.post.url) + return [Resource(self.post, self.post.url)] else: video_url = self.post.url audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' @@ -39,7 +41,9 @@ class VReddit(BaseDownloader): self._merge_audio(temp_dir) with open(temp_dir / 'output.mp4', 'rb') as file: content = file.read() - return Resource(self.post, self.post.url, content) + out = Resource(self.post, self.post.url) + out.content = content + return out @staticmethod def _merge_audio(working_directory: pathlib.Path): diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index 6184d26..d9da907 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -2,10 +2,12 @@ import logging import tempfile +from typing import Optional import youtube_dl from praw.models import Submission +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -16,8 +18,8 @@ class Youtube(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def download(self): - return self._download_video() + def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + return [self._download_video()] def _download_video(self) -> Resource: with tempfile.TemporaryDirectory() as temp_dir: @@ -33,4 +35,6 @@ class Youtube(BaseDownloader): with open(temp_dir / 'test.mp4', 'rb') as file: content = file.read() - return Resource(self.post, self.post.url, content) + out = Resource(self.post, 
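
v.redd.it serves video and audio as separate DASH streams, which is why the audio URL above is derived by swapping the last path segment for DASH_audio.mp4. The body of _merge_audio is not shown in these hunks; assuming the two tracks were saved to files in the working directory, a plausible ffmpeg mux step (stream copy, no re-encode) would be:

import pathlib
import subprocess


def merge_audio(working_directory: pathlib.Path):
    # hypothetical reconstruction; the input file names are assumptions,
    # only the output name 'output.mp4' is visible in the diff above
    subprocess.run(
        ['ffmpeg',
         '-i', str(working_directory / 'video.mp4'),
         '-i', str(working_directory / 'audio.mp4'),
         '-c', 'copy',
         str(working_directory / 'output.mp4')],
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
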
self.post.url) + out.content = content + return out diff --git a/bulkredditdownloader/tests/downloaders/test_base_downloader.py b/bulkredditdownloader/tests/downloaders/test_base_downloader.py deleted file mode 100644 index 3644abf..0000000 --- a/bulkredditdownloader/tests/downloaders/test_base_downloader.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -from pathlib import Path -from unittest.mock import Mock - -import pytest - -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader - - -class BlankDownloader(BaseDownloader): - def __init__(self, post): - super().__init__(post) - - def download(self) -> list[Resource]: - return [self._download_resource(self.post.url)] - - -@pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://docs.python.org/3/_static/py.png', 'a721fc7ec672275e257bbbfde49a4d4e'), -)) -def test_get_resource(test_url: str, expected_hash: str): - mock_submission = Mock - mock_submission.url = test_url - downloader = BlankDownloader(mock_submission) - result = downloader.download() - assert isinstance(result[0], Resource) - assert result[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 1fd41e9..11a0651 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -15,6 +15,6 @@ def reddit_submission(reddit_instance) -> praw.models.Submission: def test_gallery(reddit_submission: praw.models.Submission): gallery = Gallery(reddit_submission) - results = gallery.download() + results = gallery.find_resources() assert len(results) == 4 assert all([isinstance(result, Resource) for result in results]) From 28f5ea69c3359b60eb823c79c3858b0f58b84012 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 18:55:48 +1000 Subject: [PATCH 037/276] Add authenticator module --- bulkredditdownloader/authenticator.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 bulkredditdownloader/authenticator.py diff --git a/bulkredditdownloader/authenticator.py b/bulkredditdownloader/authenticator.py new file mode 100644 index 0000000..efe63e0 --- /dev/null +++ b/bulkredditdownloader/authenticator.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# coding=utf-8 + + +class Authenticator: + def __init__(self): + self.imgur_authentication = None From 228cd5f687736112d097880bd741172a46a8ba5e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 18:56:05 +1000 Subject: [PATCH 038/276] Change Resource model --- bulkredditdownloader/resource.py | 53 ++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index cf0ed90..e408de2 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -3,25 +3,52 @@ import hashlib import re +import time +from typing import Optional +import _hashlib +import requests from praw.models import Submission +from bulkredditdownloader.errors import BulkDownloaderException + class Resource: - def __init__(self, source_submission: Submission, url: str, content: bytes): + def __init__(self, source_submission: Submission, url: str, extension: str = None): self.source_submission = source_submission - self.content = content + self.content: Optional[bytes] = None self.url = url - self.hash = hashlib.md5(content) - 
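
The Youtube downloader above writes into a temporary directory and reads test.mp4 back out, but the option dict it hands to youtube_dl sits outside the visible hunk. A reconstruction using standard youtube_dl options, chosen so the output matches that file name (the option values are assumptions):

import tempfile
from pathlib import Path

import youtube_dl


def fetch_video(url: str) -> bytes:
    with tempfile.TemporaryDirectory() as raw_dir:
        temp_dir = Path(raw_dir)
        ydl_opts = {
            'outtmpl': str(temp_dir / 'test.%(ext)s'),  # yields test.mp4 for mp4 formats
            'format': 'mp4',
            'quiet': True,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        with open(temp_dir / 'test.mp4', 'rb') as file:
            return file.read()
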
self.extension = self._get_extension(url) + self.hash: Optional[_hashlib.HASH] = None + self.extension = extension + if not self.extension: + self.extension = self._determine_extension() @staticmethod - def _get_extension(url: str) -> str: - pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))') - if results := re.search(pattern, url): - if len(results.groups()) > 1: - return results[0] - if "v.redd.it" not in url: - return '.jpg' - else: - return '.mp4' + def retry_download(url: str, wait_time: int) -> Optional[bytes]: + try: + response = requests.get(url) + if response.status_code == 200: + return response.content + else: + raise requests.exceptions.ConnectionError + except requests.exceptions.ConnectionError: + time.sleep(wait_time) + if wait_time < 300: + return Resource.retry_download(url, wait_time + 60) + else: + return None + + def download(self): + if not self.content: + content = self.retry_download(self.url, 0) + if content: + self.content = content + self.hash = hashlib.md5(self.content) + else: + raise BulkDownloaderException('Could not download resource') + + def _determine_extension(self) -> str: + extension_pattern = r'.*(\..{3,5})$' + match = re.search(extension_pattern, self.url) + if match: + return match.group(1) From 3bc305c037319ab68dbd8a122435174fb9c44336 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 18:56:21 +1000 Subject: [PATCH 039/276] Implement changes in downloader --- bulkredditdownloader/downloader.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index bb1207a..6c1368f 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -13,6 +13,7 @@ import appdirs import praw import praw.models +from bulkredditdownloader.authenticator import Authenticator from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError from bulkredditdownloader.file_name_formatter import FileNameFormatter @@ -52,18 +53,18 @@ class RedditDownloader: self.time_filter = self._create_time_filter() self.sort_filter = self._create_sort_filter() self.file_name_formatter = self._create_file_name_formatter() + self.authenticator = self._create_authenticator() self._determine_directories() self._create_file_logger() self.master_hash_list = [] self._load_config() - if self.cfg_parser.has_option('DEFAULT', 'username') and self.cfg_parser.has_option('DEFAULT', 'password'): + if self.cfg_parser.has_option('DEFAULT', 'reddit_token'): + # TODO: implement OAuth2 authentication self.authenticated = True - self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'), client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), user_agent=socket.gethostname(), - username=self.cfg_parser.get('DEFAULT', 'username'), - password=self.cfg_parser.get('DEFAULT', 'password')) + ) else: self.authenticated = False self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'), @@ -185,6 +186,9 @@ class RedditDownloader: excluded_extensions = [extension for ext_type in self.args.skip for extension in formats.get(ext_type, ())] return DownloadFilter(excluded_extensions, self.args.skip_domain) + def _create_authenticator(self) -> Authenticator: + raise NotImplementedError + def download(self): for generator in self.reddit_lists: for submission in generator: @@ -199,7 +203,7 @@ class RedditDownloader: if 
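
The recursive retry_download above amounts to a linear back-off: up to six attempts, with waits of 0, 60, 120, 180, 240 and 300 seconds after each failure, roughly fifteen minutes of sleeping in the worst case. An equivalent iterative rendering (a sketch, not part of the code base) makes the schedule explicit:

import time
from typing import Optional

import requests


def retry_download_iterative(url: str) -> Optional[bytes]:
    for wait_time in range(0, 301, 60):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.content
        except requests.exceptions.ConnectionError:
            pass
        time.sleep(wait_time)  # the trailing sleep mirrors the recursive version
    return None
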
self.args.no_download: logger.info('Skipping download for submission {}'.format(submission.id)) else: - content = downloader.download() + content = downloader.find_resources(self.authenticator) for res in content: destination = self.file_name_formatter.format_path(res, self.download_directory) if destination.exists(): From 0973e1e4515fb01f1a152786aae4380774820663 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 18:57:05 +1000 Subject: [PATCH 040/276] Rename file and class --- bulkredditdownloader/downloader.py | 4 ++-- .../{authenticator.py => site_authenticator.py} | 2 +- bulkredditdownloader/site_downloaders/base_downloader.py | 4 ++-- bulkredditdownloader/site_downloaders/direct.py | 4 ++-- bulkredditdownloader/site_downloaders/erome.py | 4 ++-- bulkredditdownloader/site_downloaders/gallery.py | 4 ++-- bulkredditdownloader/site_downloaders/gfycat.py | 4 ++-- bulkredditdownloader/site_downloaders/gif_delivery_network.py | 4 ++-- bulkredditdownloader/site_downloaders/imgur.py | 4 ++-- bulkredditdownloader/site_downloaders/redgifs.py | 4 ++-- bulkredditdownloader/site_downloaders/self_post.py | 4 ++-- bulkredditdownloader/site_downloaders/vreddit.py | 4 ++-- bulkredditdownloader/site_downloaders/youtube.py | 4 ++-- 13 files changed, 25 insertions(+), 25 deletions(-) rename bulkredditdownloader/{authenticator.py => site_authenticator.py} (80%) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 6c1368f..d6a5e5f 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -13,7 +13,7 @@ import appdirs import praw import praw.models -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError from bulkredditdownloader.file_name_formatter import FileNameFormatter @@ -186,7 +186,7 @@ class RedditDownloader: excluded_extensions = [extension for ext_type in self.args.skip for extension in formats.get(ext_type, ())] return DownloadFilter(excluded_extensions, self.args.skip_domain) - def _create_authenticator(self) -> Authenticator: + def _create_authenticator(self) -> SiteAuthenticator: raise NotImplementedError def download(self): diff --git a/bulkredditdownloader/authenticator.py b/bulkredditdownloader/site_authenticator.py similarity index 80% rename from bulkredditdownloader/authenticator.py rename to bulkredditdownloader/site_authenticator.py index efe63e0..93cebf7 100644 --- a/bulkredditdownloader/authenticator.py +++ b/bulkredditdownloader/site_authenticator.py @@ -2,6 +2,6 @@ # coding=utf-8 -class Authenticator: +class SiteAuthenticator: def __init__(self): self.imgur_authentication = None diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index a872953..458f3bc 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -7,7 +7,7 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource logger = logging.getLogger(__name__) @@ -19,6 +19,6 @@ class BaseDownloader(ABC): self.typical_extension = typical_extension @abstractmethod - def 
find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: """Return list of all un-downloaded Resources from submission""" raise NotImplementedError diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bulkredditdownloader/site_downloaders/direct.py index 450d409..6ab3d22 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bulkredditdownloader/site_downloaders/direct.py @@ -4,7 +4,7 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -13,5 +13,5 @@ class Direct(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: return [Resource(self.post, self.post.url)] diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 8675cee..1220651 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -9,7 +9,7 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -21,7 +21,7 @@ class Erome(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: images = self._get_links(self.post.url) except urllib.error.HTTPError: diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 8d53056..012295f 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -7,7 +7,7 @@ from typing import Optional import requests from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -21,7 +21,7 @@ class Gallery(BaseDownloader): link = self.post.url self.raw_data = self._get_data(link) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: images = {} count = 0 for model in self.raw_data['posts']['models']: diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index cd33f46..3b6c48e 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ 
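Every one of these modules changes the same way because they all implement a single abstract interface: BaseDownloader declares find_resources, and each site class overrides it. A condensed sketch of the pattern, with str standing in for the real Resource and SiteAuthenticator types:

from abc import ABC, abstractmethod
from typing import Optional


class Downloader(ABC):
    def __init__(self, url: str):
        self.url = url

    @abstractmethod
    def find_resources(self, authenticator: Optional[object] = None) -> list[str]:
        raise NotImplementedError


class DirectLink(Downloader):
    # The simplest concrete downloader: the post URL is the resource
    def find_resources(self, authenticator: Optional[object] = None) -> list[str]:
        return [self.url]


assert DirectLink('https://i.redd.it/example.png').find_resources() == ['https://i.redd.it/example.png']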
-8,7 +8,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -17,7 +17,7 @@ class Gfycat(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: return super().find_resources(authenticator) @staticmethod diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 072048e..03c291d 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -6,7 +6,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -16,7 +16,7 @@ class GifDeliveryNetwork(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: media_url = self._get_link(self.post.url) except IndexError: diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 2111b44..80d6b3b 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -7,7 +7,7 @@ from typing import Optional import requests from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -23,7 +23,7 @@ class Imgur(BaseDownloader): super().__init__(post) self.raw_data = {} - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: link = self.post.url if link.endswith(".gifv"): diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 3e8ad8e..5517be2 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -7,7 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from 
bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -17,7 +17,7 @@ class Redgifs(GifDeliveryNetwork): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: return super().find_resources(authenticator) @staticmethod diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index f01b6f1..749a824 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -5,7 +5,7 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -16,7 +16,7 @@ class SelfPost(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: out = Resource(self.post, self.post.url) out.content = self.export_to_string() return out diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index c92bf8a..83823fc 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -10,7 +10,7 @@ from typing import Optional import requests from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -21,7 +21,7 @@ class VReddit(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: fnull = open(os.devnull, 'w') subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index d9da907..369835f 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -7,7 +7,7 @@ from typing import Optional import youtube_dl from praw.models import Submission -from bulkredditdownloader.authenticator import Authenticator +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -18,7 +18,7 @@ class Youtube(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - def find_resources(self, authenticator: Optional[Authenticator] = None) -> list[Resource]: + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: return [self._download_video()] def _download_video(self) -> Resource: From 6fd7aca9813ff6a3ad04873a2c77b113b0045787 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 
2021 19:05:19 +1000 Subject: [PATCH 041/276] Add a test for Resource --- bulkredditdownloader/tests/test_resource.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 bulkredditdownloader/tests/test_resource.py diff --git a/bulkredditdownloader/tests/test_resource.py b/bulkredditdownloader/tests/test_resource.py new file mode 100644 index 0000000..209f8cb --- /dev/null +++ b/bulkredditdownloader/tests/test_resource.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import pytest + +from bulkredditdownloader.resource import Resource + + +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('test.png', '.png'), + ('another.mp4', '.mp4'), + ('test.jpeg', '.jpeg'), + ('http://www.random.com/resource.png', '.png'), + ('https://www.resource.com/test/example.jpg', '.jpg'), + ('hard.png.mp4', '.mp4'), +)) +def test_resource_get_extension(test_url: str, expected: str): + test_resource = Resource(None, test_url) + result = test_resource._determine_extension() + assert result == expected From 0652f53b507bf1fb9219c06ad3e053714e75c4c0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 19:09:25 +1000 Subject: [PATCH 042/276] Mark online tests --- bulkredditdownloader/tests/downloaders/test_gallery.py | 1 + bulkredditdownloader/tests/test_file_name_formatter.py | 4 +++- pytest.ini | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 pytest.ini diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 11a0651..aa8a57e 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -13,6 +13,7 @@ def reddit_submission(reddit_instance) -> praw.models.Submission: return reddit_instance.submission(id='ljyy27') +@pytest.mark.online def test_gallery(reddit_submission: praw.models.Submission): gallery = Gallery(reddit_submission) results = gallery.find_resources() diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index d884cb4..abe67ae 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -42,6 +42,7 @@ def test_format_name_mock(format_string: str, expected: str, submission: Mock): assert result == expected +@pytest.mark.online @pytest.mark.parametrize(('format_string', 'expected'), (('{SUBREDDIT}', 'Mindustry'), ('{REDDITOR}', 'Gamer_player_boi'), @@ -55,6 +56,7 @@ def test_format_name_real(format_string: str, expected: str, reddit_submission: assert result == expected +@pytest.mark.online @pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), (('{SUBREDDIT}', '{POSTID}', 'test/Mindustry/lgilgt.png'), ('{SUBREDDIT}', '{TITLE}_{POSTID}', @@ -67,7 +69,7 @@ def test_format_full( format_string_file: str, expected: str, reddit_submission: praw.models.Submission): - test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png', b'') + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory) result = test_formatter.format_path(test_resource, Path('test')) assert str(result) == expected diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..ccd0a19 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + online: tests require a connection to the internet \ No newline at end of file From 
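The hard.png.mp4 case in the test above passes because the leading .* in _determine_extension's pattern is greedy: it consumes as much of the URL as possible before the capturing group, so only the final suffix survives. Checked directly against the pattern from the patch:

import re

extension_pattern = r'.*(\..{3,5})$'

for url, expected in (('test.png', '.png'), ('test.jpeg', '.jpeg'), ('hard.png.mp4', '.mp4')):
    match = re.search(extension_pattern, url)
    # The greedy .* swallows 'hard.png', leaving only '.mp4' for the group
    assert match and match.group(1) == expected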
0c6a8e46a2c2f7b997f9072fdb8ce194082af00d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 19:19:12 +1000 Subject: [PATCH 043/276] Add reddit flag for pytest --- bulkredditdownloader/tests/downloaders/test_gallery.py | 1 + bulkredditdownloader/tests/test_file_name_formatter.py | 2 ++ pytest.ini | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index aa8a57e..2b29a4b 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -14,6 +14,7 @@ def reddit_submission(reddit_instance) -> praw.models.Submission: @pytest.mark.online +@pytest.mark.reddit def test_gallery(reddit_submission: praw.models.Submission): gallery = Gallery(reddit_submission) results = gallery.find_resources() diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index abe67ae..7d18dd2 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -43,6 +43,7 @@ def test_format_name_mock(format_string: str, expected: str, submission: Mock): @pytest.mark.online +@pytest.mark.reddit @pytest.mark.parametrize(('format_string', 'expected'), (('{SUBREDDIT}', 'Mindustry'), ('{REDDITOR}', 'Gamer_player_boi'), @@ -57,6 +58,7 @@ def test_format_name_real(format_string: str, expected: str, reddit_submission: @pytest.mark.online +@pytest.mark.reddit @pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), (('{SUBREDDIT}', '{POSTID}', 'test/Mindustry/lgilgt.png'), ('{SUBREDDIT}', '{TITLE}_{POSTID}', diff --git a/pytest.ini b/pytest.ini index ccd0a19..428fbcb 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] markers = - online: tests require a connection to the internet \ No newline at end of file + online: tests require a connection to the internet + reddit: tests require a connection to Reddit \ No newline at end of file From 37a91aa4dff73c64cf10bcb82585dae38d218b6b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 19:23:45 +1000 Subject: [PATCH 044/276] Add test for Resource --- bulkredditdownloader/tests/test_resource.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bulkredditdownloader/tests/test_resource.py b/bulkredditdownloader/tests/test_resource.py index 209f8cb..3f9b976 100644 --- a/bulkredditdownloader/tests/test_resource.py +++ b/bulkredditdownloader/tests/test_resource.py @@ -18,3 +18,13 @@ def test_resource_get_extension(test_url: str, expected: str): test_resource = Resource(None, test_url) result = test_resource._determine_extension() assert result == expected + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'), +)) +def test_download_online_resource(test_url: str, expected_hash: str): + test_resource = Resource(None, test_url) + test_resource.download() + assert test_resource.hash.hexdigest() == expected_hash From a55f35c02568a4fe4ac6812bf2064da7e01e96ce Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 19:38:29 +1000 Subject: [PATCH 045/276] Replace urllib with requests --- bulkredditdownloader/site_downloaders/gfycat.py | 6 +++--- .../site_downloaders/gif_delivery_network.py | 6 +++--- bulkredditdownloader/site_downloaders/redgifs.py | 14 
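Registering the online and reddit markers in pytest.ini, as patches 042 and 043 do, stops pytest from warning about unknown marks and lets the network-dependent tests be deselected from the command line with, for example, pytest -m 'not online'. Applying them is just stacked decorators:

import pytest


@pytest.mark.online
@pytest.mark.reddit
def test_requires_reddit_connection():
    ...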
+++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index 3b6c48e..af1f45b 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -2,14 +2,14 @@ import json import re -import urllib.request from typing import Optional +import requests from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -31,7 +31,7 @@ class Gfycat(GifDeliveryNetwork): url = "https://gfycat.com/" + url.split('/')[-1] - page_source = (urllib.request.urlopen(url).read().decode()) + page_source = requests.get(url).text soup = BeautifulSoup(page_source, "html.parser") attributes = {"data-react-helmet": "true", "type": "application/ld+json"} diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 03c291d..eab4ac0 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 -import urllib.request from typing import Optional +import requests from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -34,7 +34,7 @@ class GifDeliveryNetwork(BaseDownloader): url = url[:-1] url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1] - page_source = (urllib.request.urlopen(url).read().decode()) + page_source = requests.get(url).text soup = BeautifulSoup(page_source, "html.parser") attributes = {"id": "mp4Source", "type": "video/mp4"} diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 5517be2..5953fa7 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 import json -import urllib.request from typing import Optional +import requests from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -29,15 +29,15 @@ class Redgifs(GifDeliveryNetwork): if url[-1:] == '/': url = url[:-1] - url = urllib.request.Request( - "https://redgifs.com/watch/" + url.split('/')[-1]) + url = "https://redgifs.com/watch/" + url.split('/')[-1] - url.add_header( + headers = [ 'User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64') + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64' + ] - page_source = 
(urllib.request.urlopen(url).read().decode()) + page_source = requests.get(url, headers=headers).text soup = BeautifulSoup(page_source, "html.parser") attributes = {"data-react-helmet": "true", "type": "application/ld+json"} From 4146f181c732384b6be07ec23d5c27c0548b7a32 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Feb 2021 19:45:03 +1000 Subject: [PATCH 046/276] Add tests for GifDeliveryNetwork and Gfycat --- .../site_downloaders/gif_delivery_network.py | 2 +- .../tests/downloaders/test_gfycat.py | 35 ++++++++++++++++++ .../downloaders/test_gif_delivery_network.py | 37 +++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_gfycat.py create mode 100644 bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index eab4ac0..ba1dc41 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -22,7 +22,7 @@ class GifDeliveryNetwork(BaseDownloader): except IndexError: raise NotADownloadableLinkError("Could not read the page source") - return [Resource(self.post, media_url)] + return [Resource(self.post, media_url, '.mp4')] @staticmethod def _get_link(url: str) -> str: diff --git a/bulkredditdownloader/tests/downloaders/test_gfycat.py b/bulkredditdownloader/tests/downloaders/test_gfycat.py new file mode 100644 index 0000000..cca2f4a --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_gfycat.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.gfycat import Gfycat + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_url'), ( + ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'), + ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'), +)) +def test_get_link(test_url: str, expected_url: str): + result = Gfycat._get_link(test_url) + assert result == expected_url + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://gfycat.com/definitivecaninecrayfish', '48f9bd4dbec1556d7838885612b13b39'), + ('https://gfycat.com/dazzlingsilkyiguana', '808941b48fc1e28713d36dd7ed9dc648'), +)) +def test_download_resource(test_url: str, expected_hash: str): + mock_submission = Mock + mock_submission.url = test_url + test_site = Gfycat(mock_submission) + resources = test_site.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py b/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py new file mode 100644 index 0000000..e790fa7 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + 
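The migration itself is mechanical: requests.get(url).text replaces the urlopen/read/decode chain and picks an encoding on its own. One wrinkle: requests expects headers as a mapping, so the list literal this commit leaves in redgifs.py would not be accepted at request time — patch 047 below changes it to a dict. The two styles side by side, with example.com as a stand-in URL:

import urllib.request

import requests

url = 'https://www.example.com/'
headers = {'User-Agent': 'Mozilla/5.0'}

# Old style: build a Request, attach headers, then read and decode bytes
request = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(request) as response:
    old_page_source = response.read().decode()

# New style: pass the header dict directly and let .text handle decoding
new_page_source = requests.get(url, headers=headers).text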
('https://www.gifdeliverynetwork.com/handyunsightlydesertpupfish', + 'https://thumbs2.redgifs.com/HandyUnsightlyDesertpupfish.mp4'), + ('https://www.gifdeliverynetwork.com/lamelikelyhamadryad', + 'https://thumbs2.redgifs.com/LameLikelyHamadryad.mp4') +)) +def test_get_link(test_url: str, expected: str): + result = GifDeliveryNetwork._get_link(test_url) + assert result == expected + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.gifdeliverynetwork.com/handyunsightlydesertpupfish', 'd941460dcf4e0d09dd33abaa32e2d270'), + ('https://www.gifdeliverynetwork.com/lamelikelyhamadryad', '4806fe15f4991bb73581338793488daf'), +)) +def test_download_resource(test_url: str, expected_hash: str): + mock_submission = Mock + mock_submission.url = test_url + test_site = GifDeliveryNetwork(mock_submission) + resources = test_site.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash From 06e7e81de03bb27766cbdeb36b9c91329c192541 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 27 Feb 2021 08:26:42 +1000 Subject: [PATCH 047/276] Add tests for Redgifs --- .../site_downloaders/redgifs.py | 9 ++--- .../tests/downloaders/test_redgifs.py | 37 +++++++++++++++++++ 2 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_redgifs.py diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 5953fa7..e5816b7 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -31,11 +31,10 @@ class Redgifs(GifDeliveryNetwork): url = "https://redgifs.com/watch/" + url.split('/')[-1] - headers = [ - 'User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64' - ] + headers = {'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64' + } page_source = requests.get(url, headers=headers).text diff --git a/bulkredditdownloader/tests/downloaders/test_redgifs.py b/bulkredditdownloader/tests/downloaders/test_redgifs.py new file mode 100644 index 0000000..a3fbef4 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_redgifs.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.redgifs import Redgifs + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('https://www.redgifs.com/watch/forcefulenchantedanaconda', + 'https://thumbs2.redgifs.com/ForcefulEnchantedAnaconda.mp4'), + ('https://www.redgifs.com/watch/ficklelightirishsetter', + 'https://thumbs2.redgifs.com/FickleLightIrishsetter.mp4'), +)) +def test_get_link(test_url: str, expected: str): + result = Redgifs._get_link(test_url) + assert result == expected + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.redgifs.com/watch/forcefulenchantedanaconda', '75a23fff6ddec5de3b61d53db1f265a4'), + ('https://www.redgifs.com/watch/ficklelightirishsetter', 'd0ea030883c9a3a6a2991f5aa61369e7'), +)) +def test_download_resource(test_url: str, expected_hash: str): + mock_submission = Mock + mock_submission.url = test_url + test_site = 
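These downloader tests all follow one recipe: fake a submission that only needs a url attribute, run find_resources, download, and compare an MD5 digest against a known-good value. (Note the tests bind url onto the Mock class itself — mock_submission = Mock — which works, though instantiating Mock() is the more conventional form.) The core of the check in isolation, reusing a URL and digest pair from the tests:

import hashlib
from unittest.mock import Mock

import requests


def md5_of(url: str) -> str:
    # Hash the raw response bytes, the same digest a Resource stores
    return hashlib.md5(requests.get(url).content).hexdigest()


mock_submission = Mock()
mock_submission.url = 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'
assert md5_of(mock_submission.url) == '48f9bd4dbec1556d7838885612b13b39'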
Redgifs(mock_submission) + resources = test_site.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash From d1c0a7ece22b707dacefc3563d90850a74568f6e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 27 Feb 2021 08:30:52 +1000 Subject: [PATCH 048/276] Add test for downloader Direct --- .../tests/downloaders/test_direct.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 bulkredditdownloader/tests/downloaders/test_direct.py diff --git a/bulkredditdownloader/tests/downloaders/test_direct.py b/bulkredditdownloader/tests/downloaders/test_direct.py new file mode 100644 index 0000000..32cc483 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_direct.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.direct import Direct + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4', '48f9bd4dbec1556d7838885612b13b39'), + ('https://giant.gfycat.com/DazzlingSilkyIguana.mp4', '808941b48fc1e28713d36dd7ed9dc648'), +)) +def test_download_resource(test_url: str, expected_hash: str): + mock_submission = Mock + mock_submission.url = test_url + test_site = Direct(mock_submission) + resources = test_site.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash From 2a0dd4f6ac034331c99fa21af2190aa0f175f6c4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 27 Feb 2021 09:26:02 +1000 Subject: [PATCH 049/276] Add slow mark for tests --- pytest.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 428fbcb..ca6ae59 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,5 @@ [pytest] markers = online: tests require a connection to the internet - reddit: tests require a connection to Reddit \ No newline at end of file + reddit: tests require a connection to Reddit + slow: test is slow to run \ No newline at end of file From 125b78a34822b1021bf4911e08fb2178eff2c1a6 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 27 Feb 2021 10:35:43 +1000 Subject: [PATCH 050/276] Add tests for downloader Erome --- .../site_downloaders/erome.py | 6 +-- .../tests/downloaders/test_erome.py | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_erome.py diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 1220651..9f62738 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -23,13 +23,13 @@ class Erome(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: try: - images = self._get_links(self.post.url) + images = set(self._get_links(self.post.url)) except urllib.error.HTTPError: raise NotADownloadableLinkError("Not a downloadable link") if len(images) == 1: - image = images[0] + image = images.pop() if not re.match(r'https?://.*', image): image = "https://" + image return [Resource(self.post, image)] @@ -39,7 +39,7 @@ class Erome(BaseDownloader): for i, image in enumerate(images): if not re.match(r'https?://.*', image): 
image = "https://" + image - out.append(Resource(self.post, image)) + out.append(Resource(self.post, image)) return out @staticmethod diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/downloaders/test_erome.py new file mode 100644 index 0000000..a0cba97 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_erome.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.erome import Erome + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_urls'), ( + ('https://www.erome.com/a/hzLCb2c5', + ('https://s2.erome.com/353/hzLCb2c5/8FNh4qa8.jpg', 'https://s2.erome.com/353/hzLCb2c5/8FNh4qa8_480p.mp4') + ), + ('https://www.erome.com/a/ORhX0FZz', + ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9.jpg', + 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm.jpg', + 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp.jpg', + 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/LruobtMs.jpg', + 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5.jpg', + 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z.jpg', + 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7.jpg', + 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4') + ), +)) +def test_get_link(test_url: str, expected_urls: tuple[str]): + result = Erome. _get_links(test_url) + assert set(result) == set(expected_urls) + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize(('test_url', 'expected_number_of_resources', 'expected_hashes'), ( + ('https://www.erome.com/a/hzLCb2c5', 2, + ('1b4b1703f81f2ad6a622f7319a4651c2', 'f24388a0f3443c1a27594e4af41c3e83') + ), +)) +def test_download_resource(test_url: str, expected_number_of_resources: int, expected_hashes: tuple[str]): + mock_submission = Mock + mock_submission.url = test_url + test_site = Erome(mock_submission) + resources = test_site.find_resources() + assert len(resources) == expected_number_of_resources + [res.download() for res in resources] + resource_hashes = [res.hash.hexdigest() for res in resources] + assert set(resource_hashes) == set(expected_hashes) From b699639b5c4f248d7cd9f59ba3e1d6f8d25cfa04 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 27 Feb 2021 16:22:42 +1000 Subject: [PATCH 051/276] Add tests for Imgur --- .../site_downloaders/imgur.py | 2 +- .../tests/downloaders/test_imgur.py | 75 +++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_imgur.py diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 80d6b3b..7804de9 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -32,7 +32,7 @@ class Imgur(BaseDownloader): self.raw_data = self._get_data(link) - if self._is_album: + if self._is_album(): if self.raw_data["album_images"]["count"] != 1: out = self._download_album(self.raw_data["album_images"]) else: diff --git a/bulkredditdownloader/tests/downloaders/test_imgur.py b/bulkredditdownloader/tests/downloaders/test_imgur.py new file mode 100644 index 0000000..b89bbb8 
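The one-character imgur.py fix above deserves a note: if self._is_album: tested the bound method object, which is always truthy, so the album branch ran for every link no matter what the method would have returned. The added parentheses make Python actually call it:

class Example:
    def _is_album(self) -> bool:
        return False


example = Example()

assert example._is_album        # the bound method object is truthy
assert not example._is_album()  # calling it evaluates the real answer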
--- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_imgur.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from unittest.mock import Mock + +import pytest + +from bulkredditdownloader.errors import SiteDownloaderError +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.imgur import Imgur + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_gen_dict', 'expected_image_dict'), ( + ('https://imgur.com/a/xWZsDDP', + {'num_images': '1', 'id': 'xWZsDDP', 'hash': 'xWZsDDP'}, + [{'hash': 'ypa8YfS', 'title': '', 'ext': '.png', 'animated': False}]), + ('https://imgur.com/gallery/IjJJdlC', + {'num_images': 1, 'id': 384898055, 'hash': 'IjJJdlC'}, + [{'hash': 'CbbScDt', 'description': 'watch when he gets it', 'ext': '.gif', 'animated': True, 'has_sound': False}], + ), + ('https://imgur.com/a/dcc84Gt', + {'num_images': '4', 'id': 'dcc84Gt', 'hash': 'dcc84Gt'}, + [ + {'hash': 'ylx0Kle', 'ext': '.jpg', 'title': ''}, + {'hash': 'TdYfKbK', 'ext': '.jpg', 'title': ''}, + {'hash': 'pCxGbe8', 'ext': '.jpg', 'title': ''}, + {'hash': 'TSAkikk', 'ext': '.jpg', 'title': ''}, + ]), +)) +def test_get_data(test_url: str, expected_gen_dict: dict, expected_image_dict: list[dict]): + result = Imgur._get_data(test_url) + assert all([result.get(key) == expected_gen_dict[key] for key in expected_gen_dict.keys()]) + + # Check if all the keys from the test dict are correct in at least one of the album entries + assert any([all([image.get(key) == image_dict[key] for key in image_dict.keys()]) + for image_dict in expected_image_dict for image in result['album_images']['images']]) + + +@pytest.mark.parametrize('test_extension', + ('.gif', '.png', '.jpg', '.mp4') + ) +def test_imgur_extension_validation_good(test_extension: str): + result = Imgur._validate_extension(test_extension) + assert result == test_extension + + +@pytest.mark.parametrize('test_extension', + ('.jpeg', '.avi', 'bad', '.test', '.flac') + ) +def test_imgur_extension_validation_bad(test_extension: str): + with pytest.raises(SiteDownloaderError): + Imgur._validate_extension(test_extension) + + +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( + ('https://imgur.com/a/xWZsDDP', ('f551d6e6b0fef2ce909767338612e31b',)), + ('https://imgur.com/gallery/IjJJdlC', ('7227d4312a9779b74302724a0cfa9081',)), + ('https://imgur.com/a/dcc84Gt', + ('cf1158e1de5c3c8993461383b96610cf', + '28d6b791a2daef8aa363bf5a3198535d', + '248ef8f2a6d03eeb2a80d0123dbaf9b6', + '029c475ce01b58fdf1269d8771d33913')), +)) +def test_find_resources(test_url: str, expected_hashes: list[str]): + mock_download = Mock() + mock_download.url = test_url + downloader = Imgur(mock_download) + results = downloader.find_resources() + assert all([isinstance(res, Resource) for res in results]) + [res.download() for res in results] + hashes = set([res.hash.hexdigest() for res in results]) + assert len(results) == len(expected_hashes) + assert hashes == set(expected_hashes) From be68d4eb1c01d0c324c2870065b192816b5c98ad Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Feb 2021 09:40:42 +1000 Subject: [PATCH 052/276] Add tests for SelfPost --- bulkredditdownloader/resource.py | 5 +++- .../site_downloaders/self_post.py | 7 +++--- .../tests/downloaders/test_self_post.py | 23 +++++++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_self_post.py diff --git a/bulkredditdownloader/resource.py 
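The nested any/all assertion in test_imgur.py is dense but has a simple reading: the expected dictionary must be a key/value subset of at least one entry in the album data. The same idiom on toy data:

expected = {'hash': 'ylx0Kle', 'ext': '.jpg'}
candidates = [
    {'hash': 'ylx0Kle', 'ext': '.jpg', 'animated': False},
    {'hash': 'TdYfKbK', 'ext': '.jpg', 'animated': False},
]

# True when some candidate carries every expected key with the same value
assert any(all(candidate.get(key) == value for key, value in expected.items())
           for candidate in candidates)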
b/bulkredditdownloader/resource.py index e408de2..a944b3d 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -43,10 +43,13 @@ class Resource: content = self.retry_download(self.url, 0) if content: self.content = content - self.hash = hashlib.md5(self.content) + self.create_hash() else: raise BulkDownloaderException('Could not download resource') + def create_hash(self): + self.hash = hashlib.md5(self.content) + def _determine_extension(self) -> str: extension_pattern = r'.*(\..{3,5})$' match = re.search(extension_pattern, self.url) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index 749a824..265a321 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -5,8 +5,8 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,8 +18,9 @@ class SelfPost(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: out = Resource(self.post, self.post.url) - out.content = self.export_to_string() - return out + out.content = self.export_to_string().encode('utf-8') + out.create_hash() + return [out] def export_to_string(self) -> str: """Self posts are formatted here""" diff --git a/bulkredditdownloader/tests/downloaders/test_self_post.py b/bulkredditdownloader/tests/downloaders/test_self_post.py new file mode 100644 index 0000000..d6a45f4 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_self_post.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.self_post import SelfPost + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( + ('ltmivt', '7d2c9e4e989e5cf2dca2e55a06b1c4f6'), + ('ltoaan', '221606386b614d6780c2585a59bd333f'), +)) +def test_find_resource(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): + submission = reddit_instance.submission(id=test_submission_id) + downloader = SelfPost(submission) + results = downloader.find_resources() + assert len(results) == 1 + assert isinstance(results[0], Resource) + assert results[0].hash.hexdigest() == expected_hash From caefb591dd28c1801ed9ab580e1aa82f6790585e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Feb 2021 14:52:20 +1000 Subject: [PATCH 053/276] Add tests for Youtube --- .../site_downloaders/youtube.py | 13 +++++++---- .../tests/downloaders/test_youtube.py | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 bulkredditdownloader/tests/downloaders/test_youtube.py diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index 369835f..ff770a3 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -2,13 +2,14 @@ import logging import tempfile +from pathlib import Path from typing import Optional import youtube_dl from praw.models import Submission -from bulkredditdownloader.site_authenticator import 
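The SelfPost changes fix two interface bugs at once: find_resources must return a list, and the exported markdown has to be encoded before hashing, because hashlib digests bytes rather than text:

import hashlib

text = 'self post body'
# hashlib.md5(text) raises TypeError for a str; encode to bytes first
digest = hashlib.md5(text.encode('utf-8')).hexdigest()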
SiteAuthenticator from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -23,9 +24,10 @@ class Youtube(BaseDownloader): def _download_video(self) -> Resource: with tempfile.TemporaryDirectory() as temp_dir: + download_path = Path(temp_dir).resolve() ydl_opts = { "format": "best", - "outtmpl": str(temp_dir / "test.%(ext)s"), + "outtmpl": str(download_path) + '/' + 'test.%(ext)s', "playlistend": 1, "nooverwrites": True, "quiet": True @@ -33,8 +35,11 @@ class Youtube(BaseDownloader): with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download([self.post.url]) - with open(temp_dir / 'test.mp4', 'rb') as file: + downloaded_file = list(download_path.iterdir())[0] + extension = downloaded_file.suffix + with open(downloaded_file, 'rb') as file: content = file.read() - out = Resource(self.post, self.post.url) + out = Resource(self.post, self.post.url, extension) out.content = content + out.create_hash() return out diff --git a/bulkredditdownloader/tests/downloaders/test_youtube.py b/bulkredditdownloader/tests/downloaders/test_youtube.py new file mode 100644 index 0000000..0bbe10d --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_youtube.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.youtube import Youtube + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( + ('ltnoqp', '468136300a106c67f1463a7011a6db4a'), +)) +def test_find_resources(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + downloader = Youtube(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash From 62e104653d3d5c8cdc7d1f0b280051ee4effe1be Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Feb 2021 21:54:45 +1000 Subject: [PATCH 054/276] Add ffmpeg to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 440a0a9..94652df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ appdirs bs4 +ffmpeg-python requests praw youtube-dl \ No newline at end of file From d096580da705f51517d580f408bd6b69f1202d2c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 09:51:44 +1000 Subject: [PATCH 055/276] Add some tests for DownloadFactory --- .../site_downloaders/download_factory.py | 26 +++++++++---- .../downloaders/test_download_factory.py | 39 ++++++++++++++----- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index dfe2b2d..19a8507 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -10,25 +10,37 @@ from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.erome import Erome from bulkredditdownloader.site_downloaders.gallery import Gallery from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from 
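The reworked Youtube downloader writes into a throwaway directory and then inspects whatever single file youtube-dl produced, which is how the real container extension is recovered instead of assuming .mp4. A self-contained sketch of that flow — fetch_video is an illustrative name:

import tempfile
from pathlib import Path

import youtube_dl


def fetch_video(url: str) -> tuple[bytes, str]:
    with tempfile.TemporaryDirectory() as temp_dir:
        download_path = Path(temp_dir).resolve()
        options = {
            'format': 'best',
            'outtmpl': str(download_path) + '/test.%(ext)s',
            'playlistend': 1,
            'quiet': True,
        }
        with youtube_dl.YoutubeDL(options) as ydl:
            ydl.download([url])
        # youtube-dl picks the container, so read back whatever file landed
        downloaded_file = list(download_path.iterdir())[0]
        return downloaded_file.read_bytes(), downloaded_file.suffix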
bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.site_downloaders.imgur import Imgur from bulkredditdownloader.site_downloaders.redgifs import Redgifs +from bulkredditdownloader.site_downloaders.self_post import SelfPost +from bulkredditdownloader.site_downloaders.vreddit import VReddit +from bulkredditdownloader.site_downloaders.youtube import Youtube class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: - url_beginning = r'\s*(https?://(www.)?)' - if re.match(url_beginning + r'gfycat.com.*', url): - return Gfycat - elif re.match(url_beginning + r'erome.com.*', url): + url_beginning = r'\s*(https?://(www\.)?)' + if re.match(url_beginning + r'erome\.com.*', url): return Erome + elif re.match(url_beginning + r'reddit\.com/gallery/.*', url): + return Gallery + elif re.match(url_beginning + r'gfycat\.', url): + return Gfycat + elif re.match(url_beginning + r'gifdeliverynetwork', url): + return GifDeliveryNetwork elif re.match(url_beginning + r'imgur.*', url): return Imgur elif re.match(url_beginning + r'redgifs.com', url): return Redgifs - elif re.match(url_beginning + r'[vi].redd\.it.*', url): + elif re.match(url_beginning + r'reddit\.com/r/', url): + return SelfPost + elif re.match(url_beginning + r'v\.redd\.it', url): + return VReddit + elif re.match(url_beginning + r'youtube', url): + return Youtube + elif re.match(url_beginning + r'i\.redd\.it.*', url) or re.match(url_beginning + r'.*\..{3,4}$', url): return Direct - elif re.match(url_beginning + r'reddit.com/gallery/.*', url): - return Gallery else: raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 613296a..654c468 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -1,25 +1,44 @@ #!/usr/bin/env python3 # coding=utf-8 +import praw import pytest +from bulkredditdownloader.errors import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory from bulkredditdownloader.site_downloaders.erome import Erome +from bulkredditdownloader.site_downloaders.gallery import Gallery from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork from bulkredditdownloader.site_downloaders.imgur import Imgur from bulkredditdownloader.site_downloaders.redgifs import Redgifs +from bulkredditdownloader.site_downloaders.self_post import SelfPost +from bulkredditdownloader.site_downloaders.vreddit import VReddit +from bulkredditdownloader.site_downloaders.youtube import Youtube -@pytest.mark.parametrize('test_url', ('https://gfycat.com/joyfulpitifulirishterrier', - 'https://gfycat.com/blaringaridjellyfish-jensen-ackles-supernatural')) -def test_factory_gfycat(test_url: str): - result = DownloadFactory.pull_lever(test_url) - assert result is Gfycat +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected_class'), ( + ('lu8l8g', VReddit), + ('lu29zn', SelfPost), + ('lu2ykk', Direct), # Imgur direct link + ('luh2pd', Direct), # Reddit direct link + ('lu93m7', 
Gallery), + ('luf1nu', Gfycat), +)) +def test_factory_lever_good(test_submission_id: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): + submission = reddit_instance.submission(id=test_submission_id) + result = DownloadFactory.pull_lever(submission.url) + assert result is expected_class -@pytest.mark.parametrize('test_url', ('https://www.erome.com/a/bbezvaBn', - 'https://www.erome.com/a/p14JFlnm')) -def test_factory_erome(test_url): - result = DownloadFactory.pull_lever(test_url) - assert result is Erome +@pytest.mark.parametrize('test_url', ( + 'random.com', + 'bad', +)) +def test_factory_lever_bad(test_url: str): + with pytest.raises(NotADownloadableLinkError): + DownloadFactory.pull_lever(test_url) From 62d99a9cad8db0e39752d0ee1fca6513a33a1f95 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 14:05:20 +1000 Subject: [PATCH 056/276] Add rest of tests for DownloadFactory --- .../tests/downloaders/test_download_factory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 654c468..d722542 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -26,8 +26,15 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('lu29zn', SelfPost), ('lu2ykk', Direct), # Imgur direct link ('luh2pd', Direct), # Reddit direct link + ('luo9eo', Direct), # Imgur direct link gif + ('lumulo', Direct), # Imgur direct link gif + ('lui5t3', Imgur), ('lu93m7', Gallery), ('luf1nu', Gfycat), + ('luxmgx', Erome), + ('lupb4r', Youtube), + ('lul6l7', Redgifs), + ('luu376', GifDeliveryNetwork), )) def test_factory_lever_good(test_submission_id: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): submission = reddit_instance.submission(id=test_submission_id) From ad3aeece07494a6db586998ef7d4b8cc3345116a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 14:50:31 +1000 Subject: [PATCH 057/276] Refactor Youtube downloader --- .../site_downloaders/youtube.py | 21 ++++++++++--------- .../tests/downloaders/test_youtube.py | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index ff770a3..cd2034b 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -20,19 +20,20 @@ class Youtube(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return [self._download_video()] + ytdl_options = { + "format": "best", + "playlistend": 1, + "nooverwrites": True, + } + out = self._download_video(ytdl_options) + return [out] - def _download_video(self) -> Resource: + def _download_video(self, ytdl_options: dict) -> Resource: + ytdl_options['quiet'] = True with tempfile.TemporaryDirectory() as temp_dir: download_path = Path(temp_dir).resolve() - ydl_opts = { - "format": "best", - "outtmpl": str(download_path) + '/' + 'test.%(ext)s', - "playlistend": 1, - "nooverwrites": True, - "quiet": True - } - with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' + with youtube_dl.YoutubeDL(ytdl_options) as ydl: ydl.download([self.post.url]) downloaded_file = list(download_path.iterdir())[0] diff --git 
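pull_lever is a first-match-wins dispatcher, so the ordering in patch 055 matters: the site-specific patterns run before the generic ends-with-an-extension fallback that now routes direct links. Two of its patterns exercised directly:

import re

url_beginning = r'\s*(https?://(www\.)?)'

# A gallery link is claimed by its dedicated pattern...
assert re.match(url_beginning + r'reddit\.com/gallery/.*', 'https://www.reddit.com/gallery/lu93m7')
# ...while the fallback only catches URLs ending in a short extension
assert re.match(url_beginning + r'.*\..{3,4}$', 'https://i.redd.it/example.png')
assert not re.match(url_beginning + r'.*\..{3,4}$', 'https://gfycat.com/definitivecaninecrayfish')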
a/bulkredditdownloader/tests/downloaders/test_youtube.py b/bulkredditdownloader/tests/downloaders/test_youtube.py index 0bbe10d..144de18 100644 --- a/bulkredditdownloader/tests/downloaders/test_youtube.py +++ b/bulkredditdownloader/tests/downloaders/test_youtube.py @@ -10,6 +10,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.reddit +@pytest.mark.long @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( ('ltnoqp', '468136300a106c67f1463a7011a6db4a'), )) From 2dce3108a1395930d37aeaf56b2edf832169205c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 14:51:18 +1000 Subject: [PATCH 058/276] Base Vreddit class on Youtube downloader --- .../site_downloaders/vreddit.py | 45 +++---------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bulkredditdownloader/site_downloaders/vreddit.py index 83823fc..4ace7ac 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bulkredditdownloader/site_downloaders/vreddit.py @@ -1,56 +1,21 @@ #!/usr/bin/env python3 import logging -import os -import pathlib -import subprocess -import tempfile from typing import Optional -import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bulkredditdownloader.site_authenticator import SiteAuthenticator +from bulkredditdownloader.site_downloaders.youtube import Youtube logger = logging.getLogger(__name__) -class VReddit(BaseDownloader): +class VReddit(Youtube): def __init__(self, post: Submission): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - try: - fnull = open(os.devnull, 'w') - subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT) - except subprocess.SubprocessError: - return [Resource(self.post, self.post.url)] - else: - video_url = self.post.url - audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4' - - with tempfile.TemporaryDirectory() as temp_dir: - video = requests.get(video_url).content - audio = requests.get(audio_url).content - with open(temp_dir / 'video', 'wb')as file: - file.write(video) - with open(temp_dir / 'audio', 'wb') as file: - file.write(audio) - self._merge_audio(temp_dir) - with open(temp_dir / 'output.mp4', 'rb') as file: - content = file.read() - out = Resource(self.post, self.post.url) - out.content = content - return out - - @staticmethod - def _merge_audio(working_directory: pathlib.Path): - input_video = working_directory / 'video' - input_audio = working_directory / 'audio' - - fnull = open(os.devnull, 'w') - cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format( - input_audio, input_video, str(working_directory / 'output.mp4')) - subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT) + out = super()._download_video({}) + return [out] From 7b33ec07fd0d7d6678622d4070aca15bf934aec1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 14:53:07 +1000 Subject: [PATCH 059/276] Add test for Vreddit --- .../tests/downloaders/test_vreddit.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 bulkredditdownloader/tests/downloaders/test_vreddit.py diff --git a/bulkredditdownloader/tests/downloaders/test_vreddit.py b/bulkredditdownloader/tests/downloaders/test_vreddit.py 
new file mode 100644 index 0000000..bf96d67 --- /dev/null +++ b/bulkredditdownloader/tests/downloaders/test_vreddit.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + +from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_downloaders.vreddit import VReddit + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( + ('lu8l8g', '93a15642d2f364ae39f00c6d1be354ff'), +)) +def test_find_resources(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + downloader = VReddit(test_submission) + resources = downloader.find_resources() + assert len(resources) == 1 + assert isinstance(resources[0], Resource) + resources[0].download() + assert resources[0].hash.hexdigest() == expected_hash From c01fc39671464e8ae422a9768134601e5f691b14 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 1 Mar 2021 16:41:31 +1000 Subject: [PATCH 060/276] Update arguments --- bulkredditdownloader/__main__.py | 54 +++++--------------------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 79abe3a..ba17e79 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -1,10 +1,5 @@ #!/usr/bin/env python3 -""" -This program downloads imgur, gfycat and direct image and video links of -saved posts from a reddit account. It is written in Python 3. -""" - import argparse import logging import sys @@ -22,34 +17,18 @@ def _add_options(): help="Specifies the directory where posts will be downloaded to", metavar="DIRECTORY") parser.add_argument("--verbose", "-v", - help="Verbose Mode", action="store_true", - default=False) - parser.add_argument("--quit", "-q", - help="Auto quit afer the process finishes", - action="store_true", - default=False) + count=True) parser.add_argument("--link", "-l", help="Get posts from link", action='append', metavar="link") - parser.add_argument("--saved", - action="store_true", - required="--unsave" in sys.argv, - help="Triggers saved mode") - parser.add_argument("--unsave", - action="store_true", - help="Unsaves downloaded posts") parser.add_argument("--submitted", action="store_true", help="Gets posts of --user") parser.add_argument("--upvoted", action="store_true", help="Gets upvoted posts of --user") - parser.add_argument("--log", - help="Takes a log file which created by itself (json files),reads posts and tries " - "downloading them again.", - metavar="LOG FILE") parser.add_argument("--subreddit", nargs="+", help="Triggers subreddit mode and takes subreddit's name without r/. use \"frontpage\" " @@ -59,20 +38,24 @@ def _add_options(): parser.add_argument("--multireddit", help="Triggers multireddit mode and takes multireddit's name without m", metavar="MULTIREDDIT", + action='append', type=str) parser.add_argument("--user", help="reddit username if needed. 
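One flag in this hunk warrants a caution: count=True is not an argparse keyword, so the --verbose definition as committed would raise a TypeError when the parser is built. The stock way to express a repeatable verbosity switch — and the one that matches the _setup_logging(verbosity: int) signature used further down — is the count action:

import argparse

parser = argparse.ArgumentParser()
# Each occurrence increments the value: -v parses to 1, -vv to 2;
# default=0 keeps the attribute an int when the flag is absent
parser.add_argument('--verbose', '-v', action='count', default=0)

assert parser.parse_args([]).verbose == 0
assert parser.parse_args(['-vv']).verbose == 2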
use \"me\" for current user", required="--multireddit" in sys.argv or "--submitted" in sys.argv, metavar="redditor", + default=None, type=str) parser.add_argument("--search", help="Searches for given query in given subreddits", metavar="query", + default=None, type=str) parser.add_argument("--sort", help="Either hot, top, new, controversial, rising or relevance default: hot", choices=["hot", "top", "new", "controversial", "rising", "relevance"], metavar="SORT TYPE", + default='hot', type=str) parser.add_argument("--limit", help="default: unlimited", @@ -83,53 +66,32 @@ def _add_options(): help="Either hour, day, week, month, year or all. default: all", choices=["all", "hour", "day", "week", "month", "year"], metavar="TIME_LIMIT", + default='all', type=str) parser.add_argument("--skip", nargs="+", help="Skip posts with given type", type=str, - choices=["images", "videos", "gifs", "self"], default=[]) parser.add_argument("--skip-domain", nargs="+", help="Skip posts with given domain", type=str, default=[]) - parser.add_argument("--set-folderpath", + parser.add_argument("--set-folder-scheme", action="store_true", help="Set custom folderpath", default='{SUBREDDIT}' ) - parser.add_argument("--set-filename", + parser.add_argument("--set-file-scheme", action="store_true", help="Set custom filename", default='{REDDITOR}_{TITLE}_{POSTID}' ) - parser.add_argument("--set-default-directory", - action="store_true", - help="Set a default directory to be used in case no directory is given", - ) - parser.add_argument("--set-default-options", - action="store_true", - help="Set default options to use everytime program runs", - ) - parser.add_argument("--use-local-config", - action="store_true", - help="Creates a config file in the program's directory" - " and uses it. 
Useful for having multiple configs", - ) parser.add_argument("--no-dupes", action="store_true", help="Do not download duplicate posts on different subreddits", ) - parser.add_argument("--downloaded-posts", - help="Use a hash file to keep track of downloaded files", - type=str - ) - parser.add_argument("--no-download", - action="store_true", - help="Just saved posts into a the POSTS.json file without downloading" - ) def _setup_logging(verbosity: int): From ea42471932fdf6938df7475081fc3135063f261d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 2 Mar 2021 14:06:21 +1000 Subject: [PATCH 061/276] Add function to validate formatting strings --- bulkredditdownloader/file_name_formatter.py | 13 +++++++++++++ .../tests/test_file_name_formatter.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 461947e..6f8fbd6 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -6,12 +6,19 @@ from pathlib import Path import praw.models +from bulkredditdownloader.errors import BulkDownloaderException from bulkredditdownloader.resource import Resource class FileNameFormatter: + key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date') + def __init__(self, file_format_string: str, directory_format_string: str): + if not self.validate_string(file_format_string): + raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string') self.file_format_string = file_format_string + if not self.validate_string(directory_format_string): + raise BulkDownloaderException(f'"{directory_format_string}" is not a valid format string') self.directory_format_string = directory_format_string @staticmethod @@ -38,3 +45,9 @@ class FileNameFormatter: file_path = subfolder / (str(self._format_name(resource.source_submission, self.file_format_string)) + resource.extension) return file_path + + @staticmethod + def validate_string(test_string: str) -> bool: + if not test_string: + return False + return any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms]) diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 7d18dd2..eb679d3 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -42,6 +42,20 @@ def test_format_name_mock(format_string: str, expected: str, submission: Mock): assert result == expected +@pytest.mark.parametrize(('test_string', 'expected'), ( + ('', False), + ('test', False), + ('{POSTID}', True), + ('POSTID', False), + ('{POSTID}_test', True), + ('test_{TITLE}', True), + ('TITLE_POSTID', False), +)) +def test_check_format_string_validity(test_string: str, expected: bool): + result = FileNameFormatter.validate_string(test_string) + assert result == expected + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('format_string', 'expected'), From 9e6ec9f1ca969dd6fa7f6b44e40f0e3e281784a4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 3 Mar 2021 12:53:53 +1000 Subject: [PATCH 062/276] Add some tests for RedditDownloader --- bulkredditdownloader/downloader.py | 27 +- bulkredditdownloader/tests/test_downloader.py | 267 ++++++++++++++++++ 2 files changed, 279 insertions(+), 15 deletions(-) create mode 100644 bulkredditdownloader/tests/test_downloader.py diff --git a/bulkredditdownloader/downloader.py 
b/bulkredditdownloader/downloader.py index d6a5e5f..56be776 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -13,10 +13,10 @@ import appdirs import praw import praw.models -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError from bulkredditdownloader.file_name_formatter import FileNameFormatter +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory logger = logging.getLogger(__name__) @@ -107,7 +107,11 @@ class RedditDownloader: if self.args.subreddit: subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in self.args.subreddit] if self.args.search: - return [reddit.search(self.args.search, sort=self.sort_filter.name.lower()) for reddit in subreddits] + return [ + reddit.search( + self.args.search, + sort=self.sort_filter.name.lower(), + limit=self.args.limit) for reddit in subreddits] else: sort_function = self._determine_sort_function() return [sort_function(reddit, limit=self.args.limit) for reddit in subreddits] @@ -116,8 +120,8 @@ class RedditDownloader: def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] - for url in self.args.link: - supplied_submissions.append(self.reddit_instance.submission(url=url)) + for sub_id in self.args.link: + supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) return [supplied_submissions] def _determine_sort_function(self): @@ -162,29 +166,22 @@ class RedditDownloader: return [] def _create_file_name_formatter(self) -> FileNameFormatter: - return FileNameFormatter(self.args.set_filename, self.args.set_folderpath) + return FileNameFormatter(self.args.set_file_scheme, self.args.set_folder_scheme) def _create_time_filter(self) -> RedditTypes.TimeType: try: - return RedditTypes.TimeType[self.args.sort.upper()] + return RedditTypes.TimeType[self.args.time.upper()] except (KeyError, AttributeError): return RedditTypes.TimeType.ALL def _create_sort_filter(self) -> RedditTypes.SortType: try: - return RedditTypes.SortType[self.args.time.upper()] + return RedditTypes.SortType[self.args.sort.upper()] except (KeyError, AttributeError): return RedditTypes.SortType.HOT def _create_download_filter(self) -> DownloadFilter: - formats = { - "videos": [".mp4", ".webm"], - "images": [".jpg", ".jpeg", ".png", ".bmp"], - "gifs": [".gif"], - "self": [] - } - excluded_extensions = [extension for ext_type in self.args.skip for extension in formats.get(ext_type, ())] - return DownloadFilter(excluded_extensions, self.args.skip_domain) + return DownloadFilter(self.args.skip, self.args.skip_domain) def _create_authenticator(self) -> SiteAuthenticator: raise NotImplementedError diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py new file mode 100644 index 0000000..c981ef7 --- /dev/null +++ b/bulkredditdownloader/tests/test_downloader.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import argparse +from pathlib import Path +from unittest.mock import MagicMock + +import praw +import praw.models +import pytest + +from bulkredditdownloader.download_filter import DownloadFilter +from bulkredditdownloader.downloader import RedditDownloader, RedditTypes +from bulkredditdownloader.errors import 
BulkDownloaderException +from bulkredditdownloader.file_name_formatter import FileNameFormatter +from bulkredditdownloader.site_authenticator import SiteAuthenticator + + +@pytest.fixture() +def args() -> argparse.Namespace: + args = argparse.Namespace() + + args.directory = '.' + args.verbose = 0 + args.link = [] + args.submitted = False + args.upvoted = False + args.subreddit = [] + args.multireddit = [] + args.user = None + args.search = None + args.sort = 'hot' + args.limit = None + args.time = 'all' + args.skip = [] + args.skip_domain = [] + args.set_folder_scheme = '{SUBREDDIT}' + args.set_file_scheme = '{REDDITOR}_{TITLE}_{POSTID}' + args.no_dupes = False + + return args + + +@pytest.fixture() +def downloader_mock(args: argparse.Namespace): + mock_downloader = MagicMock() + mock_downloader.args = args + return mock_downloader + + +def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): + downloader_mock.args.directory = tmp_path / 'test' + RedditDownloader._determine_directories(downloader_mock) + + assert Path(tmp_path / 'test').exists() + assert downloader_mock.logfile_directory == Path(tmp_path / 'test' / 'LOG_FILES') + assert downloader_mock.logfile_directory.exists() + + +@pytest.mark.parametrize(('skip_extensions', 'skip_domains'), ( + ([], []), + (['.test'], ['test.com']), +)) +def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock): + downloader_mock.args.skip = skip_extensions + downloader_mock.args.skip_domain = skip_domains + result = RedditDownloader._create_download_filter(downloader_mock) + + assert isinstance(result, DownloadFilter) + assert result.excluded_domains == skip_domains + assert result.excluded_extensions == skip_extensions + + +@pytest.mark.parametrize(('test_time', 'expected'), ( + ('all', 'all'), + ('hour', 'hour'), + ('day', 'day'), + ('week', 'week'), + ('random', 'all'), + ('', 'all'), +)) +def test_create_time_filter(test_time: str, expected: str, downloader_mock: MagicMock): + downloader_mock.args.time = test_time + result = RedditDownloader._create_time_filter(downloader_mock) + + assert isinstance(result, RedditTypes.TimeType) + assert result.name.lower() == expected + + +@pytest.mark.parametrize(('test_sort', 'expected'), ( + ('', 'hot'), + ('hot', 'hot'), + ('controversial', 'controversial'), + ('new', 'new'), +)) +def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: MagicMock): + downloader_mock.args.sort = test_sort + result = RedditDownloader._create_sort_filter(downloader_mock) + + assert isinstance(result, RedditTypes.SortType) + assert result.name.lower() == expected + + +@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( + ('{POSTID}', '{SUBREDDIT}'), + ('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'), +)) +def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): + downloader_mock.args.set_file_scheme = test_file_scheme + downloader_mock.args.set_folder_scheme = test_folder_scheme + result = RedditDownloader._create_file_name_formatter(downloader_mock) + + assert isinstance(result, FileNameFormatter) + assert result.file_format_string == test_file_scheme + assert result.directory_format_string == test_folder_scheme + + +@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( + ('', ''), + ('{POSTID}', ''), + ('', '{SUBREDDIT}'), + ('test', '{SUBREDDIT}'), + ('{POSTID}', 'test'), +)) +def test_create_file_name_formatter_bad(test_file_scheme: str, 
test_folder_scheme: str, downloader_mock: MagicMock): + downloader_mock.args.set_file_scheme = test_file_scheme + downloader_mock.args.set_folder_scheme = test_folder_scheme + with pytest.raises(BulkDownloaderException): + RedditDownloader._create_file_name_formatter(downloader_mock) + + +@pytest.mark.skip +def test_create_authenticator(downloader_mock: MagicMock): + result = RedditDownloader._create_authenticator(downloader_mock) + assert isinstance(result, SiteAuthenticator) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_submission_ids', ( + ('lvpf4l',), + ('lvpf4l', 'lvqnsn'), + ('lvpf4l', 'lvqnsn', 'lvl9kd'), +)) +def test_get_submissions_from_link( + test_submission_ids: list[str], + reddit_instance: praw.Reddit, + downloader_mock: MagicMock): + downloader_mock.args.link = test_submission_ids + downloader_mock.reddit_instance = reddit_instance + results = RedditDownloader._get_submissions_from_link(downloader_mock) + assert all([isinstance(sub, praw.models.Submission) for res in results for sub in res]) + assert len(results[0]) == len(test_submission_ids) + + +@pytest.mark.skip +def test_load_config(downloader_mock: MagicMock): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddits', 'limit'), ( + (('Futurology',), 10), + (('Futurology',), 20), + (('Futurology', 'Python'), 10), + (('Futurology',), 100), + (('Futurology',), 0), +)) +def test_get_subreddit_normal( + test_subreddits: list[str], + limit: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.args.subreddit = test_subreddits + downloader_mock.args.limit = limit + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + results = RedditDownloader._get_subreddits(downloader_mock) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + assert all([res.subreddit.display_name for res in results]) + if limit is not None: + assert len(results) == (limit * len(test_subreddits)) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit'), ( + (('Python',), 'scraper', 10), + (('Python',), '', 10), +)) +def test_get_subreddit_search( + test_subreddits: list[str], + search_term: str, + limit: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.args.subreddit = test_subreddits + downloader_mock.args.limit = limit + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = RedditTypes.SortType.HOT + downloader_mock.args.search = search_term + results = RedditDownloader._get_subreddits(downloader_mock) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + assert all([res.subreddit.display_name for res in results]) + if limit is not None: + assert len(results) == (limit * len(test_subreddits)) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_subreddits_search_bad(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_multireddits(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_user_submissions(): + 
raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_user_upvoted(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_user_saved(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_download_submission(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_download_submission_file_exists(): + raise NotImplementedError + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_download_submission_hash_exists(): + raise NotImplementedError From ac08a639bae77bbaac307d689090e26b60be6387 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 4 Mar 2021 09:14:43 +1000 Subject: [PATCH 063/276] Tighten exception block --- bulkredditdownloader/downloader.py | 45 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 56be776..c9149a8 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -194,29 +194,32 @@ class RedditDownloader: def _download_submission(self, submission: praw.models.Submission): if self.download_filter.check_url(submission.url): logger.debug('Attempting to download submission {}'.format(submission.id)) + try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) - if self.args.no_download: - logger.info('Skipping download for submission {}'.format(submission.id)) - else: - content = downloader.find_resources(self.authenticator) - for res in content: - destination = self.file_name_formatter.format_path(res, self.download_directory) - if destination.exists(): - logger.debug('File already exists: {}'.format(destination)) - else: - if res.hash.hexdigest() not in self.master_hash_list and not self.args.no_dupes: - # TODO: consider making a hard link/symlink here - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug('Written file to {}'.format(destination)) - self.master_hash_list.append(res.hash.hexdigest()) - logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) - else: - logger.debug(f'Resource from {res.url} downloaded elsewhere') - - logger.info('Downloaded submission {}'.format(submission.name)) except NotADownloadableLinkError as e: logger.error('Could not download submission {}: {}'.format(submission.name, e)) + return + + if self.args.no_download: + logger.info('Skipping download for submission {}'.format(submission.id)) + else: + content = downloader.find_resources(self.authenticator) + for res in content: + destination = self.file_name_formatter.format_path(res, self.download_directory) + if destination.exists(): + logger.debug('File already exists: {}'.format(destination)) + else: + if res.hash.hexdigest() not in self.master_hash_list and not self.args.no_dupes: + # TODO: consider making a hard link/symlink here + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug('Written file to {}'.format(destination)) + self.master_hash_list.append(res.hash.hexdigest()) + logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest())) + else: + logger.debug(f'Resource from {res.url} downloaded elsewhere') + + logger.info('Downloaded submission 
{}'.format(submission.name)) From 5e914b52346f4e0338d33248b19f1fb5c2d40d1a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Mar 2021 13:29:57 +1000 Subject: [PATCH 064/276] Re-add missing argument --- bulkredditdownloader/__main__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index ba17e79..754d765 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -26,6 +26,9 @@ def _add_options(): parser.add_argument("--submitted", action="store_true", help="Gets posts of --user") + parser.add_argument("--saved", + action="store_true", + help="Gets upvoted posts of --user") parser.add_argument("--upvoted", action="store_true", help="Gets upvoted posts of --user") From 6f86dbd552ea62fa0747329989c515bd022cdd07 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Mar 2021 13:30:17 +1000 Subject: [PATCH 065/276] Add error --- bulkredditdownloader/errors.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/errors.py b/bulkredditdownloader/errors.py index c677b38..703ffaa 100644 --- a/bulkredditdownloader/errors.py +++ b/bulkredditdownloader/errors.py @@ -4,7 +4,11 @@ class BulkDownloaderException(Exception): pass -class RedditAuthenticationError(BulkDownloaderException): +class RedditUserError(BulkDownloaderException): + pass + + +class RedditAuthenticationError(RedditUserError): pass From b705c3163020dfcd7c7a2f929a1bc66616e573fe Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Mar 2021 13:31:40 +1000 Subject: [PATCH 066/276] Add some more tests for RedditDownloader --- bulkredditdownloader/downloader.py | 66 +++++--- bulkredditdownloader/tests/test_downloader.py | 143 ++++++++++++++---- 2 files changed, 158 insertions(+), 51 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index c9149a8..e083974 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -8,13 +8,15 @@ import socket from datetime import datetime from enum import Enum, auto from pathlib import Path +from typing import Iterator import appdirs import praw import praw.models +import prawcore +import bulkredditdownloader.errors as errors from bulkredditdownloader.download_filter import DownloadFilter -from bulkredditdownloader.errors import NotADownloadableLinkError, RedditAuthenticationError from bulkredditdownloader.file_name_formatter import FileNameFormatter from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory @@ -54,6 +56,7 @@ class RedditDownloader: self.sort_filter = self._create_sort_filter() self.file_name_formatter = self._create_file_name_formatter() self.authenticator = self._create_authenticator() + self._resolve_user_name() self._determine_directories() self._create_file_logger() self.master_hash_list = [] @@ -118,6 +121,10 @@ class RedditDownloader: else: return [] + def _resolve_user_name(self): + if self.args.user == 'me': + self.args.user = self.reddit_instance.user.me() + def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] for sub_id in self.args.link: @@ -135,35 +142,52 @@ class RedditDownloader: sort_function = praw.models.Subreddit.hot return sort_function - def _get_multireddits(self) -> list[praw.models.ListingGenerator]: + def _get_multireddits(self) -> list[Iterator]: if self.args.multireddit: if self.authenticated: - return 
[self.reddit_instance.multireddit(m_reddit_choice) for m_reddit_choice in self.args.multireddit] + if self.args.user: + sort_function = self._determine_sort_function() + return [ + sort_function(self.reddit_instance.multireddit( + self.args.user, + m_reddit_choice), limit=self.args.limit) for m_reddit_choice in self.args.multireddit] + else: + raise errors.BulkDownloaderException('A user must be provided to download a multireddit') else: - raise RedditAuthenticationError('Accessing multireddits requires authentication') + raise errors.RedditAuthenticationError('Accessing multireddits requires authentication') else: return [] - def _get_user_data(self) -> list[praw.models.ListingGenerator]: - if any((self.args.upvoted, self.args.submitted, self.args.saved)): - if self.authenticated: - generators = [] - sort_function = self._determine_sort_function() + def _get_user_data(self) -> list[Iterator]: + if self.args.user: + if not self._check_user_existence(self.args.user): + raise errors.RedditUserError(f'User {self.args.user} does not exist') + generators = [] + sort_function = self._determine_sort_function() + if self.args.submitted: + generators.append( + sort_function( + self.reddit_instance.redditor(self.args.user).submissions, + limit=self.args.limit)) + if not self.authenticated and any((self.args.upvoted, self.args.saved)): + raise errors.RedditAuthenticationError('Accessing user lists requires authentication') + else: if self.args.upvoted: generators.append(self.reddit_instance.redditor(self.args.user).upvoted) - if self.args.submitted: - generators.append( - sort_function( - self.reddit_instance.redditor(self.args.user).submissions, - limit=self.args.limit)) if self.args.saved: generators.append(self.reddit_instance.redditor(self.args.user).saved) - - return generators - else: - raise RedditAuthenticationError('Accessing user lists requires authentication') + return generators else: - return [] + raise errors.BulkDownloaderException('A user must be supplied to download user data') + + def _check_user_existence(self, name: str) -> bool: + user = self.reddit_instance.redditor(name=name) + try: + if not user.id: + return False + except prawcore.exceptions.NotFound: + return False + return True def _create_file_name_formatter(self) -> FileNameFormatter: return FileNameFormatter(self.args.set_file_scheme, self.args.set_folder_scheme) @@ -198,10 +222,10 @@ class RedditDownloader: try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) - except NotADownloadableLinkError as e: + except errors.NotADownloadableLinkError as e: logger.error('Could not download submission {}: {}'.format(submission.name, e)) return - + if self.args.no_download: logger.info('Skipping download for submission {}'.format(submission.id)) else: diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index c981ef7..1aa58c6 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -3,6 +3,7 @@ import argparse from pathlib import Path +from typing import Iterator from unittest.mock import MagicMock import praw @@ -11,7 +12,7 @@ import pytest from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.downloader import RedditDownloader, RedditTypes -from bulkredditdownloader.errors import BulkDownloaderException +from bulkredditdownloader.errors import BulkDownloaderException, RedditAuthenticationError, RedditUserError from 
bulkredditdownloader.file_name_formatter import FileNameFormatter from bulkredditdownloader.site_authenticator import SiteAuthenticator @@ -25,6 +26,7 @@ def args() -> argparse.Namespace: args.link = [] args.submitted = False args.upvoted = False + args.saved = False args.subreddit = [] args.multireddit = [] args.user = None @@ -48,6 +50,14 @@ def downloader_mock(args: argparse.Namespace): return mock_downloader +def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]): + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + if result_limit is not None: + assert len(results) == result_limit + return results + + def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): downloader_mock.args.directory = tmp_path / 'test' RedditDownloader._determine_directories(downloader_mock) @@ -172,17 +182,15 @@ def test_get_subreddit_normal( limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.reddit_instance = reddit_instance - downloader_mock.args.subreddit = test_subreddits - downloader_mock.args.limit = limit downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.args.limit = limit + downloader_mock.args.subreddit = test_subreddits + downloader_mock.reddit_instance = reddit_instance downloader_mock.sort_filter = RedditTypes.SortType.HOT results = RedditDownloader._get_subreddits(downloader_mock) - results = [sub for res in results for sub in res] - assert all([isinstance(res, praw.models.Submission) for res in results]) - assert all([res.subreddit.display_name for res in results]) - if limit is not None: - assert len(results) == (limit * len(test_subreddits)) + results = assert_all_results_are_submissions( + (limit * len(test_subreddits)) if limit else None, results) + assert all([res.subreddit.display_name in test_subreddits for res in results]) @pytest.mark.online @@ -190,6 +198,7 @@ def test_get_subreddit_normal( @pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit'), ( (('Python',), 'scraper', 10), (('Python',), '', 10), + (('Python',), 'djsdsgewef', 0), )) def test_get_subreddit_search( test_subreddits: list[str], @@ -197,39 +206,99 @@ def test_get_subreddit_search( limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.reddit_instance = reddit_instance + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.args.limit = limit + downloader_mock.args.search = search_term downloader_mock.args.subreddit = test_subreddits + downloader_mock.reddit_instance = reddit_instance + downloader_mock.sort_filter = RedditTypes.SortType.HOT + results = RedditDownloader._get_subreddits(downloader_mock) + results = assert_all_results_are_submissions( + (limit * len(test_subreddits)) if limit else None, results) + assert all([res.subreddit.display_name in test_subreddits for res in results]) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_user', 'test_multireddits', 'limit'), ( + ('helen_darten', ('cuteanimalpics',), 10), + ('korfor', ('chess',), 100), +)) +# Good sources at https://www.reddit.com/r/multihub/ +def test_get_multireddits_public( + test_user: str, + test_multireddits: list[str], + limit: int, + reddit_instance: praw.Reddit, + downloader_mock: MagicMock): + downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock.sort_filter = 
RedditTypes.SortType.HOT + downloader_mock.args.limit = limit + downloader_mock.args.multireddit = test_multireddits + downloader_mock.args.user = test_user + downloader_mock.reddit_instance = reddit_instance + results = RedditDownloader._get_multireddits(downloader_mock) + assert_all_results_are_submissions((limit * len(test_multireddits)) if limit else None, results) + + +@pytest.mark.online +@pytest.mark.reddit +def test_get_multireddits_no_user(downloader_mock: MagicMock, reddit_instance: praw.Reddit): + downloader_mock.args.multireddit = ['test'] + with pytest.raises(BulkDownloaderException): + RedditDownloader._get_multireddits(downloader_mock) + + +@pytest.mark.online +@pytest.mark.reddit +def test_get_multireddits_not_authenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit): + downloader_mock.args.multireddit = ['test'] + downloader_mock.authenticated = False + downloader_mock.reddit_instance = reddit_instance + with pytest.raises(RedditAuthenticationError): + RedditDownloader._get_multireddits(downloader_mock) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_user', 'limit'), ( + ('danigirl3694', 10), + ('danigirl3694', 50), + ('CapitanHam', None), +)) +def test_get_user_submissions(test_user: str, limit: int, downloader_mock: MagicMock, reddit_instance: praw.Reddit): downloader_mock.args.limit = limit downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT - downloader_mock.args.search = search_term - results = RedditDownloader._get_subreddits(downloader_mock) - results = [sub for res in results for sub in res] - assert all([isinstance(res, praw.models.Submission) for res in results]) - assert all([res.subreddit.display_name for res in results]) - if limit is not None: - assert len(results) == (limit * len(test_subreddits)) + downloader_mock.args.submitted = True + downloader_mock.args.user = test_user + downloader_mock.authenticated = False + downloader_mock.reddit_instance = reddit_instance + results = RedditDownloader._get_user_data(downloader_mock) + results = assert_all_results_are_submissions(limit, results) + assert all([res.author.name == test_user for res in results]) @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skip -def test_get_subreddits_search_bad(): - raise NotImplementedError +def test_get_user_no_user(downloader_mock: MagicMock): + with pytest.raises(BulkDownloaderException): + RedditDownloader._get_user_data(downloader_mock) @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skip -def test_get_multireddits(): - raise NotImplementedError - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.skip -def test_get_user_submissions(): - raise NotImplementedError +@pytest.mark.parametrize('test_user', ( + 'rockcanopicjartheme', + 'exceptionalcatfishracecarbatter', +)) +def test_get_user_nonexistent_user(test_user: str, downloader_mock: MagicMock, reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.args.user = test_user + downloader_mock._check_user_existence.return_value = RedditDownloader._check_user_existence( + downloader_mock, test_user) + with pytest.raises(RedditUserError): + RedditDownloader._get_user_data(downloader_mock) @pytest.mark.online @@ -239,6 +308,13 @@ def test_get_user_upvoted(): raise NotImplementedError +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_user_upvoted_unauthenticated(): + raise NotImplementedError + + @pytest.mark.online 
@pytest.mark.reddit @pytest.mark.skip @@ -246,6 +322,13 @@ def test_get_user_saved(): raise NotImplementedError +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skip +def test_get_user_saved_unauthenticated(): + raise NotImplementedError + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skip From aeb9afdc665b2c7044de7083f056fefbf16f2a4e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 5 Mar 2021 13:32:24 +1000 Subject: [PATCH 067/276] Rename file with custom exceptions --- bulkredditdownloader/__main__.py | 2 +- bulkredditdownloader/downloader.py | 2 +- bulkredditdownloader/{errors.py => exceptions.py} | 0 bulkredditdownloader/file_name_formatter.py | 2 +- bulkredditdownloader/resource.py | 2 +- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- bulkredditdownloader/site_downloaders/erome.py | 2 +- bulkredditdownloader/site_downloaders/gallery.py | 2 +- bulkredditdownloader/site_downloaders/gif_delivery_network.py | 2 +- bulkredditdownloader/site_downloaders/imgur.py | 2 +- bulkredditdownloader/site_downloaders/redgifs.py | 2 +- bulkredditdownloader/tests/downloaders/test_download_factory.py | 2 +- bulkredditdownloader/tests/downloaders/test_imgur.py | 2 +- bulkredditdownloader/tests/test_downloader.py | 2 +- 14 files changed, 13 insertions(+), 13 deletions(-) rename bulkredditdownloader/{errors.py => exceptions.py} (100%) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 754d765..36c85cf 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -5,7 +5,7 @@ import logging import sys from bulkredditdownloader.downloader import RedditDownloader -from bulkredditdownloader.errors import BulkDownloaderException +from bulkredditdownloader.exceptions import BulkDownloaderException logger = logging.getLogger() parser = argparse.ArgumentParser(allow_abbrev=False, diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index e083974..196dae4 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -15,7 +15,7 @@ import praw import praw.models import prawcore -import bulkredditdownloader.errors as errors +import bulkredditdownloader.exceptions as errors from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.file_name_formatter import FileNameFormatter from bulkredditdownloader.site_authenticator import SiteAuthenticator diff --git a/bulkredditdownloader/errors.py b/bulkredditdownloader/exceptions.py similarity index 100% rename from bulkredditdownloader/errors.py rename to bulkredditdownloader/exceptions.py diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 6f8fbd6..9cd20fe 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -6,7 +6,7 @@ from pathlib import Path import praw.models -from bulkredditdownloader.errors import BulkDownloaderException +from bulkredditdownloader.exceptions import BulkDownloaderException from bulkredditdownloader.resource import Resource diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index a944b3d..ea6a692 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -10,7 +10,7 @@ import _hashlib import requests from praw.models import Submission -from bulkredditdownloader.errors import BulkDownloaderException +from bulkredditdownloader.exceptions import BulkDownloaderException class Resource: diff --git 
a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 19a8507..db6adca 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -4,7 +4,7 @@ import re from typing import Type -from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.erome import Erome diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 9f62738..762e8f9 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -10,7 +10,7 @@ from typing import Optional from praw.models import Submission from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 012295f..449d853 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -8,7 +8,7 @@ import requests from praw.models import Submission from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound +from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index ba1dc41..de627a7 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -6,7 +6,7 @@ import requests from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 7804de9..af09c3f 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -8,7 +8,7 @@ import requests from praw.models import Submission from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.errors import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from 
bulkredditdownloader.site_downloaders.direct import Direct diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index e5816b7..426378a 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -7,7 +7,7 @@ import requests from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index d722542..393e239 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -4,7 +4,7 @@ import praw import pytest -from bulkredditdownloader.errors import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader from bulkredditdownloader.site_downloaders.direct import Direct from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory diff --git a/bulkredditdownloader/tests/downloaders/test_imgur.py b/bulkredditdownloader/tests/downloaders/test_imgur.py index b89bbb8..6f6a9f4 100644 --- a/bulkredditdownloader/tests/downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/downloaders/test_imgur.py @@ -5,7 +5,7 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.errors import SiteDownloaderError +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.imgur import Imgur diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 1aa58c6..3e784bc 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -12,7 +12,7 @@ import pytest from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.downloader import RedditDownloader, RedditTypes -from bulkredditdownloader.errors import BulkDownloaderException, RedditAuthenticationError, RedditUserError +from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError, RedditUserError from bulkredditdownloader.file_name_formatter import FileNameFormatter from bulkredditdownloader.site_authenticator import SiteAuthenticator From 5a2e045c77feeee947a672878c499190ac94df49 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 11:37:01 +1000 Subject: [PATCH 068/276] Add OAuth2 class --- bulkredditdownloader/oauth2.py | 88 +++++++++++++++++++++++ bulkredditdownloader/tests/test_oauth2.py | 56 +++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 bulkredditdownloader/oauth2.py create mode 100644 bulkredditdownloader/tests/test_oauth2.py diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py new file mode 100644 index 0000000..67444d8 --- /dev/null +++ b/bulkredditdownloader/oauth2.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import configparser +import logging +import random +import 
socket + +import praw +import requests + +from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError + +logger = logging.getLogger(__name__) + + +class OAuth2Authenticator: + + def __init__(self, wanted_scopes: list[str]): + self._check_scopes(wanted_scopes) + self.scopes = wanted_scopes + + @staticmethod + def _check_scopes(wanted_scopes: list[str]): + response = requests.get('https://www.reddit.com/api/v1/scopes.json', + headers={'User-Agent': 'fetch-scopes test'}) + known_scopes = [scope for scope, data in response.json().items()] + known_scopes.append('*') + for scope in wanted_scopes: + if scope not in known_scopes: + raise BulkDownloaderException(f'Scope {scope} is not known to reddit') + + def retrieve_new_token(self) -> str: + reddit = praw.Reddit(redirect_uri='http://localhost:8080', user_agent='obtain_refresh_token for BDFR') + state = str(random.randint(0, 65000)) + url = reddit.auth.url(self.scopes, state, 'permanent') + logger.warning('Authentication action required before the program can proceed') + logger.warning(f'Authenticate at {url}') + + client = self.receive_connection() + data = client.recv(1024).decode('utf-8') + param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&') + params = {key: value for (key, value) in [token.split('=') for token in param_tokens]} + + if state != params['state']: + self.send_message(client) + raise RedditAuthenticationError(f'State mismatch in OAuth2. Expected: {state} Received: {params["state"]}') + elif 'error' in params: + self.send_message(client) + raise RedditAuthenticationError(f'Error in OAuth2: {params["error"]}') + + refresh_token = reddit.auth.authorize(params["code"]) + return refresh_token + + @staticmethod + def receive_connection() -> socket.socket: + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind(('localhost', 8080)) + logger.debug('Server listening on localhost:8080') + + server.listen(1) + client = server.accept()[0] + server.close() + logger.debug('Server closed') + + return client + + @staticmethod + def send_message(client: socket.socket): + client.send('HTTP/1.1 200 OK'.encode('utf-8')) + client.close() + + +class OAuth2TokenManager(praw.reddit.BaseTokenManager): + def __init__(self, config: configparser.ConfigParser): + super(OAuth2TokenManager, self).__init__() + self.config = config + + def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer): + if authorizer.refresh_token is None: + if self.config.has_option('DEFAULT', 'user_token'): + authorizer.refresh_token = self.config.get('DEFAULT', 'user_token') + else: + raise RedditAuthenticationError('No auth token loaded in configuration') + + def post_refresh_callback(self, authorizer: praw.reddit.Authorizer): + self.config.set('DEFAULT', 'user_token', authorizer.refresh_token) diff --git a/bulkredditdownloader/tests/test_oauth2.py b/bulkredditdownloader/tests/test_oauth2.py new file mode 100644 index 0000000..a80d7a7 --- /dev/null +++ b/bulkredditdownloader/tests/test_oauth2.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import configparser +from unittest.mock import MagicMock + +import praw +import pytest + +from bulkredditdownloader.exceptions import BulkDownloaderException +from bulkredditdownloader.oauth2 import OAuth2Authenticator, OAuth2TokenManager + + +@pytest.fixture() +def example_config() -> configparser.ConfigParser: + out = configparser.ConfigParser() + config_dict = {'DEFAULT': {'user_token': 
'example'}} + out.read_dict(config_dict) + return out + + +@pytest.mark.online +@pytest.mark.parametrize('test_scopes', ( + ('history',), + ('history', 'creddits'), + ('account', 'flair'), + ('*',), +)) +def test_check_scopes(test_scopes: list[str]): + OAuth2Authenticator._check_scopes(test_scopes) + + +@pytest.mark.online +@pytest.mark.parametrize('test_scopes', ( + ('random',), + ('scope', 'another_scope'), +)) +def test_check_scopes_bad(test_scopes: list[str]): + with pytest.raises(BulkDownloaderException): + OAuth2Authenticator._check_scopes(test_scopes) + + +def test_token_manager_read(example_config: configparser.ConfigParser): + mock_authoriser = MagicMock() + mock_authoriser.refresh_token = None + test_manager = OAuth2TokenManager(example_config) + test_manager.pre_refresh_callback(mock_authoriser) + assert mock_authoriser.refresh_token == example_config.get('DEFAULT', 'user_token') + + +def test_token_manager_write(example_config: configparser.ConfigParser): + mock_authoriser = MagicMock() + mock_authoriser.refresh_token = 'changed_token' + test_manager = OAuth2TokenManager(example_config) + test_manager.post_refresh_callback(mock_authoriser) + assert example_config.get('DEFAULT', 'user_token') == 'changed_token' From 1422591bf46c36f55c14cfb594a3d8ab562d2fb8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:09:39 +1000 Subject: [PATCH 069/276] Change quotes --- bulkredditdownloader/__main__.py | 119 ++++++++++++++++--------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 36c85cf..f2eadfe 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -9,91 +9,92 @@ from bulkredditdownloader.exceptions import BulkDownloaderException logger = logging.getLogger() parser = argparse.ArgumentParser(allow_abbrev=False, - description="This program downloads media from reddit posts") + description='This program downloads media from reddit posts') def _add_options(): - parser.add_argument("directory", - help="Specifies the directory where posts will be downloaded to", - metavar="DIRECTORY") - parser.add_argument("--verbose", "-v", - action="store_true", + parser.add_argument('directory', + help='Specifies the directory where posts will be downloaded to', + metavar='DIRECTORY') + parser.add_argument('--verbose', '-v', + action='store_true', count=True) - parser.add_argument("--link", "-l", - help="Get posts from link", + parser.add_argument('--link', '-l', + help='Get posts from link', action='append', - metavar="link") - parser.add_argument("--submitted", - action="store_true", - help="Gets posts of --user") - parser.add_argument("--saved", - action="store_true", - help="Gets upvoted posts of --user") - parser.add_argument("--upvoted", - action="store_true", - help="Gets upvoted posts of --user") - parser.add_argument("--subreddit", - nargs="+", - help="Triggers subreddit mode and takes subreddit's name without r/. use \"frontpage\" " - "for frontpage", - metavar="SUBREDDIT", + metavar='link') + parser.add_argument('--submitted', + action='store_true', + help='Gets posts of --user') + parser.add_argument('--saved', + action='store_true', + help='Gets upvoted posts of --user') + parser.add_argument('--upvoted', + action='store_true', + help='Gets upvoted posts of --user') + parser.add_argument('--subreddit', + nargs='+', + help='Triggers subreddit mode and takes subreddit name. 
use \"frontpage\" for frontpage', + metavar='SUBREDDIT', type=str) - parser.add_argument("--multireddit", - help="Triggers multireddit mode and takes multireddit's name without m", - metavar="MULTIREDDIT", + parser.add_argument('--multireddit', + help='Triggers multireddit mode and takes multireddit name', + metavar='MULTIREDDIT', action='append', type=str) - parser.add_argument("--user", - help="reddit username if needed. use \"me\" for current user", - required="--multireddit" in sys.argv or "--submitted" in sys.argv, - metavar="redditor", + parser.add_argument('--authenticate', + action='store_true') + parser.add_argument('--user', + help='reddit username if needed. use "me" for current user', + required='--multireddit' in sys.argv or '--submitted' in sys.argv, + metavar='redditor', default=None, type=str) - parser.add_argument("--search", - help="Searches for given query in given subreddits", - metavar="query", + parser.add_argument('--search', + help='Searches for given query in given subreddits', + metavar='query', default=None, type=str) - parser.add_argument("--sort", - help="Either hot, top, new, controversial, rising or relevance default: hot", - choices=["hot", "top", "new", "controversial", "rising", "relevance"], - metavar="SORT TYPE", + parser.add_argument('--sort', + help='Either hot, top, new, controversial, rising or relevance default: hot', + choices=['hot', 'top', 'new', 'controversial', 'rising', 'relevance'], + metavar='SORT TYPE', default='hot', type=str) - parser.add_argument("--limit", - help="default: unlimited", - metavar="Limit", + parser.add_argument('--limit', + help='default: unlimited', + metavar='Limit', default=None, type=int) - parser.add_argument("--time", - help="Either hour, day, week, month, year or all. default: all", - choices=["all", "hour", "day", "week", "month", "year"], - metavar="TIME_LIMIT", + parser.add_argument('--time', + help='Either hour, day, week, month, year or all. 
default: all', + choices=['all', 'hour', 'day', 'week', 'month', 'year'], + metavar='TIME_LIMIT', default='all', type=str) - parser.add_argument("--skip", - nargs="+", - help="Skip posts with given type", + parser.add_argument('--skip', + nargs='+', + help='Skip posts with given type', type=str, default=[]) - parser.add_argument("--skip-domain", - nargs="+", - help="Skip posts with given domain", + parser.add_argument('--skip-domain', + nargs='+', + help='Skip posts with given domain', type=str, default=[]) - parser.add_argument("--set-folder-scheme", - action="store_true", - help="Set custom folderpath", + parser.add_argument('--set-folder-scheme', + action='store_true', + help='Set custom folderpath', default='{SUBREDDIT}' ) - parser.add_argument("--set-file-scheme", - action="store_true", - help="Set custom filename", + parser.add_argument('--set-file-scheme', + action='store_true', + help='Set custom filename', default='{REDDITOR}_{TITLE}_{POSTID}' ) - parser.add_argument("--no-dupes", - action="store_true", - help="Do not download duplicate posts on different subreddits", + parser.add_argument('--no-dupes', + action='store_true', + help='Do not download duplicate posts on different subreddits', ) From cf1029de80814d49ba5f32552674c0501697dae0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:32:08 +1000 Subject: [PATCH 070/276] Fix error with verbose argument settings --- bulkredditdownloader/__main__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index f2eadfe..e44b89d 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -17,8 +17,9 @@ def _add_options(): help='Specifies the directory where posts will be downloaded to', metavar='DIRECTORY') parser.add_argument('--verbose', '-v', - action='store_true', - count=True) + action='count', + default=0, + ) parser.add_argument('--link', '-l', help='Get posts from link', action='append', From 33f49474569d2ec4be0fff57d86e17ce8673c992 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:32:30 +1000 Subject: [PATCH 071/276] Add default scopes to configuration file --- default_config.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/default_config.cfg b/default_config.cfg index 0b1b606..f9a3f84 100644 --- a/default_config.cfg +++ b/default_config.cfg @@ -1,3 +1,4 @@ [DEFAULT] client_id = U-6gk4ZCh3IeNQ -client_secret = 7CZHY6AmKweZME5s50SfDGylaPg \ No newline at end of file +client_secret = 7CZHY6AmKweZME5s50SfDGylaPg +scopes = identity, history, read, save \ No newline at end of file From dd1831b0ea5cb12a2f1e98523fe14c8e4332c76a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:33:11 +1000 Subject: [PATCH 072/276] Remove unimplemented test --- bulkredditdownloader/tests/test_downloader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 3e784bc..5c49531 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -163,11 +163,6 @@ def test_get_submissions_from_link( assert len(results[0]) == len(test_submission_ids) -@pytest.mark.skip -def test_load_config(downloader_mock: MagicMock): - raise NotImplementedError - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddits', 'limit'), ( From 7c2b7b0e83ed4002812131a1033712a55ce097d6 Mon Sep 17 00:00:00 2001 From: 
Serene-Arc Date: Mon, 8 Mar 2021 12:34:03 +1000 Subject: [PATCH 073/276] Move scope regex parsing --- bulkredditdownloader/oauth2.py | 8 +++++++- bulkredditdownloader/tests/test_oauth2.py | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py index 67444d8..c34b54a 100644 --- a/bulkredditdownloader/oauth2.py +++ b/bulkredditdownloader/oauth2.py @@ -4,6 +4,7 @@ import configparser import logging import random +import re import socket import praw @@ -21,7 +22,7 @@ class OAuth2Authenticator: self.scopes = wanted_scopes @staticmethod - def _check_scopes(wanted_scopes: list[str]): + def _check_scopes(wanted_scopes: set[str]): response = requests.get('https://www.reddit.com/api/v1/scopes.json', headers={'User-Agent': 'fetch-scopes test'}) known_scopes = [scope for scope, data in response.json().items()] @@ -30,6 +31,11 @@ class OAuth2Authenticator: if scope not in known_scopes: raise BulkDownloaderException(f'Scope {scope} is not known to reddit') + @staticmethod + def split_scopes(scopes: str) -> set[str]: + scopes = re.split(r'[,: ]+', scopes) + return set(scopes) + def retrieve_new_token(self) -> str: reddit = praw.Reddit(redirect_uri='http://localhost:8080', user_agent='obtain_refresh_token for BDFR') state = str(random.randint(0, 65000)) diff --git a/bulkredditdownloader/tests/test_oauth2.py b/bulkredditdownloader/tests/test_oauth2.py index a80d7a7..c5a63b7 100644 --- a/bulkredditdownloader/tests/test_oauth2.py +++ b/bulkredditdownloader/tests/test_oauth2.py @@ -4,7 +4,6 @@ import configparser from unittest.mock import MagicMock -import praw import pytest from bulkredditdownloader.exceptions import BulkDownloaderException @@ -30,6 +29,17 @@ def test_check_scopes(test_scopes: list[str]): OAuth2Authenticator._check_scopes(test_scopes) +@pytest.mark.parametrize(('test_scopes', 'expected'), ( + ('history', {'history', }), + ('history creddits', {'history', 'creddits'}), + ('history, creddits, account', {'history', 'creddits', 'account'}), + ('history,creddits,account,flair', {'history', 'creddits', 'account', 'flair'}), +)) +def test_split_scopes(test_scopes: str, expected: set[str]): + result = OAuth2Authenticator.split_scopes(test_scopes) + assert result == expected + + @pytest.mark.online @pytest.mark.parametrize('test_scopes', ( ('random',), From 95876b340045f3cc2dc2e2f3f4e1719882a42860 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:34:52 +1000 Subject: [PATCH 074/276] Fix typing --- bulkredditdownloader/oauth2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py index c34b54a..7945c1f 100644 --- a/bulkredditdownloader/oauth2.py +++ b/bulkredditdownloader/oauth2.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) class OAuth2Authenticator: - def __init__(self, wanted_scopes: list[str]): + def __init__(self, wanted_scopes: set[str]): self._check_scopes(wanted_scopes) self.scopes = wanted_scopes From 36f516e3f0433bdebe7e86e90814d369dba7ae57 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 8 Mar 2021 12:35:34 +1000 Subject: [PATCH 075/276] Re-implement OAuth2 --- bulkredditdownloader/downloader.py | 37 ++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 196dae4..d32572d 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -4,6 +4,7 @@ import 
From 95876b340045f3cc2dc2e2f3f4e1719882a42860 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 8 Mar 2021 12:34:52 +1000
Subject: [PATCH 074/276] Fix typing

---
 bulkredditdownloader/oauth2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py
index c34b54a..7945c1f 100644
--- a/bulkredditdownloader/oauth2.py
+++ b/bulkredditdownloader/oauth2.py
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 class OAuth2Authenticator:
 
-    def __init__(self, wanted_scopes: list[str]):
+    def __init__(self, wanted_scopes: set[str]):
         self._check_scopes(wanted_scopes)
         self.scopes = wanted_scopes

From 36f516e3f0433bdebe7e86e90814d369dba7ae57 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 8 Mar 2021 12:35:34 +1000
Subject: [PATCH 075/276] Re-implement OAuth2

---
 bulkredditdownloader/downloader.py | 37 ++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 196dae4..d32572d 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -4,6 +4,7 @@
 import argparse
 import configparser
 import logging
+import re
 import socket
 from datetime import datetime
 from enum import Enum, auto
@@ -18,6 +19,7 @@ import prawcore
 
 import bulkredditdownloader.exceptions as errors
 from bulkredditdownloader.download_filter import DownloadFilter
 from bulkredditdownloader.file_name_formatter import FileNameFormatter
+from bulkredditdownloader.oauth2 import OAuth2Authenticator, OAuth2TokenManager
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory
@@ -44,7 +46,7 @@ class RedditTypes:
 
 class RedditDownloader:
     def __init__(self, args: argparse.Namespace):
         self.args = args
-        self.config_directories = appdirs.AppDirs('bulk_reddit_downloader')
+        self.config_directories = appdirs.AppDirs('bulk_reddit_downloader', 'BDFR')
         self.run_time = datetime.now().isoformat()
         self._setup_internal_objects()
@@ -55,19 +57,31 @@ class RedditDownloader:
         self.time_filter = self._create_time_filter()
         self.sort_filter = self._create_sort_filter()
         self.file_name_formatter = self._create_file_name_formatter()
-        self.authenticator = self._create_authenticator()
+        # self.authenticator = self._create_authenticator()
+
         self._resolve_user_name()
         self._determine_directories()
         self._create_file_logger()
         self.master_hash_list = []
         self._load_config()
-        if self.cfg_parser.has_option('DEFAULT', 'reddit_token'):
-            # TODO: implement OAuth2 authentication
+
+        self._create_reddit_instance()
+
+    def _create_reddit_instance(self):
+        if self.args.authenticate:
+            if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
+                scopes = self.cfg_parser.get('DEFAULT', 'scopes')
+                scopes = OAuth2Authenticator.split_scopes(scopes)
+                oauth2_authenticator = OAuth2Authenticator(scopes)
+                token = oauth2_authenticator.retrieve_new_token()
+                self.cfg_parser['DEFAULT']['user_token'] = token
+            token_manager = OAuth2TokenManager(self.cfg_parser)
+
             self.authenticated = True
             self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                                                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
                                                user_agent=socket.gethostname(),
-                                               )
+                                               token_manager=token_manager)
         else:
             self.authenticated = False
             self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
@@ -92,10 +106,15 @@ class RedditDownloader:
 
     def _load_config(self):
         self.cfg_parser = configparser.ConfigParser()
-        if self.args.use_local_config and Path('./config.cfg').exists():
-            self.cfg_parser.read(Path('./config.cfg'))
-        else:
-            self.cfg_parser.read(Path('./default_config.cfg').resolve())
+        possible_paths = [Path('./config.cfg'),
+                          Path(self.config_directory, 'config.cfg'),
+                          Path('./default_config.cfg'),
+                          ]
+        for path in possible_paths:
+            if path.resolve().expanduser().exists():
+                self.config_location = path
+                break
+        self.cfg_parser.read(self.config_location)
 
     def _create_file_logger(self):
         main_logger = logging.getLogger()
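The rewritten _load_config resolves the configuration with a first-match-wins search: a config.cfg in the working directory shadows one in the appdirs config directory, which in turn shadows the bundled default_config.cfg. The same lookup reduced to a standalone sketch (the middle path is a hypothetical stand-in for the platform-specific appdirs location):

    from pathlib import Path

    candidates = [Path('./config.cfg'),               # working directory wins
                  Path('~/.config/bdfr/config.cfg'),  # hypothetical appdirs path
                  Path('./default_config.cfg')]       # bundled fallback
    config_location = next(
        (path for path in candidates if path.expanduser().resolve().exists()), None)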
From 862121cac6a07fc44ae5209c8cf3de28d3e3b6e3 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 8 Mar 2021 12:46:32 +1000
Subject: [PATCH 076/276] Fix OAuth2

---
 bulkredditdownloader/downloader.py |  5 ++++-
 bulkredditdownloader/oauth2.py     | 12 +++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index d32572d..1fd3bb7 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -72,7 +72,10 @@ class RedditDownloader:
             if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
                 scopes = self.cfg_parser.get('DEFAULT', 'scopes')
                 scopes = OAuth2Authenticator.split_scopes(scopes)
-                oauth2_authenticator = OAuth2Authenticator(scopes)
+                oauth2_authenticator = OAuth2Authenticator(
+                    scopes,
+                    self.cfg_parser.get('DEFAULT', 'client_id'),
+                    self.cfg_parser.get('DEFAULT', 'client_secret'))
                 token = oauth2_authenticator.retrieve_new_token()
                 self.cfg_parser['DEFAULT']['user_token'] = token
             token_manager = OAuth2TokenManager(self.cfg_parser)

diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py
index 7945c1f..30a9d0e 100644
--- a/bulkredditdownloader/oauth2.py
+++ b/bulkredditdownloader/oauth2.py
@@ -17,9 +17,11 @@ logger = logging.getLogger(__name__)
 
 class OAuth2Authenticator:
 
-    def __init__(self, wanted_scopes: set[str]):
+    def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str):
         self._check_scopes(wanted_scopes)
         self.scopes = wanted_scopes
+        self.client_id = client_id
+        self.client_secret = client_secret
 
     @staticmethod
     def _check_scopes(wanted_scopes: set[str]):
@@ -37,7 +39,11 @@ class OAuth2Authenticator:
         return set(scopes)
 
     def retrieve_new_token(self) -> str:
-        reddit = praw.Reddit(redirect_uri='http://localhost:8080', user_agent='obtain_refresh_token for BDFR')
+        reddit = praw.Reddit(
+            redirect_uri='http://localhost:7634',
+            user_agent='obtain_refresh_token for BDFR',
+            client_id=self.client_id,
+            client_secret=self.client_secret)
         state = str(random.randint(0, 65000))
         url = reddit.auth.url(self.scopes, state, 'permanent')
         logger.warning('Authentication action required before the program can proceed')
@@ -62,7 +68,7 @@ class OAuth2Authenticator:
     def receive_connection() -> socket.socket:
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        server.bind(('localhost', 8080))
+        server.bind(('localhost', 7634))
         logger.debug('Server listening on localhost:8080')
         server.listen(1)
 
         client = server.accept()[0]
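Both ends of the loopback handshake now agree on port 7634: praw builds the authorisation URL with that redirect port, and receive_connection binds the same port to catch Reddit's redirect carrying the state and code parameters. A condensed sketch of the initiating half, with placeholder credentials; the app registered on Reddit's side must use the matching redirect URI:

    import praw

    reddit = praw.Reddit(redirect_uri='http://localhost:7634',
                         user_agent='obtain_refresh_token for BDFR',
                         client_id='CLIENT_ID',          # placeholder
                         client_secret='CLIENT_SECRET')  # placeholder
    url = reddit.auth.url({'identity', 'history'}, 'some-state', 'permanent')
    # The user opens `url` in a browser; Reddit then redirects to
    # http://localhost:7634/?state=...&code=..., which the bound socket receives.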
From 735833503fd7e42dbd9f33bc42a9150921a42498 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 8 Mar 2021 12:52:53 +1000
Subject: [PATCH 077/276] Fix wrong if condition

---
 bulkredditdownloader/downloader.py | 39 +++++++++++++++---------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 1fd3bb7..aeb88ab 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -181,26 +181,27 @@ class RedditDownloader:
             return []
 
     def _get_user_data(self) -> list[Iterator]:
-        if self.args.user:
-            if not self._check_user_existence(self.args.user):
-                raise errors.RedditUserError(f'User {self.args.user} does not exist')
-            generators = []
-            sort_function = self._determine_sort_function()
-            if self.args.submitted:
-                generators.append(
-                    sort_function(
-                        self.reddit_instance.redditor(self.args.user).submissions,
-                        limit=self.args.limit))
-            if not self.authenticated and any((self.args.upvoted, self.args.saved)):
-                raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
+        if any([self.args.submitted, self.args.upvoted, self.args.saved]):
+            if self.args.user:
+                if not self._check_user_existence(self.args.user):
+                    raise errors.RedditUserError(f'User {self.args.user} does not exist')
+                generators = []
+                sort_function = self._determine_sort_function()
+                if self.args.submitted:
+                    generators.append(
+                        sort_function(
+                            self.reddit_instance.redditor(self.args.user).submissions,
+                            limit=self.args.limit))
+                if not self.authenticated and any((self.args.upvoted, self.args.saved)):
+                    raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
+                else:
+                    if self.args.upvoted:
+                        generators.append(self.reddit_instance.redditor(self.args.user).upvoted)
+                    if self.args.saved:
+                        generators.append(self.reddit_instance.redditor(self.args.user).saved)
+                return generators
             else:
-                if self.args.upvoted:
-                    generators.append(self.reddit_instance.redditor(self.args.user).upvoted)
-                if self.args.saved:
-                    generators.append(self.reddit_instance.redditor(self.args.user).saved)
-            return generators
-        else:
-            raise errors.BulkDownloaderException('A user must be supplied to download user data')
+                raise errors.BulkDownloaderException('A user must be supplied to download user data')
 
     def _check_user_existence(self, name: str) -> bool:
         user = self.reddit_instance.redditor(name=name)

From 7d30af3559e9364753e208f934afc555a9a4c69b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 15:51:06 +1000
Subject: [PATCH 078/276] Add empty authenticator

---
 bulkredditdownloader/downloader.py         | 6 +++---
 bulkredditdownloader/site_authenticator.py | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index aeb88ab..8ddd00a 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -57,14 +57,14 @@ class RedditDownloader:
         self.time_filter = self._create_time_filter()
         self.sort_filter = self._create_sort_filter()
         self.file_name_formatter = self._create_file_name_formatter()
-        # self.authenticator = self._create_authenticator()
 
         self._resolve_user_name()
         self._determine_directories()
         self._create_file_logger()
-        self.master_hash_list = []
         self._load_config()
+        self.master_hash_list = []
+        self.authenticator = self._create_authenticator()
         self._create_reddit_instance()
 
     def _create_reddit_instance(self):
@@ -231,7 +231,7 @@ class RedditDownloader:
         return DownloadFilter(self.args.skip, self.args.skip_domain)
 
     def _create_authenticator(self) -> SiteAuthenticator:
-        raise NotImplementedError
+        return SiteAuthenticator(self.cfg_parser)
 
     def download(self):
         for generator in self.reddit_lists:

diff --git a/bulkredditdownloader/site_authenticator.py b/bulkredditdownloader/site_authenticator.py
index 93cebf7..bbf3b46 100644
--- a/bulkredditdownloader/site_authenticator.py
+++ b/bulkredditdownloader/site_authenticator.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
+import configparser
+
 
 class SiteAuthenticator:
 
-    def __init__(self):
+    def __init__(self, cfg: configparser.ConfigParser):
         self.imgur_authentication = None
From db6c64d0abb852ac9e7759aa41b31509a7973319 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 16:05:09 +1000
Subject: [PATCH 079/276] Fix some tests

---
 bulkredditdownloader/tests/test_downloader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index 5c49531..f986bfe 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -277,6 +277,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic
 
 @pytest.mark.online
 @pytest.mark.reddit
 def test_get_user_no_user(downloader_mock: MagicMock):
+    downloader_mock.args.upvoted = True
     with pytest.raises(BulkDownloaderException):
         RedditDownloader._get_user_data(downloader_mock)
 
@@ -290,6 +291,7 @@ def test_get_user_nonexistent_user(test_user: str, downloader_mock: MagicMock, reddit_instance: praw.Reddit):
     downloader_mock.reddit_instance = reddit_instance
     downloader_mock.args.user = test_user
+    downloader_mock.args.upvoted = True
     downloader_mock._check_user_existence.return_value = RedditDownloader._check_user_existence(
         downloader_mock, test_user)
     with pytest.raises(RedditUserError):

From 326eb484cc894f380c3d668a400193cc2cfb9138 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:19:14 +1000
Subject: [PATCH 080/276] Add file to gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index a81c8ee..7207598 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,3 +136,6 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+
+# Test configuration file
+test_config.cfg

From afe618916bc772fa12398fbbc3090837d71ea911 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:20:15 +1000
Subject: [PATCH 081/276] Make sure refresh token is always written to file

---
 bulkredditdownloader/downloader.py | 2 +-
 bulkredditdownloader/oauth2.py     | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 8ddd00a..efc1aef 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -78,7 +78,7 @@ class RedditDownloader:
                     self.cfg_parser.get('DEFAULT', 'client_secret'))
                 token = oauth2_authenticator.retrieve_new_token()
                 self.cfg_parser['DEFAULT']['user_token'] = token
-            token_manager = OAuth2TokenManager(self.cfg_parser)
+            token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)

diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py
index 30a9d0e..21b846b 100644
--- a/bulkredditdownloader/oauth2.py
+++ b/bulkredditdownloader/oauth2.py
@@ -6,6 +6,7 @@ import logging
 import random
 import re
 import socket
+from pathlib import Path
 
 import praw
 import requests
@@ -85,9 +86,10 @@ class OAuth2Authenticator:
 
 class OAuth2TokenManager(praw.reddit.BaseTokenManager):
-    def __init__(self, config: configparser.ConfigParser):
+    def __init__(self, config: configparser.ConfigParser, config_location: Path):
         super(OAuth2TokenManager, self).__init__()
         self.config = config
+        self.config_location = config_location
 
     def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer):
         if authorizer.refresh_token is None:
@@ -98,3 +100,5 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
 
     def post_refresh_callback(self, authorizer: praw.reddit.Authorizer):
         self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
+        with open(self.config_location, 'w') as file:
+            self.config.write(file, True)
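With a config location attached, the token manager closes the loop: pre_refresh_callback seeds praw's authorizer from the stored user_token, and post_refresh_callback writes every rotated refresh token straight back to disk. A rough sketch of wiring it up, assuming a config.cfg that already holds client_id, client_secret, and user_token:

    import configparser
    import socket
    from pathlib import Path

    import praw

    from bulkredditdownloader.oauth2 import OAuth2TokenManager

    config = configparser.ConfigParser()
    config.read('config.cfg')  # assumed to exist with the keys above
    reddit = praw.Reddit(client_id=config.get('DEFAULT', 'client_id'),
                         client_secret=config.get('DEFAULT', 'client_secret'),
                         user_agent=socket.gethostname(),
                         token_manager=OAuth2TokenManager(config, Path('config.cfg')))
    # Any authenticated call may rotate the token; post_refresh_callback then
    # persists the new value back into config.cfg.
    print(reddit.user.me())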
From 933be21392b6d7306fa1be4d8ebd22d3554a3f55 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:32:51 +1000
Subject: [PATCH 082/276] Update tests

---
 bulkredditdownloader/tests/test_oauth2.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/bulkredditdownloader/tests/test_oauth2.py b/bulkredditdownloader/tests/test_oauth2.py
index c5a63b7..ced7605 100644
--- a/bulkredditdownloader/tests/test_oauth2.py
+++ b/bulkredditdownloader/tests/test_oauth2.py
@@ -2,6 +2,7 @@
 # coding=utf-8
 
 import configparser
+from pathlib import Path
 from unittest.mock import MagicMock
 
 import pytest
@@ -42,10 +43,10 @@ def test_split_scopes(test_scopes: str, expected: set[str]):
 
 @pytest.mark.online
 @pytest.mark.parametrize('test_scopes', (
-    ('random',),
-    ('scope', 'another_scope'),
+    {'random', },
+    {'scope', 'another_scope'},
 ))
-def test_check_scopes_bad(test_scopes: list[str]):
+def test_check_scopes_bad(test_scopes: set[str]):
     with pytest.raises(BulkDownloaderException):
         OAuth2Authenticator._check_scopes(test_scopes)
 
@@ -53,14 +54,18 @@ def test_check_scopes_bad(test_scopes: list[str]):
 def test_token_manager_read(example_config: configparser.ConfigParser):
     mock_authoriser = MagicMock()
     mock_authoriser.refresh_token = None
-    test_manager = OAuth2TokenManager(example_config)
+    test_manager = OAuth2TokenManager(example_config, None)
     test_manager.pre_refresh_callback(mock_authoriser)
     assert mock_authoriser.refresh_token == example_config.get('DEFAULT', 'user_token')
 
 
-def test_token_manager_write(example_config: configparser.ConfigParser):
+def test_token_manager_write(example_config: configparser.ConfigParser, tmp_path: Path):
+    test_path = tmp_path / 'test.cfg'
     mock_authoriser = MagicMock()
     mock_authoriser.refresh_token = 'changed_token'
-    test_manager = OAuth2TokenManager(example_config)
+    test_manager = OAuth2TokenManager(example_config, test_path)
     test_manager.post_refresh_callback(mock_authoriser)
     assert example_config.get('DEFAULT', 'user_token') == 'changed_token'
+    with open(test_path, 'r') as file:
+        file_contents = file.read()
+    assert 'user_token = changed_token' in file_contents

From d8a767e8dab5f34a6a7cd2c6f69b1469ead2aa1c Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:36:12 +1000
Subject: [PATCH 083/276] Fix option default

---
 bulkredditdownloader/__main__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index e44b89d..84780f9 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -23,6 +23,7 @@ def _add_options():
     parser.add_argument('--link', '-l',
                         help='Get posts from link',
                         action='append',
+                        default=[],
                         metavar='link')
     parser.add_argument('--submitted',
                         action='store_true',

From 1f62a7ccd6c4362d22287709541a2dff592a793e Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:36:32 +1000
Subject: [PATCH 084/276] Fix function return

---
 bulkredditdownloader/downloader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index efc1aef..3a24b2b 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -202,6 +202,8 @@ class RedditDownloader:
                 return generators
             else:
                 raise errors.BulkDownloaderException('A user must be supplied to download user data')
+        else:
+            return []
 
     def _check_user_existence(self, name: str) -> bool:
         user = self.reddit_instance.redditor(name=name)
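The default=[] in the option-default fix matters because argparse leaves an action='append' option at its default when the flag is never passed, and a default of None cannot be iterated by the downstream link handling. A standalone illustration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--link', '-l', action='append', default=[])
    print(parser.parse_args([]).link)           # [] - safe to iterate
    print(parser.parse_args(['-l', 'a']).link)  # ['a']
    # With default=None, parse_args([]).link would be None and a later
    # `for link in args.link` loop would raise TypeError.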
From cfd92a8d146c56fd28d07fd0c0591263ed2872c9 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:45:26 +1000
Subject: [PATCH 085/276] Add some logging output

---
 bulkredditdownloader/downloader.py | 29 +++++++++++++++++++++++------
 bulkredditdownloader/oauth2.py     |  4 +++-
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 3a24b2b..c8766d5 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -53,23 +53,32 @@ class RedditDownloader:
         self.reddit_lists = self._retrieve_reddit_lists()
 
     def _setup_internal_objects(self):
-        self.download_filter = self._create_download_filter()
-        self.time_filter = self._create_time_filter()
-        self.sort_filter = self._create_sort_filter()
-        self.file_name_formatter = self._create_file_name_formatter()
-
-        self._resolve_user_name()
         self._determine_directories()
         self._create_file_logger()
+
+        self.download_filter = self._create_download_filter()
+        logger.debug('Created download filter')
+        self.time_filter = self._create_time_filter()
+        logger.debug('Created time filter')
+        self.sort_filter = self._create_sort_filter()
+        logger.debug('Created sort filter')
+        self.file_name_formatter = self._create_file_name_formatter()
+        logger.debug('Created file name formatter')
+
+        self._resolve_user_name()
         self._load_config()
+        logger.debug(f'Configuration loaded from {self.config_location}')
         self.master_hash_list = []
         self.authenticator = self._create_authenticator()
+        logger.debug('Created site authenticator')
         self._create_reddit_instance()
 
     def _create_reddit_instance(self):
         if self.args.authenticate:
+            logger.debug('Using authenticated Reddit instance')
             if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
+                logger.debug('Commencing OAuth2 authentication')
                 scopes = self.cfg_parser.get('DEFAULT', 'scopes')
                 scopes = OAuth2Authenticator.split_scopes(scopes)
                 oauth2_authenticator = OAuth2Authenticator(
@@ -86,6 +95,7 @@ class RedditDownloader:
                                                user_agent=socket.gethostname(),
                                                token_manager=token_manager)
         else:
+            logger.debug('Using unauthenticated Reddit instance')
            self.authenticated = False
             self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                                                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
@@ -94,9 +104,13 @@ class RedditDownloader:
     def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
         master_list = []
         master_list.extend(self._get_subreddits())
+        logger.debug('Retrieved subreddits')
         master_list.extend(self._get_multireddits())
+        logger.debug('Retrieved multireddits')
         master_list.extend(self._get_user_data())
+        logger.debug('Retrieved user data')
         master_list.extend(self._get_submissions_from_link())
+        logger.debug('Retrieved submissions for given links')
         return master_list
 
     def _determine_directories(self):
@@ -113,10 +127,13 @@ class RedditDownloader:
                           Path(self.config_directory, 'config.cfg'),
                           Path('./default_config.cfg'),
                           ]
+        self.config_location = None
         for path in possible_paths:
             if path.resolve().expanduser().exists():
                 self.config_location = path
                 break
+        if not self.config_location:
+            raise errors.BulkDownloaderException('Could not find a configuration file to load')
         self.cfg_parser.read(self.config_location)
 
     def _create_file_logger(self):

diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py
index 21b846b..a29d907 100644
--- a/bulkredditdownloader/oauth2.py
+++ b/bulkredditdownloader/oauth2.py
@@ -70,7 +70,7 @@ class OAuth2Authenticator:
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         server.bind(('localhost', 7634))
-        logger.debug('Server listening on localhost:8080')
+        logger.debug('Server listening on localhost:7634')
         server.listen(1)
 
         client = server.accept()[0]
@@ -95,6 +95,7 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
         if authorizer.refresh_token is None:
             if self.config.has_option('DEFAULT', 'user_token'):
                 authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
+                logger.debug('Loaded OAuth2 token for authoriser')
             else:
                 raise RedditAuthenticationError('No auth token loaded in configuration')
 
     def post_refresh_callback(self, authorizer: praw.reddit.Authorizer):
         self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
         with open(self.config_location, 'w') as file:
             self.config.write(file, True)
+        logger.debug(f'Written OAuth2 token from authoriser to {self.config_location}')
From 14b63487bc8b22867489b43ca48d09f7c57a4734 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 9 Mar 2021 19:51:48 +1000
Subject: [PATCH 086/276] Immediately write token to disk in case it is unused

---
 bulkredditdownloader/downloader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index c8766d5..4793433 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -87,6 +87,8 @@ class RedditDownloader:
                 token = oauth2_authenticator.retrieve_new_token()
                 self.cfg_parser['DEFAULT']['user_token'] = token
+                with open(self.config_location, 'w') as file:
+                    self.cfg_parser.write(file, True)
             token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)

From f138b9210e7b6cb3d8b6379202c42523ce5f66ac Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 08:43:15 +1000
Subject: [PATCH 087/276] Add another mark

---
 pytest.ini | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytest.ini b/pytest.ini
index ca6ae59..5123ee6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -2,4 +2,6 @@ markers =
     online: tests require a connection to the internet
     reddit: tests require a connection to Reddit
-    slow: test is slow to run
\ No newline at end of file
+    slow: test is slow to run
+    authenticated: test requires an authenticated Reddit instance
+

From d2cc3e8b6ae042900e64da68e964fa867d076a53 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 09:41:46 +1000
Subject: [PATCH 088/276] Add authenticated Reddit instance fixture

---
 bulkredditdownloader/tests/conftest.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/bulkredditdownloader/tests/conftest.py b/bulkredditdownloader/tests/conftest.py
index e1de72e..4197989 100644
--- a/bulkredditdownloader/tests/conftest.py
+++ b/bulkredditdownloader/tests/conftest.py
@@ -1,11 +1,34 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
+import configparser
+import socket
+from pathlib import Path
+
 import praw
 import pytest
 
+from bulkredditdownloader.oauth2 import OAuth2TokenManager
+
 
 @pytest.fixture(scope='session')
 def reddit_instance():
     rd = praw.Reddit(client_id='U-6gk4ZCh3IeNQ', client_secret='7CZHY6AmKweZME5s50SfDGylaPg', user_agent='test')
     return rd
+
+
+@pytest.fixture(scope='session')
+def authenticated_reddit_instance():
+    test_config_path = Path('test_config.cfg')
+    if not test_config_path.exists():
+        pytest.skip('Refresh token must be provided to authenticate with OAuth2')
+    cfg_parser = configparser.ConfigParser()
+    cfg_parser.read(test_config_path)
+    if not cfg_parser.has_option('DEFAULT', 'user_token'):
+        pytest.skip('Refresh token must be provided to authenticate with OAuth2')
+    token_manager = OAuth2TokenManager(cfg_parser, test_config_path)
+    reddit_instance = praw.Reddit(client_id=cfg_parser.get('DEFAULT', 'client_id'),
+                                  client_secret=cfg_parser.get('DEFAULT', 'client_secret'),
+                                  user_agent=socket.gethostname(),
+                                  token_manager=token_manager)
+    return reddit_instance
From 09e42ff5f92463c1ec555149ec11049296b40aa9 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 11:06:50 +1000
Subject: [PATCH 089/276] Add some more tests for downloader

---
 bulkredditdownloader/downloader.py            |  6 +--
 bulkredditdownloader/tests/test_downloader.py | 46 ++++++++++++++-----
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 4793433..8f13dd4 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -164,7 +164,7 @@ class RedditDownloader:
 
     def _resolve_user_name(self):
         if self.args.user == 'me':
-            self.args.user = self.reddit_instance.user.me()
+            self.args.user = self.reddit_instance.user.me().name
 
     def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
         supplied_submissions = []
@@ -215,9 +215,9 @@ class RedditDownloader:
                     raise errors.RedditAuthenticationError('Accessing user lists requires authentication')
                 else:
                     if self.args.upvoted:
-                        generators.append(self.reddit_instance.redditor(self.args.user).upvoted)
+                        generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit))
                     if self.args.saved:
-                        generators.append(self.reddit_instance.redditor(self.args.user).saved)
+                        generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit))
                 return generators
             else:
                 raise errors.BulkDownloaderException('A user must be supplied to download user data')

diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index f986bfe..6531ce7 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -300,30 +300,52 @@ def test_get_user_nonexistent_user(test_user: str, downloader_mock: MagicMock, r
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_get_user_upvoted():
-    raise NotImplementedError
+@pytest.mark.authenticated
+def test_get_user_upvoted(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit):
+    downloader_mock.reddit_instance = authenticated_reddit_instance
+    downloader_mock.args.user = 'me'
+    downloader_mock.args.upvoted = True
+    downloader_mock.args.limit = 10
+    downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
+    downloader_mock.sort_filter = RedditTypes.SortType.HOT
+    RedditDownloader._resolve_user_name(downloader_mock)
+    results = RedditDownloader._get_user_data(downloader_mock)
+    assert_all_results_are_submissions(10, results)
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_get_user_upvoted_unauthenticated():
-    raise NotImplementedError
+def test_get_user_upvoted_unauthenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit):
+    downloader_mock.args.user = 'random'
+    downloader_mock.args.upvoted = True
+    downloader_mock.authenticated = False
+    with pytest.raises(RedditAuthenticationError):
+        RedditDownloader._get_user_data(downloader_mock)
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_get_user_saved():
-    raise NotImplementedError
+@pytest.mark.authenticated
+def test_get_user_saved(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit):
+    downloader_mock.reddit_instance = authenticated_reddit_instance
+    downloader_mock.args.user = 'me'
+    downloader_mock.args.saved = True
+    downloader_mock.args.limit = 10
+    downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
+    downloader_mock.sort_filter = RedditTypes.SortType.HOT
+    RedditDownloader._resolve_user_name(downloader_mock)
+    results = RedditDownloader._get_user_data(downloader_mock)
+    assert_all_results_are_submissions(10, results)
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_get_user_saved_unauthenticated():
-    raise NotImplementedError
+def test_get_user_saved_unauthenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit):
+    downloader_mock.args.user = 'random'
+    downloader_mock.args.saved = True
+    downloader_mock.authenticated = False
+    with pytest.raises(RedditAuthenticationError):
+        RedditDownloader._get_user_data(downloader_mock)
 
 
 @pytest.mark.online
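The limit argument turns the upvoted and saved listings into bounded generators, matching the cap already applied to submissions. A sketch of the difference, assuming an authenticated praw.Reddit instance named reddit and an illustrative username:

    redditor = reddit.redditor('some_user')  # illustrative
    capped = redditor.saved(limit=10)        # yields at most 10 items
    uncapped = redditor.saved(limit=None)    # pages through everything
    for submission in capped:
        print(submission.id)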
From a4716292877c792dcd59d792ed438a761608bfbd Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 11:08:24 +1000
Subject: [PATCH 090/276] Convert some strings to f-strings

---
 bulkredditdownloader/downloader.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 8f13dd4..bbc6e5c 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -261,33 +261,33 @@ class RedditDownloader:
 
     def _download_submission(self, submission: praw.models.Submission):
         if self.download_filter.check_url(submission.url):
-            logger.debug('Attempting to download submission {}'.format(submission.id))
+            logger.debug(f'Attempting to download submission {submission.id}')
             try:
                 downloader_class = DownloadFactory.pull_lever(submission.url)
                 downloader = downloader_class(submission)
             except errors.NotADownloadableLinkError as e:
-                logger.error('Could not download submission {}: {}'.format(submission.name, e))
+                logger.error(f'Could not download submission {submission.name}: {e}')
                 return
 
             if self.args.no_download:
-                logger.info('Skipping download for submission {}'.format(submission.id))
+                logger.info(f'Skipping download for submission {submission.id}')
             else:
                 content = downloader.find_resources(self.authenticator)
                 for res in content:
                     destination = self.file_name_formatter.format_path(res, self.download_directory)
                     if destination.exists():
-                        logger.debug('File already exists: {}'.format(destination))
+                        logger.debug(f'File already exists: {destination}')
                     else:
                         if res.hash.hexdigest() not in self.master_hash_list and not self.args.no_dupes:
                             # TODO: consider making a hard link/symlink here
                             destination.parent.mkdir(parents=True, exist_ok=True)
                             with open(destination, 'wb') as file:
                                 file.write(res.content)
-                            logger.debug('Written file to {}'.format(destination))
+                            logger.debug(f'Written file to {destination}')
                             self.master_hash_list.append(res.hash.hexdigest())
-                            logger.debug('Hash added to master list: {}'.format(res.hash.hexdigest()))
+                            logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
                         else:
                             logger.debug(f'Resource from {res.url} downloaded elsewhere')
 
-            logger.info('Downloaded submission {}'.format(submission.name))
+            logger.info(f'Downloaded submission {submission.name}')

From 839b6f50a980c15ffcb43952d17f24a293e74a51 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 14:07:59 +1000
Subject: [PATCH 091/276] Update regex for finding extension

---
 bulkredditdownloader/resource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py
index ea6a692..30cbd3d 100644
--- a/bulkredditdownloader/resource.py
+++ b/bulkredditdownloader/resource.py
@@ -51,7 +51,7 @@ class Resource:
         self.hash = hashlib.md5(self.content)
 
     def _determine_extension(self) -> str:
-        extension_pattern = r'.*(\..{3,5})$'
+        extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?$')
         match = re.search(extension_pattern, self.url)
         if match:
             return match.group(1)
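The compiled pattern now tolerates a query string after the extension, which is how Reddit preview links arrive. A quick check of both shapes, reusing the URL style from the accompanying test:

    import re

    extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?$')
    print(extension_pattern.search('hard.png.mp4').group(1))  # .mp4
    print(extension_pattern.search(
        'https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png').group(1))  # .png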
From 25f70463f3c50049400982c22d2967b14c8035f7 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 14:08:42 +1000
Subject: [PATCH 092/276] Allow for single root folder

---
 bulkredditdownloader/file_name_formatter.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index 9cd20fe..3fe4c60 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -17,8 +17,6 @@ class FileNameFormatter:
         if not self.validate_string(file_format_string):
             raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
         self.file_format_string = file_format_string
-        if not self.validate_string(directory_format_string):
-            raise BulkDownloaderException(f'"{directory_format_string}" is not a valid format string')
         self.directory_format_string = directory_format_string
 
     @staticmethod

From eca5da7f46812f8f8e3ad6111ceb2a98579c2a86 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 14:09:27 +1000
Subject: [PATCH 093/276] Add test for resource

---
 bulkredditdownloader/tests/test_resource.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bulkredditdownloader/tests/test_resource.py b/bulkredditdownloader/tests/test_resource.py
index 3f9b976..d7b3898 100644
--- a/bulkredditdownloader/tests/test_resource.py
+++ b/bulkredditdownloader/tests/test_resource.py
@@ -13,6 +13,7 @@ from bulkredditdownloader.resource import Resource
     ('http://www.random.com/resource.png', '.png'),
     ('https://www.resource.com/test/example.jpg', '.jpg'),
     ('hard.png.mp4', '.mp4'),
+    ('https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png&auto=webp&s=19de214e634cbcad99', '.png'),
 ))
 def test_resource_get_extension(test_url: str, expected: str):
     test_resource = Resource(None, test_url)
From d78c4ca78ebc846d6c5a3beecc8d838a8422b69c Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 14:39:01 +1000
Subject: [PATCH 094/276] Add indexing for multiple resources from one
 submission

---
 bulkredditdownloader/file_name_formatter.py   | 15 ++++--
 .../tests/test_file_name_formatter.py         | 46 +++++++++++++++++--
 2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index 3fe4c60..3575b00 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -3,6 +3,7 @@
 
 import re
 from pathlib import Path
+from typing import Optional
 
 import praw.models
 
@@ -38,13 +39,21 @@ class FileNameFormatter:
             result = result.replace('/', '')
         return result
 
-    def format_path(self, resource: Resource, destination_directory: Path) -> Path:
+    def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path:
         subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string)
+        index = f'_{str(index)}' if index else ''
         file_path = subfolder / (str(self._format_name(resource.source_submission,
-                                                       self.file_format_string)) + resource.extension)
+                                                       self.file_format_string)) + index + resource.extension)
         return file_path
 
-    @staticmethod
+    def format_resource_paths(self, resources: list[Resource],
+                              destination_directory: Path) -> list[tuple[Path, Resource]]:
+        out = []
+        for i, res in enumerate(resources, start=1):
+            out.append((self._format_path(res, destination_directory, i), res))
+        return out
+
+    @staticmethod
     def validate_string(test_string: str) -> bool:
         if not test_string:
             return False

diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py
index eb679d3..533d570 100644
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -2,7 +2,8 @@
 # coding=utf-8
 
 from pathlib import Path
-from unittest.mock import Mock
+from typing import Optional
+from unittest.mock import MagicMock
 
 import praw.models
 import pytest
@@ -12,8 +13,8 @@ from bulkredditdownloader.resource import Resource
 
 
 @pytest.fixture()
-def submission() -> Mock:
-    test = Mock()
+def submission() -> MagicMock:
+    test = MagicMock()
     test.title = 'name'
     test.subreddit.display_name = 'randomreddit'
     test.author.name = 'person'
@@ -37,7 +38,7 @@ def reddit_submission(reddit_instance) -> praw.models.Submission:
     ('{DATE}', '123456789'),
     ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345')
 ))
-def test_format_name_mock(format_string: str, expected: str, submission: Mock):
+def test_format_name_mock(format_string: str, expected: str, submission: MagicMock):
     result = FileNameFormatter._format_name(submission, format_string)
     assert result == expected
@@ -87,5 +88,40 @@ def test_format_full(
         reddit_submission: praw.models.Submission):
     test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
     test_formatter = FileNameFormatter(format_string_file, format_string_directory)
-    result = test_formatter.format_path(test_resource, Path('test'))
+    result = test_formatter._format_path(test_resource, Path('test'))
     assert str(result) == expected
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'index', 'expected'),
+                         (('{SUBREDDIT}', '{POSTID}', None, 'test/Mindustry/lgilgt.png'),
+                          ('{SUBREDDIT}', '{POSTID}', 1, 'test/Mindustry/lgilgt_1.png'),
+                          ('{SUBREDDIT}', '{POSTID}', 2, 'test/Mindustry/lgilgt_2.png'),
+                          ('{SUBREDDIT}', '{TITLE}_{POSTID}', 2,
+                           'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt_2.png'),
+                          ))
+def test_format_full_with_index_suffix(
+        format_string_directory: str,
+        format_string_file: str,
+        index: Optional[int],
+        expected: str,
+        reddit_submission: praw.models.Submission):
+    test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png')
+    test_formatter = FileNameFormatter(format_string_file, format_string_directory)
+    result = test_formatter._format_path(test_resource, Path('test'), index)
+    assert str(result) == expected
+
+
+def test_format_multiple_resources():
+    mocks = []
+    for i in range(1, 5):
+        new_mock = MagicMock()
+        new_mock.url = 'https://example.com/test.png'
+        new_mock.extension = '.png'
+        new_mock.source_submission.title = 'test'
+        mocks.append(new_mock)
+    test_formatter = FileNameFormatter('{TITLE}', '')
+    results = test_formatter.format_resource_paths(mocks, Path('.'))
+    results = set([str(res[0]) for res in results])
+    assert results == {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'}
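format_resource_paths numbers a submission's resources from 1 upward so that multi-image posts cannot collide on a single filename, while a lone resource formatted without an index keeps its plain name. The suffix rule in isolation (a hypothetical helper, not project code):

    from typing import Optional

    def indexed_name(base: str, extension: str, index: Optional[int] = None) -> str:
        suffix = f'_{index}' if index else ''
        return base + suffix + extension

    print(indexed_name('lgilgt', '.png'))     # lgilgt.png
    print(indexed_name('lgilgt', '.png', 2))  # lgilgt_2.png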
From 1046bcdf69f894634e822204153346baa0ac2228 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 16:50:19 +1000
Subject: [PATCH 095/276] Add last few tests for RedditDownloader

---
 bulkredditdownloader/downloader.py            | 36 +++++-------
 bulkredditdownloader/tests/test_downloader.py | 64 +++++++++++++-----
 2 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index bbc6e5c..b6b4044 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -270,24 +270,20 @@ class RedditDownloader:
                 logger.error(f'Could not download submission {submission.name}: {e}')
                 return
 
-            if self.args.no_download:
-                logger.info(f'Skipping download for submission {submission.id}')
-            else:
-                content = downloader.find_resources(self.authenticator)
-                for res in content:
-                    destination = self.file_name_formatter.format_path(res, self.download_directory)
-                    if destination.exists():
-                        logger.debug(f'File already exists: {destination}')
+            content = downloader.find_resources(self.authenticator)
+            for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
+                if destination.exists():
+                    logger.debug(f'File already exists: {destination}')
+                else:
+                    res.download()
+                    if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes:
+                        logger.debug(f'Resource from {res.url} downloaded elsewhere')
                     else:
-                        if res.hash.hexdigest() not in self.master_hash_list and not self.args.no_dupes:
-                            # TODO: consider making a hard link/symlink here
-                            destination.parent.mkdir(parents=True, exist_ok=True)
-                            with open(destination, 'wb') as file:
-                                file.write(res.content)
-                            logger.debug(f'Written file to {destination}')
-                            self.master_hash_list.append(res.hash.hexdigest())
-                            logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
-                        else:
-                            logger.debug(f'Resource from {res.url} downloaded elsewhere')
-
-            logger.info(f'Downloaded submission {submission.name}')
+                        # TODO: consider making a hard link/symlink here
+                        destination.parent.mkdir(parents=True, exist_ok=True)
+                        with open(destination, 'wb') as file:
+                            file.write(res.content)
+                        logger.debug(f'Written file to {destination}')
+                        self.master_hash_list.append(res.hash.hexdigest())
+                        logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
+            logger.info(f'Downloaded submission {submission.name}')

diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index 6531ce7..42233d1 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -2,6 +2,7 @@
 # coding=utf-8
 
 import argparse
+import re
 from pathlib import Path
 from typing import Iterator
 from unittest.mock import MagicMock
@@ -10,6 +11,7 @@ import praw
 import praw.models
 import pytest
 
+from bulkredditdownloader.__main__ import _setup_logging
 from bulkredditdownloader.download_filter import DownloadFilter
 from bulkredditdownloader.downloader import RedditDownloader, RedditTypes
 from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError, RedditUserError
@@ -114,6 +116,8 @@ def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: Magi
 @pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), (
     ('{POSTID}', '{SUBREDDIT}'),
     ('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'),
+    ('{POSTID}', 'test'),
+    ('{POSTID}', ''),
 ))
 def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock):
     downloader_mock.args.set_file_scheme = test_file_scheme
@@ -127,10 +131,8 @@ def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: s
 @pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), (
     ('', ''),
-    ('{POSTID}', ''),
     ('', '{SUBREDDIT}'),
     ('test', '{SUBREDDIT}'),
-    ('{POSTID}', 'test'),
 ))
 def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock):
     downloader_mock.args.set_file_scheme = test_file_scheme
@@ -350,20 +352,60 @@
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_download_submission():
-    raise NotImplementedError
+def test_download_submission(downloader_mock: MagicMock, reddit_instance: praw.Reddit, tmp_path: Path):
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.download_filter.check_url.return_value = True
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.master_hash_list = []
+    submission = downloader_mock.reddit_instance.submission(id='ljyy27')
+    RedditDownloader._download_submission(downloader_mock, submission)
+    folder_contents = list(tmp_path.iterdir())
+    assert len(folder_contents) == 4
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_download_submission_file_exists():
-    raise NotImplementedError
+def test_download_submission_file_exists(
+        downloader_mock: MagicMock,
+        reddit_instance: praw.Reddit,
+        tmp_path: Path,
+        capsys: pytest.CaptureFixture):
+    _setup_logging(3)
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.download_filter.check_url.return_value = True
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.master_hash_list = []
+    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
+    Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch()
+    RedditDownloader._download_submission(downloader_mock, submission)
+    folder_contents = list(tmp_path.iterdir())
+    output = capsys.readouterr()
+    assert len(folder_contents) == 1
+    assert 'File already exists: ' in output.out
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.skip
-def test_download_submission_hash_exists():
-    raise NotImplementedError
+def test_download_submission_hash_exists(
+        downloader_mock: MagicMock,
+        reddit_instance: praw.Reddit,
+        tmp_path: Path,
+        capsys: pytest.CaptureFixture):
+    _setup_logging(3)
+    downloader_mock.reddit_instance = reddit_instance
+    downloader_mock.download_filter.check_url.return_value = True
+    downloader_mock.args.set_folder_scheme = ''
+    downloader_mock.args.no_dupes = True
+    downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock)
+    downloader_mock.download_directory = tmp_path
+    downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7']
+    submission = downloader_mock.reddit_instance.submission(id='m1hqw6')
+    RedditDownloader._download_submission(downloader_mock, submission)
+    folder_contents = list(tmp_path.iterdir())
+    output = capsys.readouterr()
+    assert len(folder_contents) == 0
+    assert re.search(r'Resource from .*? downloaded elsewhere', output.out)
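The reshaped download loop fetches each resource first and only then consults the MD5 master list, which is what lets --no-dupes skip byte-identical files that arrive under different names. The bookkeeping reduced to a standalone sketch:

    import hashlib

    master_hash_list = []

    def seen_before(content: bytes) -> bool:
        digest = hashlib.md5(content).hexdigest()
        if digest in master_hash_list:
            return True          # duplicate: skip the write when no_dupes is set
        master_hash_list.append(digest)
        return False

    print(seen_before(b'image-bytes'))  # False - first sighting
    print(seen_before(b'image-bytes'))  # True - already downloaded elsewhere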
From 48233fad3625166c9c65e2d129fb12cc417a7dd7 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 21:10:20 +1000
Subject: [PATCH 096/276] Update tests with new valid links

---
 .../tests/downloaders/test_redgifs.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bulkredditdownloader/tests/downloaders/test_redgifs.py b/bulkredditdownloader/tests/downloaders/test_redgifs.py
index a3fbef4..4c330d1 100644
--- a/bulkredditdownloader/tests/downloaders/test_redgifs.py
+++ b/bulkredditdownloader/tests/downloaders/test_redgifs.py
@@ -11,10 +11,10 @@ from bulkredditdownloader.site_downloaders.redgifs import Redgifs
 
 @pytest.mark.online
 @pytest.mark.parametrize(('test_url', 'expected'), (
-    ('https://www.redgifs.com/watch/forcefulenchantedanaconda',
-     'https://thumbs2.redgifs.com/ForcefulEnchantedAnaconda.mp4'),
-    ('https://www.redgifs.com/watch/ficklelightirishsetter',
-     'https://thumbs2.redgifs.com/FickleLightIrishsetter.mp4'),
+    ('https://redgifs.com/watch/frighteningvictorioussalamander',
+     'https://thumbs2.redgifs.com/FrighteningVictoriousSalamander.mp4'),
+    ('https://redgifs.com/watch/springgreendecisivetaruca',
+     'https://thumbs2.redgifs.com/SpringgreenDecisiveTaruca.mp4'),
 ))
 def test_get_link(test_url: str, expected: str):
     result = Redgifs._get_link(test_url)
@@ -23,8 +23,8 @@ def test_get_link(test_url: str, expected: str):
 
 @pytest.mark.online
 @pytest.mark.parametrize(('test_url', 'expected_hash'), (
-    ('https://www.redgifs.com/watch/forcefulenchantedanaconda', '75a23fff6ddec5de3b61d53db1f265a4'),
-    ('https://www.redgifs.com/watch/ficklelightirishsetter', 'd0ea030883c9a3a6a2991f5aa61369e7'),
+    ('https://redgifs.com/watch/frighteningvictorioussalamander', '4007c35d9e1f4b67091b5f12cffda00a'),
+    ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'),
 ))
 def test_download_resource(test_url: str, expected_hash: str):
     mock_submission = Mock

From 92386000d87c5241debaa11dd8935be84bab8d35 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 21:47:46 +1000
Subject: [PATCH 097/276] Implement missed test

---
 bulkredditdownloader/tests/test_downloader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index 42233d1..ed26632 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -141,7 +141,6 @@ def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_schem
         RedditDownloader._create_file_name_formatter(downloader_mock)
 
 
-@pytest.mark.skip
 def test_create_authenticator(downloader_mock: MagicMock):
     result = RedditDownloader._create_authenticator(downloader_mock)
     assert isinstance(result, SiteAuthenticator)
From 27a8b497a8b700417ae2d155096ab90f58278f5b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 21:47:57 +1000
Subject: [PATCH 098/276] Abstract configuration into class

---
 bulkredditdownloader/__main__.py              |  6 ++--
 bulkredditdownloader/configuration.py         | 28 +++++++++++++++
 bulkredditdownloader/downloader.py            |  5 ++--
 bulkredditdownloader/tests/test_downloader.py | 25 ++---------
 4 files changed, 37 insertions(+), 27 deletions(-)
 create mode 100644 bulkredditdownloader/configuration.py

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 84780f9..233d710 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -4,6 +4,7 @@
 import argparse
 import logging
 import sys
 
+from bulkredditdownloader.configuration import Configuration
 from bulkredditdownloader.downloader import RedditDownloader
 from bulkredditdownloader.exceptions import BulkDownloaderException
@@ -116,7 +117,7 @@ def _setup_logging(verbosity: int):
     logging.getLogger('urllib3').setLevel(logging.CRITICAL)
 
 
-def main(args: argparse.Namespace):
+def main(args: Configuration):
     _setup_logging(args.verbose)
     try:
         reddit_downloader = RedditDownloader(args)
@@ -127,5 +128,6 @@
 
 if __name__ == '__main__':
     _add_options()
-    args = parser.parse_args()
+    read_configuration = Configuration()
+    args = parser.parse_args(namespace=read_configuration)
     main(args)

diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
new file mode 100644
index 0000000..6b35e36
--- /dev/null
+++ b/bulkredditdownloader/configuration.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from argparse import Namespace
+from typing import Optional
+
+
+class Configuration(Namespace):
+    def __init__(self):
+        super(Configuration, self).__init__()
+        self.directory: str = '.'
+        self.limit: Optional[int] = None
+        self.link: list[str] = []
+        self.multireddit: list[str] = []
+        self.no_dupes: bool = False
+        self.saved: bool = False
+        self.search: Optional[str] = None
+        self.set_file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}'
+        self.set_folder_scheme: str = '{SUBREDDIT}'
+        self.skip: list[str] = []
+        self.skip_domain: list[str] = []
+        self.sort: str = 'hot'
+        self.submitted: bool = False
+        self.subreddit: list[str] = []
+        self.time: str = 'all'
+        self.upvoted: bool = False
+        self.user: Optional[str] = None
+        self.verbose: int = 0

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index b6b4044..da80272 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python3
 # coding=utf-8
 
-import argparse
 import configparser
 import logging
-import re
 import socket
 from datetime import datetime
 from enum import Enum, auto
@@ -17,6 +15,7 @@ import praw.models
 import prawcore
 
 import bulkredditdownloader.exceptions as errors
+from bulkredditdownloader.configuration import Configuration
 from bulkredditdownloader.download_filter import DownloadFilter
 from bulkredditdownloader.file_name_formatter import FileNameFormatter
 from bulkredditdownloader.oauth2 import OAuth2Authenticator, OAuth2TokenManager
@@ -44,7 +43,7 @@ class RedditTypes:
 
 class RedditDownloader:
-    def __init__(self, args: argparse.Namespace):
+    def __init__(self, args: Configuration):
         self.args = args
         self.config_directories = appdirs.AppDirs('bulk_reddit_downloader', 'BDFR')
         self.run_time = datetime.now().isoformat()

diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py
index ed26632..caec1e7 100644
--- a/bulkredditdownloader/tests/test_downloader.py
+++ b/bulkredditdownloader/tests/test_downloader.py
@@ -12,6 +12,7 @@ import praw.models
 import pytest
 
 from bulkredditdownloader.__main__ import _setup_logging
+from bulkredditdownloader.configuration import Configuration
 from bulkredditdownloader.download_filter import DownloadFilter
 from bulkredditdownloader.downloader import RedditDownloader, RedditTypes
 from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError, RedditUserError
@@ -20,28 +21,8 @@ from bulkredditdownloader.site_authenticator import SiteAuthenticator
 
 @pytest.fixture()
-def args() -> argparse.Namespace:
+def args() -> Configuration:
+    args = Configuration()
     return args
From 27a8b497a8b700417ae2d155096ab90f58278f5b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 10 Mar 2021 22:26:38 +1000
Subject: [PATCH 099/276] Add click to requirements

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 94652df..814a67c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 appdirs
 bs4
+click
 ffmpeg-python
-requests
 praw
+requests
 youtube-dl
\ No newline at end of file

From f8347e2d3110916fe2de60bebe13c22d9d76f14b Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 11 Mar 2021 10:43:26 +1000
Subject: [PATCH 100/276] Add function to process click contexts for config

---
 bulkredditdownloader/configuration.py |  7 +++++++
 .../tests/test_configuration.py       | 21 +++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 bulkredditdownloader/tests/test_configuration.py

diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 6b35e36..02d1c46 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -4,6 +4,8 @@
 from argparse import Namespace
 from typing import Optional
 
+import click
+
 
 class Configuration(Namespace):
     def __init__(self):
@@ -26,3 +28,8 @@ class Configuration(Namespace):
         self.upvoted: bool = False
         self.user: Optional[str] = None
         self.verbose: int = 0
+
+    def process_click_arguments(self, context: click.Context):
+        for arg_key in context.params.keys():
+            if arg_key in vars(self) and context.params[arg_key] is not None:
+                vars(self)[arg_key] = context.params[arg_key]

diff --git a/bulkredditdownloader/tests/test_configuration.py b/bulkredditdownloader/tests/test_configuration.py
new file mode 100644
index 0000000..9905150
--- /dev/null
+++ b/bulkredditdownloader/tests/test_configuration.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bulkredditdownloader.configuration import Configuration
+
+
+@pytest.mark.parametrize('arg_dict', (
+    {'directory': 'test_dir'},
+    {'directory': 'test_dir', 'no_dupes': True},
+))
+def test_process_click_context(arg_dict: dict):
+    test_config = Configuration()
+    test_context = MagicMock()
+    test_context.params = arg_dict
+    test_config.process_click_arguments(test_context)
+    test_config = vars(test_config)
+    assert all([test_config[arg] == arg_dict[arg] for arg in arg_dict.keys()])
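process_click_arguments copies across only the click parameters that exist on the Configuration object and were actually supplied, so unset options fall through to the defaults defined in __init__. A small sketch of the merge, mocking the click context the way the new tests do:

    from unittest.mock import MagicMock

    from bulkredditdownloader.configuration import Configuration

    config = Configuration()
    context = MagicMock()
    context.params = {'directory': 'downloads', 'sort': None}  # illustrative
    config.process_click_arguments(context)
    print(config.directory)  # 'downloads' - supplied, so overridden
    print(config.sort)       # 'hot' - None was ignored, default kept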
From dd522c18d4686b2a1767e77eceb79cfb7525c5dc Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 11 Mar 2021 11:20:21 +1000
Subject: [PATCH 101/276] Add missing options to Configuration

---
 bulkredditdownloader/configuration.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py
index 02d1c46..a467cd1 100644
--- a/bulkredditdownloader/configuration.py
+++ b/bulkredditdownloader/configuration.py
@@ -10,6 +10,8 @@ import click
 class Configuration(Namespace):
     def __init__(self):
         super(Configuration, self).__init__()
+        self.authenticate = False
+        self.config = None
         self.directory: str = '.'
         self.limit: Optional[int] = None
default: all',
-                        choices=['all', 'hour', 'day', 'week', 'month', 'year'],
-                        metavar='TIME_LIMIT',
-                        default='all',
-                        type=str)
-    parser.add_argument('--skip',
-                        nargs='+',
-                        help='Skip posts with given type',
-                        type=str,
-                        default=[])
-    parser.add_argument('--skip-domain',
-                        nargs='+',
-                        help='Skip posts with given domain',
-                        type=str,
-                        default=[])
-    parser.add_argument('--set-folder-scheme',
-                        action='store_true',
-                        help='Set custom folderpath',
-                        default='{SUBREDDIT}'
-                        )
-    parser.add_argument('--set-file-scheme',
-                        action='store_true',
-                        help='Set custom filename',
-                        default='{REDDITOR}_{TITLE}_{POSTID}'
-                        )
-    parser.add_argument('--no-dupes',
-                        action='store_true',
-                        help='Do not download duplicate posts on different subreddits',
-                        )
+@click.group()
+def cli():
+    pass
+
+
+@cli.command('download')
+@click.argument('directory', type=str)
+@click.option('-v', '--verbose', default=None, count=True)
+@click.option('-l', '--link', multiple=True, default=None, type=str)
+@click.option('-s', '--subreddit', multiple=True, default=None, type=str)
+@click.option('-m', '--multireddit', multiple=True, default=None, type=str)
+@click.option('-L', '--limit', default=None, type=int)
+@click.option('--authenticate', is_flag=True, default=None)
+@click.option('--submitted', is_flag=True, default=None)
+@click.option('--upvoted', is_flag=True, default=None)
+@click.option('--saved', is_flag=True, default=None)
+@click.option('--search', default=None, type=str)
+@click.option('-u', '--user', type=str, default=None)
+@click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None)
+@click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new',
+                                                 'controversial', 'rising', 'relevance')), default=None)
+@click.option('--skip', default=None, multiple=True)
+@click.option('--skip-domain', default=None, multiple=True)
+@click.option('--set-file-scheme', default=None, type=str)
+@click.option('--set-folder-scheme', default=None, type=str)
+@click.option('--no-dupes', is_flag=True, default=None)
+@click.option('--config', type=str, default=None)
+@click.pass_context
+def cli_download(context: click.Context, **_):
+    config = Configuration()
+    config.process_click_arguments(context)
+    _setup_logging(config.verbose)
+    try:
+        reddit_downloader = RedditDownloader(config)
+        reddit_downloader.download()
+    except BulkDownloaderException as e:
+        logger.critical(f'An error occurred {e}')
 
 
 def _setup_logging(verbosity: int):
@@ -111,23 +61,10 @@ def _setup_logging(verbosity: int):
         stream.setLevel(logging.INFO)
     else:
         stream.setLevel(logging.DEBUG)
-
     logging.getLogger('praw').setLevel(logging.CRITICAL)
     logging.getLogger('prawcore').setLevel(logging.CRITICAL)
     logging.getLogger('urllib3').setLevel(logging.CRITICAL)
 
 
-def main(args: Configuration):
-    _setup_logging(args.verbose)
-    try:
-        reddit_downloader = RedditDownloader(args)
-        reddit_downloader.download()
-    except BulkDownloaderException as e:
-        logger.critical(f'An error occured {e}')
-
-
 if __name__ == '__main__':
-    _add_options()
-    read_configuration = Configuration()
-    args = parser.parse_args(namespace=read_configuration)
-    main(args)
+    cli()

From 3703d2b9b9c2e8243f682e02b1e1cfdc989001f0 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 11 Mar 2021 11:21:50 +1000
Subject: [PATCH 103/276] Load config from arguments first

---
 bulkredditdownloader/downloader.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index da80272..06cf8e3 100644
--- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -124,6 +124,13 @@ class RedditDownloader: def _load_config(self): self.cfg_parser = configparser.ConfigParser() + if self.args.config: + if (cfg_path := Path(self.args.config)).exists(): + self.cfg_parser.read(cfg_path) + self.config_location = cfg_path + return + else: + logger.error(f'Could not find config file at {self.args.config}, attempting to find elsewhere') possible_paths = [Path('./config.cfg'), Path(self.config_directory, 'config.cfg'), Path('./default_config.cfg'), From 945116063f3b5e545621b5ac06881c5ccab13b4b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 11:26:10 +1000 Subject: [PATCH 104/276] Allow for ID or URL to be provided for submissions --- bulkredditdownloader/downloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 06cf8e3..154865b 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -175,7 +175,10 @@ class RedditDownloader: def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] for sub_id in self.args.link: - supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) + if len(sub_id) == 6: + supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) + else: + supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) return [supplied_submissions] def _determine_sort_function(self): From d3c8897f6a6b14718ef3e1eb0e06f6f1cf6e32ab Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 11:38:21 +1000 Subject: [PATCH 105/276] Fix bug where file extension is never specified for text posts --- bulkredditdownloader/site_downloaders/self_post.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index 265a321..4773c65 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -17,7 +17,7 @@ class SelfPost(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - out = Resource(self.post, self.post.url) + out = Resource(self.post, self.post.url, '.txt') out.content = self.export_to_string().encode('utf-8') out.create_hash() return [out] From f7989ca518e71f272aea0c85faa953666403c66e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 12:25:21 +1000 Subject: [PATCH 106/276] Add method to sanitise subreddit inputs --- bulkredditdownloader/downloader.py | 15 +++++++++++-- bulkredditdownloader/tests/test_downloader.py | 22 +++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 154865b..b60ed2b 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -3,6 +3,7 @@ import configparser import logging +import re import socket from datetime import datetime from enum import Enum, auto @@ -153,9 +154,18 @@ class RedditDownloader: main_logger.addHandler(file_handler) + @staticmethod + def _sanitise_subreddit_name(subreddit: str) -> str: + pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)(?:/)?$') + match = re.match(pattern, subreddit) + if not match: + raise errors.RedditAuthenticationError('') + return match.group(1) + def 
_get_subreddits(self) -> list[praw.models.ListingGenerator]: if self.args.subreddit: - subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in self.args.subreddit] + subreddits = [self._sanitise_subreddit_name(subreddit) for subreddit in self.args.subreddit] + subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in subreddits] if self.args.search: return [ reddit.search( @@ -197,10 +207,11 @@ class RedditDownloader: if self.authenticated: if self.args.user: sort_function = self._determine_sort_function() + multireddits = [self._sanitise_subreddit_name(multi) for multi in self.args.multireddit] return [ sort_function(self.reddit_instance.multireddit( self.args.user, - m_reddit_choice), limit=self.args.limit) for m_reddit_choice in self.args.multireddit] + m_reddit_choice), limit=self.args.limit) for m_reddit_choice in multireddits] else: raise errors.BulkDownloaderException('A user must be provided to download a multireddit') else: diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index caec1e7..2c4208f 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -160,13 +160,13 @@ def test_get_subreddit_normal( downloader_mock: MagicMock, reddit_instance: praw.Reddit): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.args.limit = limit downloader_mock.args.subreddit = test_subreddits downloader_mock.reddit_instance = reddit_instance downloader_mock.sort_filter = RedditTypes.SortType.HOT results = RedditDownloader._get_subreddits(downloader_mock) - results = assert_all_results_are_submissions( - (limit * len(test_subreddits)) if limit else None, results) + results = assert_all_results_are_submissions((limit * len(test_subreddits)) if limit else None, results) assert all([res.subreddit.display_name in test_subreddits for res in results]) @@ -184,6 +184,7 @@ def test_get_subreddit_search( downloader_mock: MagicMock, reddit_instance: praw.Reddit): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.args.limit = limit downloader_mock.args.search = search_term downloader_mock.args.subreddit = test_subreddits @@ -209,6 +210,7 @@ def test_get_multireddits_public( reddit_instance: praw.Reddit, downloader_mock: MagicMock): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot + downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.limit = limit downloader_mock.args.multireddit = test_multireddits @@ -389,3 +391,19 @@ def test_download_submission_hash_exists( output = capsys.readouterr() assert len(folder_contents) == 0 assert re.search(r'Resource from .*? 
downloaded elsewhere', output.out) + + +@pytest.mark.parametrize(('test_name', 'expected'), ( + ('Mindustry', 'Mindustry'), + ('Futurology', 'Futurology'), + ('r/Mindustry', 'Mindustry'), + ('TrollXChromosomes', 'TrollXChromosomes'), + ('r/TrollXChromosomes', 'TrollXChromosomes'), + ('https://www.reddit.com/r/TrollXChromosomes/', 'TrollXChromosomes'), + ('https://www.reddit.com/r/TrollXChromosomes', 'TrollXChromosomes'), + ('https://www.reddit.com/r/Futurology/', 'Futurology'), + ('https://www.reddit.com/r/Futurology', 'Futurology'), +)) +def test_sanitise_subreddit_name(test_name: str, expected: str): + result = RedditDownloader._sanitise_subreddit_name(test_name) + assert result == expected From 312769cb667a9890b988e0e184cda8e3070c0366 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 13:14:36 +1000 Subject: [PATCH 107/276] Remove unused file --- bulkredditdownloader/store.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 bulkredditdownloader/store.py diff --git a/bulkredditdownloader/store.py b/bulkredditdownloader/store.py deleted file mode 100644 index 5aba94e..0000000 --- a/bulkredditdownloader/store.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python3 - -from os import path - - -class Store: - def __init__(self, directory: str = None): - self.directory = directory - if self.directory: - if path.exists(directory): - with open(directory, 'r') as f: - self.list = f.read().split("\n") - else: - with open(self.directory, 'a'): - pass - self.list = [] - else: - self.list = [] - - def __call__(self) -> list: - return self.list - - def add(self, data: dict): - self.list.append(data) - if self.directory: - with open(self.directory, 'a') as f: - f.write("{data}\n".format(data=data)) From f941161014209d957be49d47e6e47bee58b26834 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 13:20:39 +1000 Subject: [PATCH 108/276] Add much more logging --- bulkredditdownloader/__main__.py | 6 ++-- bulkredditdownloader/download_filter.py | 5 ++++ bulkredditdownloader/downloader.py | 32 ++++++++++++--------- bulkredditdownloader/file_name_formatter.py | 12 ++++++-- bulkredditdownloader/oauth2.py | 8 +++--- bulkredditdownloader/resource.py | 5 ++++ 6 files changed, 47 insertions(+), 21 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 3625c88..d8d1f08 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -57,10 +57,12 @@ def _setup_logging(verbosity: int): formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') stream.setFormatter(formatter) logger.addHandler(stream) - if verbosity < 0: + if verbosity <= 0: stream.setLevel(logging.INFO) - else: + elif verbosity == 1: stream.setLevel(logging.DEBUG) + else: + stream.setLevel(9) logging.getLogger('praw').setLevel(logging.CRITICAL) logging.getLogger('prawcore').setLevel(logging.CRITICAL) logging.getLogger('urllib3').setLevel(logging.CRITICAL) diff --git a/bulkredditdownloader/download_filter.py b/bulkredditdownloader/download_filter.py index 806fd0d..37a6ce9 100644 --- a/bulkredditdownloader/download_filter.py +++ b/bulkredditdownloader/download_filter.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 # coding=utf-8 +import logging import re +logger = logging.getLogger(__name__) + class DownloadFilter: def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None): @@ -24,6 +27,7 @@ class DownloadFilter: combined_extensions = 
'|'.join(self.excluded_extensions) pattern = re.compile(r'.*({})$'.format(combined_extensions)) if re.match(pattern, url): + logger.log(9, f'Url "{url}" matched with "{str(pattern)}"') return False else: return True @@ -34,6 +38,7 @@ class DownloadFilter: combined_domains = '|'.join(self.excluded_domains) pattern = re.compile(r'https?://.*({}).*'.format(combined_domains)) if re.match(pattern, url): + logger.log(9, f'Url "{url}" matched with "{str(pattern)}"') return False else: return True diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index b60ed2b..5b939b0 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -57,13 +57,13 @@ class RedditDownloader: self._create_file_logger() self.download_filter = self._create_download_filter() - logger.debug('Created download filter') + logger.log(9, 'Created download filter') self.time_filter = self._create_time_filter() - logger.debug('Created time filter') + logger.log(9, 'Created time filter') self.sort_filter = self._create_sort_filter() - logger.debug('Created sort filter') + logger.log(9, 'Created sort filter') self.file_name_formatter = self._create_file_name_formatter() - logger.debug('Create file name formatter') + logger.log(9, 'Create file name formatter') self._resolve_user_name() self._load_config() @@ -71,14 +71,14 @@ class RedditDownloader: self.master_hash_list = [] self.authenticator = self._create_authenticator() - logger.debug('Created site authenticator') + logger.log(9, 'Created site authenticator') self._create_reddit_instance() def _create_reddit_instance(self): if self.args.authenticate: logger.debug('Using authenticated Reddit instance') if not self.cfg_parser.has_option('DEFAULT', 'user_token'): - logger.debug('Commencing OAuth2 authentication') + logger.log(9, 'Commencing OAuth2 authentication') scopes = self.cfg_parser.get('DEFAULT', 'scopes') scopes = OAuth2Authenticator.split_scopes(scopes) oauth2_authenticator = OAuth2Authenticator( @@ -106,13 +106,13 @@ class RedditDownloader: def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: master_list = [] master_list.extend(self._get_subreddits()) - logger.debug('Retrieved subreddits') + logger.log(9, 'Retrieved subreddits') master_list.extend(self._get_multireddits()) - logger.debug('Retrieved multireddits') + logger.log(9, 'Retrieved multireddits') master_list.extend(self._get_user_data()) - logger.debug('Retrieved user data') + logger.log(9, 'Retrieved user data') master_list.extend(self._get_submissions_from_link()) - logger.debug('Retrieved submissions for given links') + logger.log(9, 'Retrieved submissions for given links') return master_list def _determine_directories(self): @@ -140,6 +140,7 @@ class RedditDownloader: for path in possible_paths: if path.resolve().expanduser().exists(): self.config_location = path + logger.debug(f'Loading configuration from {path}') break if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') @@ -181,6 +182,7 @@ class RedditDownloader: def _resolve_user_name(self): if self.args.user == 'me': self.args.user = self.reddit_instance.user.me().name + logger.log(9, f'Resolved user to {self.args.user}') def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] @@ -227,6 +229,7 @@ class RedditDownloader: generators = [] sort_function = self._determine_sort_function() if self.args.submitted: + logger.debug(f'Retrieving submitted posts of user 
{self.args.user}') generators.append( sort_function( self.reddit_instance.redditor(self.args.user).submissions, @@ -235,8 +238,10 @@ class RedditDownloader: raise errors.RedditAuthenticationError('Accessing user lists requires authentication') else: if self.args.upvoted: + logger.debug(f'Retrieving upvoted posts of user {self.args.user}') generators.append(self.reddit_instance.redditor(self.args.user).upvoted(limit=self.args.limit)) if self.args.saved: + logger.debug(f'Retrieving saved posts of user {self.args.user}') generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) return generators else: @@ -277,11 +282,11 @@ class RedditDownloader: def download(self): for generator in self.reddit_lists: for submission in generator: + logger.debug(f'Attempting to download submission {submission.id}') self._download_submission(submission) def _download_submission(self, submission: praw.models.Submission): if self.download_filter.check_url(submission.url): - logger.debug(f'Attempting to download submission {submission.id}') try: downloader_class = DownloadFactory.pull_lever(submission.url) @@ -293,11 +298,12 @@ class RedditDownloader: content = downloader.find_resources(self.authenticator) for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): - logger.debug(f'File already exists: {destination}') + logger.warning(f'File already exists: {destination}') else: res.download() if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: - logger.debug(f'Resource from {res.url} downloaded elsewhere') + logger.warning( + f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') else: # TODO: consider making a hard link/symlink here destination.parent.mkdir(parents=True, exist_ok=True) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 3575b00..5be0213 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # coding=utf-8 +import logging import re from pathlib import Path from typing import Optional @@ -10,6 +11,8 @@ import praw.models from bulkredditdownloader.exceptions import BulkDownloaderException from bulkredditdownloader.resource import Resource +logger = logging.getLogger(__name__) + class FileNameFormatter: key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date') @@ -35,6 +38,7 @@ class FileNameFormatter: for key in submission_attributes.keys(): if re.search(r'(?i).*{{{}}}.*'.format(key), result): result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result) + logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') return result @@ -42,14 +46,18 @@ class FileNameFormatter: def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) index = f'_{str(index)}' if index else '' - file_path = subfolder / (str(self._format_name(resource.source_submission, - self.file_format_string)) + index + resource.extension) + try: + file_path = subfolder / (str(self._format_name(resource.source_submission, + self.file_format_string)) + index + resource.extension) + except TypeError: + raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, 
{resource.extension}')
         return file_path
 
     def format_resource_paths(self, resources: list[Resource],
                               destination_directory: Path) -> list[tuple[Path, Resource]]:
         out = []
         for i, res in enumerate(resources, start=1):
+            logger.log(9, f'Formatting filename with index {i}')
             out.append((self._format_path(res, destination_directory, i), res))
         return out
diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py
index a29d907..9678b45 100644
--- a/bulkredditdownloader/oauth2.py
+++ b/bulkredditdownloader/oauth2.py
@@ -70,12 +70,12 @@ class OAuth2Authenticator:
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         server.bind(('localhost', 7634))
-        logger.debug('Server listening on localhost:7634')
+        logger.log(9, 'Server listening on localhost:7634')
 
         server.listen(1)
         client = server.accept()[0]
         server.close()
-        logger.debug('Server closed')
+        logger.log(9, 'Server closed')
 
         return client
 
@@ -95,7 +95,7 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
         if authorizer.refresh_token is None:
             if self.config.has_option('DEFAULT', 'user_token'):
                 authorizer.refresh_token = self.config.get('DEFAULT', 'user_token')
-                logger.debug('Loaded OAuth2 token for authoriser')
+                logger.log(9, 'Loaded OAuth2 token for authoriser')
             else:
                 raise RedditAuthenticationError('No auth token loaded in configuration')
 
@@ -103,4 +103,4 @@ class OAuth2TokenManager(praw.reddit.BaseTokenManager):
         self.config.set('DEFAULT', 'user_token', authorizer.refresh_token)
         with open(self.config_location, 'w') as file:
             self.config.write(file, True)
-        logger.debug(f'Written OAuth2 token from authoriser to {self.config_location}')
+        logger.log(9, f'Written OAuth2 token from authoriser to {self.config_location}')
diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py
index 30cbd3d..a93cc0c 100644
--- a/bulkredditdownloader/resource.py
+++ b/bulkredditdownloader/resource.py
@@ -2,6 +2,7 @@
 # coding=utf-8
 
 import hashlib
+import logging
 import re
 import time
 from typing import Optional
@@ -12,6 +13,8 @@ from praw.models import Submission
 
 from bulkredditdownloader.exceptions import BulkDownloaderException
 
+logger = logging.getLogger(__name__)
+
 
 class Resource:
     def __init__(self, source_submission: Submission, url: str, extension: str = None):
@@ -32,10 +35,12 @@ class Resource:
             else:
                 raise requests.exceptions.ConnectionError
         except requests.exceptions.ConnectionError:
+            logger.log(9, f'Error occurred downloading resource, waiting {wait_time} seconds')
             time.sleep(wait_time)
             if wait_time < 300:
                 return Resource.retry_download(url, wait_time + 60)
             else:
+                logger.error(f'Max wait time exceeded for resource at url {url}')
                 return None
 
     def download(self):

From 6d6327a39604714375c8e7e86be325797f1719f6 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 11 Mar 2021 14:18:48 +1000
Subject: [PATCH 109/276] Add function to calculate all existing file hashes if wanted

---
 bulkredditdownloader/__main__.py              |  1 +
 bulkredditdownloader/configuration.py         |  1 +
 bulkredditdownloader/downloader.py            | 16 +++++++++++++++-
 bulkredditdownloader/tests/test_downloader.py |  7 +++++++
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index d8d1f08..8a48958 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -39,6 +39,7 @@ def cli_download(context: click.Context, **_):
 @click.option('--set-folder-scheme', default=None, type=str)
 @click.option('--no-dupes', is_flag=True, default=None) 
@click.option('--config', type=str, default=None) +@click.option('--search-existing', is_flag=True, default=None) @click.pass_context def cli_download(context: click.Context, **_): config = Configuration() diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index a467cd1..6633ec2 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -19,6 +19,7 @@ class Configuration(Namespace): self.no_dupes: bool = False self.saved: bool = False self.search: Optional[str] = None + self.search_existing: bool = False self.set_file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' self.set_folder_scheme: str = '{SUBREDDIT}' self.skip: list[str] = [] diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 5b939b0..d8eb54d 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -2,7 +2,9 @@ # coding=utf-8 import configparser +import hashlib import logging +import os import re import socket from datetime import datetime @@ -116,7 +118,7 @@ class RedditDownloader: return master_list def _determine_directories(self): - self.download_directory = Path(self.args.directory) + self.download_directory = Path(self.args.directory).resolve().expanduser() self.logfile_directory = self.download_directory / 'LOG_FILES' self.config_directory = self.config_directories.user_config_dir @@ -313,3 +315,15 @@ class RedditDownloader: self.master_hash_list.append(res.hash.hexdigest()) logger.debug(f'Hash added to master list: {res.hash.hexdigest()}') logger.info(f'Downloaded submission {submission.name}') + + def scan_existing_files(self) -> list[str]: + files = [] + for (dirpath, dirnames, filenames) in os.walk(self.download_directory): + files.extend([Path(dirpath, file) for file in filenames]) + logger.info(f'Calculating hashes for {len(files)} files') + hash_list = [] + for existing_file in files: + with open(existing_file, 'rb') as file: + hash_list.append(hashlib.md5(file.read()).hexdigest()) + logger.log(9, f'Hash calculated for file at {existing_file}') + return hash_list diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 2c4208f..dc7c427 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -407,3 +407,10 @@ def test_download_submission_hash_exists( def test_sanitise_subreddit_name(test_name: str, expected: str): result = RedditDownloader._sanitise_subreddit_name(test_name) assert result == expected + + +def test_search_existing_files(downloader_mock: MagicMock): + downloader_mock.download_directory = Path('.').resolve().expanduser() + results = RedditDownloader.scan_existing_files(downloader_mock) + assert all([isinstance(result, str) for result in results]) + assert len(results) >= 40 From 3e18997652da354e21f67e6af2e38be091a475fc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 15:25:44 +1000 Subject: [PATCH 110/276] Move logfiles into configuration directory on OS --- bulkredditdownloader/downloader.py | 7 +++---- bulkredditdownloader/tests/test_downloader.py | 3 --- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index d8eb54d..31fa1b3 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -119,11 +119,10 @@ class RedditDownloader: def _determine_directories(self): self.download_directory = 
Path(self.args.directory).resolve().expanduser() - self.logfile_directory = self.download_directory / 'LOG_FILES' - self.config_directory = self.config_directories.user_config_dir + self.config_directory = Path(self.config_directories.user_config_dir) self.download_directory.mkdir(exist_ok=True, parents=True) - self.logfile_directory.mkdir(exist_ok=True, parents=True) + self.config_directory.mkdir(exist_ok=True, parents=True) def _load_config(self): self.cfg_parser = configparser.ConfigParser() @@ -150,7 +149,7 @@ class RedditDownloader: def _create_file_logger(self): main_logger = logging.getLogger() - file_handler = logging.FileHandler(self.logfile_directory / 'log_output.txt') + file_handler = logging.FileHandler(Path(self.config_directory, 'log_output.txt'), mode='w') formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') file_handler.setFormatter(formatter) file_handler.setLevel(0) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index dc7c427..9aa32b9 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -44,10 +44,7 @@ def assert_all_results_are_submissions(result_limit: int, results: list[Iterator def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): downloader_mock.args.directory = tmp_path / 'test' RedditDownloader._determine_directories(downloader_mock) - assert Path(tmp_path / 'test').exists() - assert downloader_mock.logfile_directory == Path(tmp_path / 'test' / 'LOG_FILES') - assert downloader_mock.logfile_directory.exists() @pytest.mark.parametrize(('skip_extensions', 'skip_domains'), ( From 6657f0803d97f2482ab34bce62684423ae89c19c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 15:39:52 +1000 Subject: [PATCH 111/276] Fix bug with users in un-authenticated sessions --- bulkredditdownloader/downloader.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 31fa1b3..83447a6 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -67,14 +67,14 @@ class RedditDownloader: self.file_name_formatter = self._create_file_name_formatter() logger.log(9, 'Create file name formatter') - self._resolve_user_name() self._load_config() logger.debug(f'Configuration loaded from {self.config_location}') + self._create_reddit_instance() + self._resolve_user_name() self.master_hash_list = [] self.authenticator = self._create_authenticator() logger.log(9, 'Created site authenticator') - self._create_reddit_instance() def _create_reddit_instance(self): if self.args.authenticate: @@ -182,8 +182,12 @@ class RedditDownloader: def _resolve_user_name(self): if self.args.user == 'me': - self.args.user = self.reddit_instance.user.me().name - logger.log(9, f'Resolved user to {self.args.user}') + if self.authenticated: + self.args.user = self.reddit_instance.user.me().name + logger.log(9, f'Resolved user to {self.args.user}') + else: + self.args.user = None + logger.error('To use "me" as a user, an authenticated Reddit instance must be used') def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] From fc6e5872b283335c5c097cede6b25faaed10a44f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 15:40:46 +1000 Subject: [PATCH 112/276] Add log entry at end of program --- bulkredditdownloader/__main__.py | 2 ++ 1 file changed, 2 
insertions(+)

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 8a48958..78d92e4 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -50,6 +50,8 @@ def cli_download(context: click.Context, **_):
         reddit_downloader.download()
     except BulkDownloaderException as e:
         logger.critical(f'An error occurred {e}')
+    finally:
+        logger.info('Program complete')
 
 
 def _setup_logging(verbosity: int):

From 2b9dc1b96cfd02e04f35e7ed456d8557de78a3fd Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Thu, 11 Mar 2021 17:18:21 +1000
Subject: [PATCH 113/276] Allow subreddits and multireddits to fail individually

---
 bulkredditdownloader/downloader.py            | 56 +++++++++++--------
 bulkredditdownloader/tests/test_downloader.py | 18 ------
 2 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 83447a6..f90dbf8 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -14,6 +14,7 @@ from typing import Iterator
 
 import appdirs
 import praw
+import praw.exceptions
 import praw.models
 import prawcore
 
@@ -166,17 +167,26 @@ class RedditDownloader:
 
     def _get_subreddits(self) -> list[praw.models.ListingGenerator]:
         if self.args.subreddit:
-            subreddits = [self._sanitise_subreddit_name(subreddit) for subreddit in self.args.subreddit]
-            subreddits = [self.reddit_instance.subreddit(chosen_subreddit) for chosen_subreddit in subreddits]
-            if self.args.search:
-                return [
-                    reddit.search(
-                        self.args.search,
-                        sort=self.sort_filter.name.lower(),
-                        limit=self.args.limit) for reddit in subreddits]
-            else:
-                sort_function = self._determine_sort_function()
-                return [sort_function(reddit, limit=self.args.limit) for reddit in subreddits]
+            out = []
+            sort_function = self._determine_sort_function()
+            for reddit in self.args.subreddit:
+                try:
+                    reddit = self._sanitise_subreddit_name(reddit)
+                    reddit = self.reddit_instance.subreddit(reddit)
+                    if self.args.search:
+                        out.append(
+                            reddit.search(
+                                self.args.search,
+                                sort=self.sort_filter.name.lower(),
+                                limit=self.args.limit))
+                        logger.debug(
+                            f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"')
+                    else:
+                        out.append(sort_function(reddit, limit=self.args.limit))
+                        logger.debug(f'Added submissions from subreddit {reddit}')
+                except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e:
+                    logger.error(f'Failed to get submissions for subreddit {reddit}: {e}')
+            return out
         else:
             return []
 
@@ -211,18 +221,18 @@ class RedditDownloader:
 
     def _get_multireddits(self) -> list[Iterator]:
         if self.args.multireddit:
-            if self.authenticated:
-                if self.args.user:
-                    sort_function = self._determine_sort_function()
-                    multireddits = [self._sanitise_subreddit_name(multi) for multi in self.args.multireddit]
-                    return [
-                        sort_function(self.reddit_instance.multireddit(
-                            self.args.user,
-                            m_reddit_choice), limit=self.args.limit) for m_reddit_choice in multireddits]
-                else:
-                    raise errors.BulkDownloaderException('A user must be provided to download a multireddit')
-            else:
-                raise errors.RedditAuthenticationError('Accessing multireddits requires authentication')
+            out = []
+            sort_function = self._determine_sort_function()
+            for multi in self.args.multireddit:
+                try:
+                    multi = self._sanitise_subreddit_name(multi)
+                    out.append(sort_function(
+                        self.reddit_instance.multireddit(self.args.user, multi),
+                        limit=self.args.limit))
+                    logger.debug(f'Added submissions from multireddit {multi}')
+                except 
(errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: + logger.error(f'Failed to get submissions for multireddit {multi}: {e}') + return out else: return [] diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 9aa32b9..292da53 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -217,24 +217,6 @@ def test_get_multireddits_public( assert_all_results_are_submissions((limit * len(test_multireddits)) if limit else None, results) -@pytest.mark.online -@pytest.mark.reddit -def test_get_multireddits_no_user(downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.args.multireddit = ['test'] - with pytest.raises(BulkDownloaderException): - RedditDownloader._get_multireddits(downloader_mock) - - -@pytest.mark.online -@pytest.mark.reddit -def test_get_multireddits_not_authenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.args.multireddit = ['test'] - downloader_mock.authenticated = False - downloader_mock.reddit_instance = reddit_instance - with pytest.raises(RedditAuthenticationError): - RedditDownloader._get_multireddits(downloader_mock) - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_user', 'limit'), ( From af348a05dd7684051a430c9215d9a16e0192ab84 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 17:21:05 +1000 Subject: [PATCH 114/276] Add some integration tests --- .../tests/test_integration.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 bulkredditdownloader/tests/test_integration.py diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py new file mode 100644 index 0000000..f554e8b --- /dev/null +++ b/bulkredditdownloader/tests/test_integration.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from bulkredditdownloader.__main__ import cli + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-s', 'Mindustry', '-L', 1], + ['-s', 'r/Mindustry', '-L', 1], + ['-s', 'r/mindustry', '-L', 1], + ['-s', 'mindustry', '-L', 1], + ['-s', 'https://www.reddit.com/r/TrollXChromosomes/', '-L', 1], + ['-s', 'r/TrollXChromosomes/', '-L', 1], + ['-s', 'TrollXChromosomes/', '-L', 1], + ['-s', 'trollxchromosomes', '-L', 1], +)) +def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from subreddit ' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g'], + ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], +)) +def test_cli_download_links(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 
len(list(tmp_path.iterdir())) == 1 + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], +)) +def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Added submissions from multireddit ' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'me', '--upvoted', '--authenticate', '-L', 10, '--set-folder-scheme', ''], + ['--user', 'me', '--saved', '--authenticate', '-L', 10, '--set-folder-scheme', ''], + ['--user', 'me', '--submitted', '--authenticate', '-L', 10, '--set-folder-scheme', ''], + ['--user', 'djnish', '--submitted', '-L', 10, '--set-folder-scheme', ''], +)) +def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Downloaded submission ' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'me', '-L', 10, '--set-folder-scheme', ''], +)) +def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'To use "me" as a user, an authenticated Reddit instance must be used' in result.output From 285d422c0ee2296652f0ee0f473df8abb322f2c2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 11 Mar 2021 22:18:54 +1000 Subject: [PATCH 115/276] Add some more integration tests --- bulkredditdownloader/downloader.py | 9 ++--- .../tests/test_integration.py | 35 ++++++++++++++++--- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index f90dbf8..2e00c42 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -226,11 +226,12 @@ class RedditDownloader: for multi in self.args.multireddit: try: multi = self._sanitise_subreddit_name(multi) - out.append(sort_function( - self.reddit_instance.multireddit(self.args.user, multi), - limit=self.args.limit)) + multi = self.reddit_instance.multireddit(self.args.user, multi) + if not multi.subreddits: + raise errors.BulkDownloaderException + out.append(sort_function(multi, limit=self.args.limit)) logger.debug(f'Added submissions from multireddit {multi}') - except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: + except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e: logger.error(f'Failed to get submissions for multireddit {multi}: {e}') return out else: diff --git 
a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index f554e8b..beb9be1 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -21,6 +21,13 @@ from bulkredditdownloader.__main__ import cli ['-s', 'r/TrollXChromosomes/', '-L', 1], ['-s', 'TrollXChromosomes/', '-L', 1], ['-s', 'trollxchromosomes', '-L', 1], + ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day'], + ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'], + ['-s', 'trollxchromosomes', '-L', 1, '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new', '--search', 'women'], + ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new', '--search', 'women'], )) def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -50,6 +57,9 @@ def test_cli_download_links(test_args: list[str], tmp_path: Path): @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], + ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--sort', 'rising'], + ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--time', 'week'], + ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--time', 'week', '--sort', 'rising'], )) def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -59,15 +69,32 @@ def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): assert 'Added submissions from multireddit ' in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'helen_darten', '-m', 'xxyyzzqwertty', '-L', 10], +)) +def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Failed to get submissions for multireddit xxyyzzqwerty' in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--user', 'me', '--upvoted', '--authenticate', '-L', 10, '--set-folder-scheme', ''], - ['--user', 'me', '--saved', '--authenticate', '-L', 10, '--set-folder-scheme', ''], - ['--user', 'me', '--submitted', '--authenticate', '-L', 10, '--set-folder-scheme', ''], - ['--user', 'djnish', '--submitted', '-L', 10, '--set-folder-scheme', ''], + ['--user', 'me', '--upvoted', '--authenticate', '-L', 10], + ['--user', 'me', '--saved', '--authenticate', '-L', 10], + ['--user', 'me', '--submitted', '--authenticate', '-L', 10], + ['--user', 'djnish', '--submitted', '-L', 10], + ['--user', 'djnish', '--submitted', '-L', 10, '--time', 'month'], + ['--user', 'djnish', '--submitted', '-L', 10, '--sort', 'controversial'], + ['--user', 'djnish', '--submitted', '-L', 10, '--sort', 'controversial', '--time', 'month'], )) def test_cli_download_user_data_good(test_args: 
list[str], tmp_path: Path): runner = CliRunner() From ae0269e13bfd5b63d4fa45e690680d32e808a940 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 12 Mar 2021 13:24:25 +1000 Subject: [PATCH 116/276] Add option to search for files pre-emptively --- bulkredditdownloader/downloader.py | 58 ++++++++++--------- bulkredditdownloader/tests/test_downloader.py | 5 +- .../tests/test_integration.py | 21 ++++++- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 2e00c42..bf26bbd 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -74,6 +74,8 @@ class RedditDownloader: self._resolve_user_name() self.master_hash_list = [] + if self.args.search_existing: + self.master_hash_list.extend(self.scan_existing_files(self.download_directory)) self.authenticator = self._create_authenticator() logger.log(9, 'Created site authenticator') @@ -302,37 +304,39 @@ class RedditDownloader: self._download_submission(submission) def _download_submission(self, submission: praw.models.Submission): - if self.download_filter.check_url(submission.url): + if not self.download_filter.check_url(submission.url): + logger.debug(f'Download filter remove submission {submission.id} with URL {submission.url}') + return + try: + downloader_class = DownloadFactory.pull_lever(submission.url) + downloader = downloader_class(submission) + except errors.NotADownloadableLinkError as e: + logger.error(f'Could not download submission {submission.name}: {e}') + return - try: - downloader_class = DownloadFactory.pull_lever(submission.url) - downloader = downloader_class(submission) - except errors.NotADownloadableLinkError as e: - logger.error(f'Could not download submission {submission.name}: {e}') - return - - content = downloader.find_resources(self.authenticator) - for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): - if destination.exists(): - logger.warning(f'File already exists: {destination}') + content = downloader.find_resources(self.authenticator) + for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): + if destination.exists(): + logger.warning(f'File already exists: {destination}') + else: + res.download() + if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: + logger.warning( + f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') else: - res.download() - if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: - logger.warning( - f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') - else: - # TODO: consider making a hard link/symlink here - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug(f'Written file to {destination}') - self.master_hash_list.append(res.hash.hexdigest()) - logger.debug(f'Hash added to master list: {res.hash.hexdigest()}') - logger.info(f'Downloaded submission {submission.name}') + # TODO: consider making a hard link/symlink here + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug(f'Written file to {destination}') + self.master_hash_list.append(res.hash.hexdigest()) + logger.debug(f'Hash added to master list: {res.hash.hexdigest()}') + logger.info(f'Downloaded submission {submission.name}') - def 
scan_existing_files(self) -> list[str]: + @staticmethod + def scan_existing_files(directory: Path) -> list[str]: files = [] - for (dirpath, dirnames, filenames) in os.walk(self.download_directory): + for (dirpath, dirnames, filenames) in os.walk(directory): files.extend([Path(dirpath, file) for file in filenames]) logger.info(f'Calculating hashes for {len(files)} files') hash_list = [] diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 292da53..4721ede 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -388,8 +388,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str): assert result == expected -def test_search_existing_files(downloader_mock: MagicMock): - downloader_mock.download_directory = Path('.').resolve().expanduser() - results = RedditDownloader.scan_existing_files(downloader_mock) +def test_search_existing_files(): + results = RedditDownloader.scan_existing_files(Path('.')) assert all([isinstance(result, str) for result in results]) assert len(results) >= 40 diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index beb9be1..17067a1 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -73,14 +73,15 @@ def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): @pytest.mark.reddit @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--user', 'helen_darten', '-m', 'xxyyzzqwertty', '-L', 10], + ['--user', 'helen_darten', '-m', 'xxyyzzqwerty', '-L', 10], )) def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Path): runner = CliRunner() test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert 'Failed to get submissions for multireddit xxyyzzqwerty' in result.output + assert 'Failed to get submissions for multireddit' in result.output + assert 'received 404 HTTP response' in result.output @pytest.mark.online @@ -117,3 +118,19 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'To use "me" as a user, an authenticated Reddit instance must be used' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.authenticated +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'python', '-L', 10, '--search-existing'], +)) +def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): + Path(tmp_path, 'test.txt').touch() + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Calculating hashes for' in result.output From ba6cf420964867bf80dac30fc0fcbcea3527b78a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 12 Mar 2021 13:29:12 +1000 Subject: [PATCH 117/276] Add shortened Youtube links to downloader factory --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- bulkredditdownloader/tests/downloaders/test_download_factory.py | 1 + 
bulkredditdownloader/tests/downloaders/test_youtube.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index db6adca..253e13b 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -38,7 +38,7 @@ class DownloadFactory: return SelfPost elif re.match(url_beginning + r'v\.redd\.it', url): return VReddit - elif re.match(url_beginning + r'youtube', url): + elif re.match(url_beginning + r'youtu\.?be', url): return Youtube elif re.match(url_beginning + r'i\.redd\.it.*', url) or re.match(url_beginning + r'.*\..{3,4}$', url): return Direct diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 393e239..9d6624f 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -35,6 +35,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('lupb4r', Youtube), ('lul6l7', Redgifs), ('luu376', GifDeliveryNetwork), + ('m2l5oo', Youtube), )) def test_factory_lever_good(test_submission_id: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): submission = reddit_instance.submission(id=test_submission_id) diff --git a/bulkredditdownloader/tests/downloaders/test_youtube.py b/bulkredditdownloader/tests/downloaders/test_youtube.py index 144de18..08d30fc 100644 --- a/bulkredditdownloader/tests/downloaders/test_youtube.py +++ b/bulkredditdownloader/tests/downloaders/test_youtube.py @@ -13,6 +13,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.long @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( ('ltnoqp', '468136300a106c67f1463a7011a6db4a'), + ('m2l5oo', 'a70512f7782f13922258297bb12055d9'), )) def test_find_resources(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From 4d91cf7c0f0bcc33b68e1873d3c3f39d702f52d2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 11:13:36 +1000 Subject: [PATCH 118/276] Shorten filenames that are too long --- bulkredditdownloader/file_name_formatter.py | 15 +++++++++-- .../tests/test_file_name_formatter.py | 25 +++++++++++++++++++ .../tests/test_integration.py | 1 + 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 5be0213..ecd71be 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -46,13 +46,24 @@ class FileNameFormatter: def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) index = f'_{str(index)}' if index else '' + if not resource.extension: + raise BulkDownloaderException(f'Resource from {resource.url} has no extension') + ending = index + resource.extension + file_name = str(self._format_name(resource.source_submission, self.file_format_string)) + file_name = self._limit_file_name_length(file_name, ending) try: - file_path = subfolder / (str(self._format_name(resource.source_submission, - self.file_format_string)) + index + resource.extension) + file_path = 
Path(subfolder, file_name)
         except TypeError:
             raise BulkDownloaderException(f'Could not determine path name: {subfolder}, {index}, {resource.extension}')
         return file_path
 
+    @staticmethod
+    def _limit_file_name_length(filename: str, ending: str) -> str:
+        max_length = 255 - len(ending)
+        if len(filename) > max_length:
+            filename = filename[:max_length]
+        return filename + ending
+
     def format_resource_paths(self, resources: list[Resource],
                               destination_directory: Path) -> list[tuple[Path, Resource]]:
         out = []
diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py
index 533d570..2e9d7bc 100644
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -125,3 +125,28 @@ def test_format_multiple_resources():
     results = test_formatter.format_resource_paths(mocks, Path('.'))
     results = set([str(res[0]) for res in results])
     assert results == {'test_1.png', 'test_2.png', 'test_3.png', 'test_4.png'}
+
+
+@pytest.mark.parametrize(('test_filename', 'test_ending'), (
+    ('A' * 300, '.png'),
+    ('A' * 300, '_1.png'),
+    ('a' * 300, '_1000.jpeg'),
+))
+def test_limit_filename_length(test_filename: str, test_ending: str):
+    result = FileNameFormatter._limit_file_name_length(test_filename, test_ending)
+    assert len(result) <= 255
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path):
+    test_submission = MagicMock()
+    test_submission.title = 'A' * 300
+    test_submission.author.name = 'test'
+    test_submission.subreddit.display_name = 'test'
+    test_submission.id = 'BBBBBB'
+    test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg')
+    test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}')
+    result = test_formatter._format_path(test_resource, tmp_path)
+    result.parent.mkdir(parents=True)
+    result.touch()
diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py
index 17067a1..1cc7a63 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -43,6 +43,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
 @pytest.mark.parametrize('test_args', (
     ['-l', 'm2601g'],
     ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'],
+    ['-l', 'm3hxzd'],  # Really long title used to overflow filename limit
 ))
 def test_cli_download_links(test_args: list[str], tmp_path: Path):
     runner = CliRunner()

From 9417e0cc04a84828afff4d6ccbefb0f2a1490138 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 13 Mar 2021 12:01:10 +1000
Subject: [PATCH 119/276] Remove bad encapsulating try-except

---
 bulkredditdownloader/__main__.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py
index 78d92e4..f4a9d32 100644
--- a/bulkredditdownloader/__main__.py
+++ b/bulkredditdownloader/__main__.py
@@ -45,13 +45,9 @@ def cli_download(context: click.Context, **_):
     config = Configuration()
     config.process_click_arguments(context)
     _setup_logging(config.verbose)
-    try:
-        reddit_downloader = RedditDownloader(config)
-        reddit_downloader.download()
-    except BulkDownloaderException as e:
-        logger.critical(f'An error occurred {e}')
-    finally:
-        logger.info('Program complete')
+    reddit_downloader = RedditDownloader(config)
+    reddit_downloader.download()
+    
logger.info('Program complete') def _setup_logging(verbosity: int): From d977595bded6c2a5cc80c022791438365305ada2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 12:01:30 +1000 Subject: [PATCH 120/276] Add error catch for youtube and site downloaders --- bulkredditdownloader/downloader.py | 9 ++++++--- bulkredditdownloader/site_downloaders/youtube.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index bf26bbd..ad4dcbf 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -314,15 +314,18 @@ class RedditDownloader: logger.error(f'Could not download submission {submission.name}: {e}') return - content = downloader.find_resources(self.authenticator) + try: + content = downloader.find_resources(self.authenticator) + except errors.SiteDownloaderError: + logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}') + return for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): logger.warning(f'File already exists: {destination}') else: res.download() if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: - logger.warning( - f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') + logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') else: # TODO: consider making a hard link/symlink here destination.parent.mkdir(parents=True, exist_ok=True) diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py index cd2034b..c1db496 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bulkredditdownloader/site_downloaders/youtube.py @@ -8,6 +8,7 @@ from typing import Optional import youtube_dl from praw.models import Submission +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -33,8 +34,11 @@ class Youtube(BaseDownloader): with tempfile.TemporaryDirectory() as temp_dir: download_path = Path(temp_dir).resolve() ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s' - with youtube_dl.YoutubeDL(ytdl_options) as ydl: - ydl.download([self.post.url]) + try: + with youtube_dl.YoutubeDL(ytdl_options) as ydl: + ydl.download([self.post.url]) + except youtube_dl.DownloadError as e: + raise SiteDownloaderError(f'Youtube download failed: {e}') downloaded_file = list(download_path.iterdir())[0] extension = downloaded_file.suffix From f9809caa4228d798e9b9ecf9ae487e3f3b282b85 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 12:23:59 +1000 Subject: [PATCH 121/276] Fix typo --- bulkredditdownloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index ad4dcbf..91d4426 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -305,7 +305,7 @@ class RedditDownloader: def _download_submission(self, submission: praw.models.Submission): if not self.download_filter.check_url(submission.url): - logger.debug(f'Download filter remove submission {submission.id} with URL {submission.url}') + logger.debug(f'Download filter removed 
submission {submission.id} with URL {submission.url}') return try: downloader_class = DownloadFactory.pull_lever(submission.url) From 36b6aafbc1d1c8cf7991004c3f67064ae7052e36 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 12:39:54 +1000 Subject: [PATCH 122/276] Limit name byte length --- bulkredditdownloader/file_name_formatter.py | 7 ++++--- bulkredditdownloader/tests/test_file_name_formatter.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index ecd71be..8003c6a 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -59,9 +59,10 @@ class FileNameFormatter: @staticmethod def _limit_file_name_length(filename: str, ending: str) -> str: - max_length = 255 - len(ending) - if len(filename) > max_length: - filename = filename[:max_length] + max_length_chars = 255 - len(ending) + max_length_bytes = 255 - len(ending.encode('utf-8')) + while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes: + filename = filename[:-1] return filename + ending def format_resource_paths(self, resources: list[Resource], diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 2e9d7bc..ba8042d 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -131,10 +131,13 @@ def test_format_multiple_resources(): ('A' * 300, '.png'), ('A' * 300, '_1.png'), ('a' * 300, '_1000.jpeg'), + ('😍💕✨' * 100, '_1.png'), )) def test_limit_filename_length(test_filename: str, test_ending: str): result = FileNameFormatter._limit_file_name_length(test_filename, test_ending) assert len(result) <= 255 + assert len(result.encode('utf-8')) <= 255 + assert isinstance(result, str) @pytest.mark.online From 56966ea6b4e5f868bf42070480063ab2f0ec2aa0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 12:58:27 +1000 Subject: [PATCH 123/276] Fix bug where user is deleted --- bulkredditdownloader/file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 8003c6a..ffae54b 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -28,7 +28,7 @@ class FileNameFormatter: submission_attributes = { 'title': submission.title, 'subreddit': submission.subreddit.display_name, - 'redditor': submission.author.name, + 'redditor': submission.author.name if submission.author else 'DELETED', 'postid': submission.id, 'upvotes': submission.score, 'flair': submission.link_flair_text, From b8faffaf1fdac74df444b732babc604ab467a94b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 13:22:22 +1000 Subject: [PATCH 124/276] Add a couple more integration tests --- .../tests/test_integration.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index 1cc7a63..6c19b3b 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -44,6 +44,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): ['-l', 'm2601g'], ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], 
['-l', 'm3hxzd'], # Really long title used to overflow filename limit + ['-l', 'm3kua3'], # Has a deleted user )) def test_cli_download_links(test_args: list[str], tmp_path: Path): runner = CliRunner() @@ -123,7 +124,6 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp @pytest.mark.online @pytest.mark.reddit -@pytest.mark.authenticated @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'python', '-L', 10, '--search-existing'], @@ -135,3 +135,31 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert 'Calculating hashes for' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'], +)) +def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Download filter removed submission' in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.long +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'all', '-L', '100'], +)) +def test_cli_download_long(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 From 1b191e7a0ed7f17ebb4ed8978aff153b9952c58e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 13:36:18 +1000 Subject: [PATCH 125/276] Add logging entry --- bulkredditdownloader/downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 91d4426..1900ed3 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -310,6 +310,7 @@ class RedditDownloader: try: downloader_class = DownloadFactory.pull_lever(submission.url) downloader = downloader_class(submission) + logger.debug(f'Using {downloader_class.__name__} with url {submission.url}') except errors.NotADownloadableLinkError as e: logger.error(f'Could not download submission {submission.name}: {e}') return From 8e3e5a62d9d8456062430b31f67f59f835bac01e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 14:27:23 +1000 Subject: [PATCH 126/276] Start rewrite of README --- README.md | 298 +++++++++++++++++++----------------------------------- 1 file changed, 105 insertions(+), 193 deletions(-) diff --git a/README.md b/README.md index 9d7286a..9b3daac 100644 --- a/README.md +++ b/README.md @@ -1,213 +1,125 @@ -# [Bulk Downloader for Reddit v2-beta](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/v2) is out! -[Serene-Arc](https://github.com/Serene-Arc) has reimplemented the Bulk Downloader for Reddit so that it is more flexible, roboust and is easier to contribute. If you are having issues with master, v2 is worth checking out. 
After cloning the repository, switch to the branch *v2* with `git checkout v2`
+# Bulk Downloader for Reddit
 
-# 📥 Bulk Downloader for Reddit
+This is a tool to download data from Reddit.
 
-Downloads reddit posts. Made by [u/aliparlakci](https://reddit.com/u/aliparlakci)
-
-Please give feedback *(errors, feature requests, etc.)* on the [Issues](https://github.com/aliparlakci/bulk-downloader-for-reddit/issues) page. I will try to resolve them ASAP.
+## Usage
 
-## [Download the latest release here](https://github.com/aliparlakci/bulk-downloader-for-reddit/releases/latest)
+The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
 
-## 🚀 How to use
-If you run **Windows**, after you extract the zip file, double-click on the *bulk-downloader-for-reddit.exe*. The program will guide you through. Also, take a look at the [Setting up the program](#🔨-setting-up-the-program) section. **However**, Bulk Dowloader for Reddit has a plenty of features which can only be activated via command line arguments. See [Options](#⚙-Options) for it.
+Many websites and links are supported:
 
-Unfortunately, there is no binary for **MacOS** or **Linux**. If you are a MacOS or Linux user, you must use the program from the source code. See the [Interpret from source code](docs/INTERPRET_FROM_SOURCE.md) page.
-
-However, binary version for Linux is being worked. So, stay tuned.
-
-OR, regardless of your operating system, you can fire up the program from the **source code**.
+  - Direct Links (links leading to a file)
+  - Erome
+  - Gfycat
+  - Gif Delivery Network
+  - Imgur
+  - Reddit Galleries
+  - Reddit Text Posts
+  - Reddit Videos
+  - Redgifs
+  - Youtube
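Which downloader handles a given link is decided internally by matching the URL against a set of site patterns. The sketch below is a minimal illustration of that routing, assuming an illustrative YouTube URL; `DownloadFactory.pull_lever` is the same call the downloader makes for every scraped submission.

```python
# A minimal sketch with an illustrative URL; pull_lever() returns the
# downloader class whose URL pattern matches, or raises
# NotADownloadableLinkError when no supported site matches.
from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory

downloader_class = DownloadFactory.pull_lever('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
print(downloader_class.__name__)  # Youtube
```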
-### `python3 -m pip install -r requirements.txt`
-
-### `python3 script.py`
-
-See the [Interpret from source code](docs/INTERPRET_FROM_SOURCE.md) page for more information.
+## Options
 
+The following options are common to both the `archive` and `download` commands of the BDFR. An invocation sketch follows the list.
 
+- `directory`
+  - This is the directory to which the BDFR will download and place all files
+- `--authenticate`
+  - This flag will make the BDFR attempt to use an authenticated Reddit session
+  - See [Authentication](#authentication) for more details
+- `--config`
+  - If the path to a configuration file is supplied with this option, the BDFR will use the specified config
+  - See [Configuration Files](#configuration-files) for more details
+- `--saved`
+  - This option will make the BDFR use the supplied user's saved posts list as a download source
+  - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
+- `--search`
+  - This will apply the specified search term to specific lists when scraping submissions
+  - A search term can only be applied to subreddits and multireddits, supplied with the `-s` and `-m` flags respectively
+- `--submitted`
+  - This will use a user's submissions as a source
+  - A user must be specified with `--user`
+- `--upvoted`
+  - This will use a user's upvoted posts as a source of posts to scrape
+  - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
+- `-L, --limit`
+  - This is the limit on the number of submissions retrieved
+  - Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
+  - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.**
+- `-S, --sort`
+  - This is the sort type for each applicable submission source supplied to the BDFR
+  - This option does not apply to upvoted or saved posts when scraping from these sources
+  - The following options are available:
+    - `controversial`
+    - `hot`
+    - `new`
+    - `relevance` (only available when using `--search`)
+    - `rising`
+    - `top`
+- `-l, --link`
+  - This is a direct link to a submission to download, either as a URL or an ID
+  - Can be specified multiple times
+- `-m, --multireddit`
+  - This is the name of a multireddit to add as a source
+  - Can be specified multiple times
+  - The specified multireddits must all belong to the user specified with the `--user` option
+- `-s, --subreddit`
+  - This adds a subreddit as a source
+  - Can be used multiple times
+- `-t, --time`
+  - This is the time filter that will be applied to all applicable sources
+  - This option does not apply to upvoted or saved posts when scraping from these sources
+  - The following options are available:
+    - `all`
+    - `hour`
+    - `day`
+    - `week`
+    - `month`
+    - `year`
+- `-u, --user`
+  - This specifies the user to scrape in concert with other options
+  - When using `--authenticate`, `--user me` can be used to refer to the authenticated user
+- `-v, --verbose`
+  - Increases the verbosity of the program
+  - Can be specified multiple times
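As referenced above, here is a short invocation sketch. It mirrors the repository's own integration tests, which drive the command line through click's test runner; the directory, subreddit, and limit values are illustrative only.

```python
# A minimal sketch modelled on the repository's integration tests; the
# arguments shown here are illustrative only.
from click.testing import CliRunner

from bulkredditdownloader.__main__ import cli

runner = CliRunner()
# Download up to 10 submissions from r/python into ./downloads
result = runner.invoke(cli, ['download', './downloads', '--subreddit', 'python', '-L', '10'])
print(result.exit_code)
```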
 
-## 🔨 Setting up the program
-### 📽 ffmpeg Library
-
-Program needs **ffmpeg software** to add audio to some video files. However, installing it is **voluntary**. Although the program can still run with no errors without the ffmpeg library, some video files might have no sound.
-
-Install it through a package manager such as **Chocolatey** in Windows, **apt** in Linux or **Homebrew** in MacOS:
-
-- **in Windows**: After you **[install Chocolatey](https://chocolatey.org/install)**, type **`choco install ffmpeg`** in either Command Promt or Powershell.
-- **in Linux**: Type **`sudo apt install ffmpeg`** in Terminal.
-- **in MacOS**: After you **[install Homebrew](https://brew.sh/)**, type **`brew install ffmpeg`** in Terminal
-
-OR, [Download ffmpeg](https://www.ffmpeg.org/download.html) manually on your system and [add the bin folder in the downloaded folder's directory to `PATH` of your system.](https://www.architectryan.com/2018/03/17/add-to-the-path-on-windows-10/) However, package manager option is suggested.
+### Downloader Options
 
+The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 
+- `--no-dupes`
+  - This flag will not redownload files if they already exist somewhere in the root folder
+  - This is calculated by MD5 hash
+- `--search-existing`
+  - This will make the BDFR compile the hashes for every file in `directory` and store them to remove duplicates if `--no-dupes` is also supplied
+- `--set-file-scheme`
+  - Sets the scheme for files
+  - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details; a short sketch of scheme expansion follows this list
+- `--set-folder-scheme`
+  - Sets the scheme for folders
+  - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
+- `--skip-domain`
+  - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
+  - Can be specified multiple times
+- `--skip`
+  - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
+  - Can be specified multiple times
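As referenced in the `--set-file-scheme` entry, the sketch below shows how a file scheme expands. The submission values are illustrative; the real formatter also sanitises the result and trims it to the filesystem's 255-byte name limit.

```python
# A minimal sketch with illustrative values; FileNameFormatter additionally
# sanitises the name and enforces the 255-byte filename limit.
scheme = '{REDDITOR}_{TITLE}_{POSTID}'  # the default file scheme
fields = {'REDDITOR': 'example_user', 'TITLE': 'An Example Title', 'POSTID': 'abc123'}
print(scheme.format(**fields))  # example_user_An Example Title_abc123
```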
 
-## 🐋 Docker
-There is also a complete ready to go Docker integration. Install **Docker** and **docker-compose**. Then run the following command from the repository root:
-### `docker-compose run --service-ports bdfr`
-And you'll find youself right in the app. The files will be downloaded to `downloads/`. Since it is docker, you may want to change the ownership of the files once you're done (belongs to root by default).
+## Authentication
 
-_Credits to [wAuner](https://github.com/wAuner)_
+The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
 
-## ⚙ Options
+To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on.
 
-Some of the below features are available only through command-line.
-
-Open the [Command Promt](https://youtu.be/bgSSJQolR0E?t=18), [Powershell](https://youtu.be/bgSSJQolR0E?t=18) or [Terminal](https://youtu.be/Pz4yHAB3G8w?t=31) in the folder that contains bulk-downloader-for-reddit file (click on the links to see how)
+## Changing Permissions
 
-After you type **`bulk-downloader-for-reddit.exe`**, type the preffered options.
+Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc. are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it is allowed by the BDFR.
 
-Example: **`bulk-downloader-for-reddit.exe --subreddit pics --sort top --limit 10`**
+The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need.
 
-## **`--subreddit`**
-Downloads posts from given subreddit(s). Takes number of subreddit names as a paramater.
-
-Example usage: **`--subreddit IAmA pics --sort hot --limit 10`**
+For more details on the configuration file and the values therein, see [Configuration Files](#configuration-files).
 
-## **`--multireddit`**
-Downloads posts from given subreddit. Takes a single multireddit name as a parameter. **`--user`** option is required.
-
-Example usage: **`--multireddit myMulti --user me --sort top --time week`**
 
-## **`--search`**
-Searches for given query in given subreddit(s) or multireddit. Takes a search query as a parameter. **`--subreddit`** or **`--multireddit`** option is required. **`--sort`** option is required.
+## Folder and File Name Schemes
 
-Example usage: **`--search carter --subreddit funny`**
-
-## **`--submitted`**
-Downloads given redditor's submitted posts. Does not take any parameter. **`--user`** option is required.
-
-Example usage: **`--submitted --user spɛz --sort top --time week`**
-
-## **`--upvoted`**
-Downloads given redditor's upvoted posts. Does not take any parameter. **`--user`** option is required.
-
-Example usage: **`--upvoted --user spɛz`**
-
-## **`--saved`**
-Downloads logged in redditor's saved posts. Does not take any parameter. Example usage: **`--saved`**
-
-## **`--link`**
-Takes a reddit link as a parameter and downloads the posts in the link. Put the link in " " (double quotes).
-
-Example usage: **`--link "https://www.reddit.com/r/funny/comments/25blmh/"`**
-
-## **`--log`**
-Program saves the found posts into POSTS.json file and the failed posts to FAILED.json file in LOG_FILES folder. You can use those files to redownload the posts inside them.
-
-Uses a .json file to redownload posts from. Takes single directory to a .json file as a parameter.
-
-Example usage: **`--log D:\pics\LOG_FILES\FAILED.json`**
-
----
-
-## **`--user`**
-Takes a reddit username as a parameter. Example usage: **`--user spɛz`**
-
-## **`--sort`**
-Takes a valid sorting type as a parameter. Valid sort types are `hot`, `top`, `new`, `rising`, `controversial` and `relevance` (if you are using `--search` option)
-
-Example usage: **`--sort top`**
-
-## **`--time`**
-Takes a valid time as a parameter. Valid times are `hour`, `day`, `week`, `month`, `year` and `all`. Example usage: **`--time all`**
-
-## **`--limit`**
-Takes a number to specify how many should program get. Upper bound is 1000 posts for **each** subreddit. For example, if you are downloading posts from pics and IAmA, the upper bound is 2000. Do not use the option to set it to highest bound possible.
-
-Example usage: **`--limit 500`**
-
----
-
-## **`--skip`**
-Takes a number of file types as a parameter to skip the posts from those domains. Valid file types are `images`, `videos`, `gifs`, `self`
-
-Example usage: **`--skip self videos`**
-
-## **`--skip-domain`**
-Takes a number of domains as a parameter to skip the posts from those domains.
-
-Example usage: **`--skip v.redd.it youtube.com youtu.be`**
-
-## **`--quit`**
-Automatically quits the application after it finishes. Otherwise, it will wait for an input to quit.
-
-Example usage: **`--quit`**
-
-## **`--directory`**
-Takes a directory which the posts should be downloaded to. Overrides the given default directory. Use `..\` to imply upper level and `.\` to imply the current level.
-
-Example usage: **`--directory D:\bdfr\`**
-Example usage: **`--directory ..\images\`**
-Example usage: **`-d ..\images\`**
-Example usage: **`-d .\`**
-
-## **`--set-filename`**
-Starts the program to set a filename template to use for downloading posts. **Does not take any parameter.**
-
-When the programs starts, you will be prompted to type a filename template.
Use `SUBREDDIT`, `REDDITOR`, `POSTID`, `TITLE`, `UPVOTES`, `FLAIR`, `DATE` in curly brakets `{ }` to refer to the corrosponding property of a post. - -❗ Do NOT change the filename structure frequently. If you did, the program could not find duplicates and would download the already downloaded files again. This would not create any duplicates in the directory but the program would not be as snappy as it should be. - -The default filename template is **`{REDDITOR}_{TITLE}_{POSTID}`** - -Example usage: **`--set-filename`** - -## **`--set-folderpath`** -Starts the program to set a folder structure to use for downloading posts. **Does not take any parameter.** - -When the programs starts, you will be prompted to type a filename template. Use `SUBREDDIT`, `REDDITOR`, `POSTID`, `TITLE`, `UPVOTES`, `FLAIR`, `DATE` in curly brakets `{ }` to refer to the corrosponding property of a post. Do not put slashes `/` or backslashes `\` at either ends. For instance, **`{REDDITOR}/{SUBREDDIT}/{FLAIR}`** - -The default filename template is **`{SUBREDDIT}`** - -Example usage: **`--set-folderpath`** - -## **`--set-default-directory`** -Starts the program to set a default directory to use in case no directory is given. **Does not take any parameter.** - -When the programs starts, you will be prompted to type a default directory. You can use {time} in foler names to use to timestamp it. For instance, **`D:\bdfr\posts_{time}`** - -Example usage: **`--set-default-directory`** - -## **`--use-local-config`** -Sets the program to use config.json file in the current directory. Creates it if it does not exists. Useful for having different configurations. **Does not take any parameter.** - -Example usage: **`--use-local-config`** - -## **`--no-dupes`** -Skips the same posts in different subreddits. Does not take any parameter. - -Example usage: **`--no-dupes`** - -## **`--no-download`** -Quits the program without downloading the posts. Does not take any parameter - -Example usage: **`--no-download`** - -## **`--downloaded-posts`** -Takes a file directory as a parameter and skips the posts if it matches with the post IDs inside the file. It also saves the newly downloaded posts to the given file. - -Example usage: **`--downloaded-posts D:\bdfr\ALL_POSTS.txt`** - -## **`--downloaded-delay`** -When specified, it delays every download for given seconds. - -## ❔ FAQ - -### I am running the script on a headless machine or on a remote server. How can I authenticate my reddit account? -- Download the script on your everday computer and run it for once. -- Authenticate the program on both reddit and imgur. -- Go to your Home folder (for Windows users it is `C:\Users\[USERNAME]\`, for Linux users it is `/home/[USERNAME]`) -- Copy the *config.json* file inside the Bulk Downloader for Reddit folder and paste it **next to** the file that you run the program. - -### How can I change my credentials? -- All of the user data is held in **config.json** file which is in a folder named "Bulk Downloader for Reddit" in your **Home** directory. You can edit them, there. - - Also if you already have a config.json file, you can paste it **next to** the script and override the one on your Home directory. - -### What do the dots resemble when getting posts? -- Each dot means that 100 posts are scanned. - -### Getting posts takes too long. -- You can press *Ctrl+C* to interrupt it and start downloading. - -### How do I open self post files? -- Self posts are held at reddit as styled with markdown. 
So, the script downloads them as they are in order not to lose their stylings. - However, there is a [great Chrome extension](https://chrome.google.com/webstore/detail/markdown-viewer/ckkdlimhmcjmikdlpkmbgfkaikojcbjk) for viewing Markdown files with its styling. Install it and open the files with [Chrome](https://www.google.com/intl/tr/chrome/). - - However, they are basically text files. You can also view them with any text editor such as Notepad on Windows, gedit on Linux or Text Editor on MacOS. +## Configuration Files From a93813ca45ca11fc3a4ff2ca4e97b9fa821e0121 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 15:11:59 +1000 Subject: [PATCH 127/276] Fix some wrong pytest marks --- bulkredditdownloader/tests/downloaders/test_youtube.py | 2 +- bulkredditdownloader/tests/test_integration.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_youtube.py b/bulkredditdownloader/tests/downloaders/test_youtube.py index 08d30fc..0b4e982 100644 --- a/bulkredditdownloader/tests/downloaders/test_youtube.py +++ b/bulkredditdownloader/tests/downloaders/test_youtube.py @@ -10,7 +10,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.reddit -@pytest.mark.long +@pytest.mark.slow @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( ('ltnoqp', '468136300a106c67f1463a7011a6db4a'), ('m2l5oo', 'a70512f7782f13922258297bb12055d9'), diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index 6c19b3b..ca62766 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -153,10 +153,10 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.long +@pytest.mark.slow @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--subreddit', 'all', '-L', '100'], + ['--subreddit', 'all', '-L', '100', '--sort', 'new'], )) def test_cli_download_long(test_args: list[str], tmp_path: Path): runner = CliRunner() From 959b55a939e04b857f51aeb14571efa887d786cd Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 13 Mar 2021 20:18:30 +1000 Subject: [PATCH 128/276] Add beginning of archiver --- bulkredditdownloader/archive_entry.py | 67 +++++++++++++++++++ bulkredditdownloader/archiver.py | 51 ++++++++++++++ bulkredditdownloader/configuration.py | 3 + bulkredditdownloader/exceptions.py | 4 ++ bulkredditdownloader/file_name_formatter.py | 4 +- .../tests/test_archive_entry.py | 32 +++++++++ bulkredditdownloader/tests/test_archiver.py | 36 ++++++++++ .../tests/test_file_name_formatter.py | 6 +- 8 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 bulkredditdownloader/archive_entry.py create mode 100644 bulkredditdownloader/archiver.py create mode 100644 bulkredditdownloader/tests/test_archive_entry.py create mode 100644 bulkredditdownloader/tests/test_archiver.py diff --git a/bulkredditdownloader/archive_entry.py b/bulkredditdownloader/archive_entry.py new file mode 100644 index 0000000..a223c66 --- /dev/null +++ b/bulkredditdownloader/archive_entry.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import logging + +import praw.models + +logger = logging.getLogger(__name__) + + +class ArchiveEntry: + def __init__(self, submission: praw.models.Submission): + self.submission 
= submission + self.comments: list[dict] = [] + self.post_details: dict = {} + + def compile(self) -> dict: + self._fill_entry() + out = self.post_details + out['comments'] = self.comments + return out + + def _fill_entry(self): + self._get_comments() + self._get_post_details() + + def _get_post_details(self): + self.post_details = { + 'title': self.submission.title, + 'name': self.submission.name, + 'url': self.submission.url, + 'selftext': self.submission.selftext, + 'score': self.submission.score, + 'upvote_ratio': self.submission.upvote_ratio, + 'permalink': self.submission.permalink, + 'id': self.submission.id, + 'author': self.submission.author.name if self.submission.author else 'DELETED', + 'link_flair_text': self.submission.link_flair_text, + 'num_comments': self.submission.num_comments, + 'over_18': self.submission.over_18, + } + + def _get_comments(self): + logger.debug(f'Retrieving full comment tree for submission {self.submission.id}') + self.submission.comments.replace_more(0) + for top_level_comment in self.submission.comments: + self.comments.append(self._convert_comment_to_dict(top_level_comment)) + + @staticmethod + def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict: + out_dict = { + 'author': in_comment.author.name if in_comment.author else 'DELETED', + 'id': in_comment.id, + 'score': in_comment.score, + 'subreddit': in_comment.subreddit.display_name, + 'submission': in_comment.submission.id, + 'stickied': in_comment.stickied, + 'body': in_comment.body, + 'is_submitter': in_comment.is_submitter, + 'created_utc': in_comment.created_utc, + 'parent_id': in_comment.parent_id, + 'replies': [], + } + in_comment.replies.replace_more(0) + for reply in in_comment.replies: + out_dict['replies'].append(ArchiveEntry._convert_comment_to_dict(reply)) + return out_dict diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py new file mode 100644 index 0000000..a29aaee --- /dev/null +++ b/bulkredditdownloader/archiver.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import json +import logging + +import praw.models + +from bulkredditdownloader.archive_entry import ArchiveEntry +from bulkredditdownloader.configuration import Configuration +from bulkredditdownloader.downloader import RedditDownloader +from bulkredditdownloader.exceptions import ArchiverError +from bulkredditdownloader.resource import Resource + +logger = logging.getLogger(__name__) + + +class Archiver(RedditDownloader): + def __init__(self, args: Configuration): + super(Archiver, self).__init__(args) + + def download(self): + for generator in self.reddit_lists: + for submission in generator: + logger.debug(f'Attempting to archive submission {submission.id}') + self._write_submission(submission) + + def _write_submission(self, submission: praw.models.Submission): + archive_entry = ArchiveEntry(submission) + if self.args.format == 'json': + self._write_submission_json(archive_entry) + elif self.args.format == 'xml': + self._write_submission_xml(archive_entry) + elif self.args.format == 'yaml': + self._write_submission_yaml(archive_entry) + else: + raise ArchiverError(f'Unknown format {self.args.format} given') + logger.info(f'Record for submission {submission.id} written to disk') + + def _write_submission_json(self, entry: ArchiveEntry): + resource = Resource(entry.submission, '', '.json') + file_path = self.file_name_formatter.format_path(resource, self.download_directory) + with open(file_path, 'w') as file: + logger.debug(f'Writing submission {entry.submission.id} 
to file in JSON format at {file_path}') + json.dump(entry.compile(), file) + + def _write_submission_xml(self, entry: ArchiveEntry): + raise NotImplementedError + + def _write_submission_yaml(self, entry: ArchiveEntry): + raise NotImplementedError diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index 6633ec2..09d1b8a 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -32,6 +32,9 @@ class Configuration(Namespace): self.user: Optional[str] = None self.verbose: int = 0 + # Archiver-specific options + self.format = 'json' + def process_click_arguments(self, context: click.Context): for arg_key in context.params.keys(): if arg_key in vars(self) and context.params[arg_key] is not None: diff --git a/bulkredditdownloader/exceptions.py b/bulkredditdownloader/exceptions.py index 703ffaa..91fda2c 100644 --- a/bulkredditdownloader/exceptions.py +++ b/bulkredditdownloader/exceptions.py @@ -12,6 +12,10 @@ class RedditAuthenticationError(RedditUserError): pass +class ArchiverError(BulkDownloaderException): + pass + + class SiteDownloaderError(BulkDownloaderException): pass diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index ffae54b..1950306 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -43,7 +43,7 @@ class FileNameFormatter: result = result.replace('/', '') return result - def _format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: + def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) index = f'_{str(index)}' if index else '' if not resource.extension: @@ -70,7 +70,7 @@ class FileNameFormatter: out = [] for i, res in enumerate(resources, start=1): logger.log(9, f'Formatting filename with index {i}') - out.append((self._format_path(res, destination_directory, i), res)) + out.append((self.format_path(res, destination_directory, i), res)) return out @ staticmethod diff --git a/bulkredditdownloader/tests/test_archive_entry.py b/bulkredditdownloader/tests/test_archive_entry.py new file mode 100644 index 0000000..dba5732 --- /dev/null +++ b/bulkredditdownloader/tests/test_archive_entry.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + +from bulkredditdownloader.archive_entry import ArchiveEntry + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'min_comments'), ( + ('m3reby', 27), +)) +def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + test_archive_entry = ArchiveEntry(test_submission) + test_archive_entry._get_comments() + assert len(test_archive_entry.comments) >= min_comments + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected_dict'), ( + ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}), + ('m3kua3', {'author': 'DELETED'}), +)) +def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + test_archive_entry = ArchiveEntry(test_submission) + test_archive_entry._get_post_details() + assert 
all([test_archive_entry.post_details[key] == expected_dict[key] for key in expected_dict.keys()]) diff --git a/bulkredditdownloader/tests/test_archiver.py b/bulkredditdownloader/tests/test_archiver.py new file mode 100644 index 0000000..7c497ff --- /dev/null +++ b/bulkredditdownloader/tests/test_archiver.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from pathlib import Path +from unittest.mock import MagicMock + +import praw +import pytest + +from bulkredditdownloader.archive_entry import ArchiveEntry +from bulkredditdownloader.archiver import Archiver + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_submission_id', ( + 'm3reby', +)) +def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): + archiver_mock = MagicMock() + test_path = Path(tmp_path, 'test.json') + test_submission = reddit_instance.submission(id=test_submission_id) + archiver_mock.file_name_formatter.format_path.return_value = test_path + test_entry = ArchiveEntry(test_submission) + Archiver._write_submission_json(archiver_mock, test_entry) + assert test_path.exists() + + +@pytest.mark.skip +def test_write_submission_xml(): + raise NotImplementedError + + +@pytest.mark.skip +def test_write_submission_yaml(): + raise NotImplementedError diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index ba8042d..3b79904 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -88,7 +88,7 @@ def test_format_full( reddit_submission: praw.models.Submission): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory) - result = test_formatter._format_path(test_resource, Path('test')) + result = test_formatter.format_path(test_resource, Path('test')) assert str(result) == expected @@ -109,7 +109,7 @@ def test_format_full_with_index_suffix( reddit_submission: praw.models.Submission): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory) - result = test_formatter._format_path(test_resource, Path('test'), index) + result = test_formatter.format_path(test_resource, Path('test'), index) assert str(result) == expected @@ -150,6 +150,6 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path): test_submission.id = 'BBBBBB' test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg') test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}') - result = test_formatter._format_path(test_resource, tmp_path) + result = test_formatter.format_path(test_resource, tmp_path) result.parent.mkdir(parents=True) result.touch() From 1bf1db707cb44535fa2b733c93c1adc467bdf34f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 09:00:00 +1000 Subject: [PATCH 129/276] Add XML and YAML to archiver --- bulkredditdownloader/archiver.py | 15 ++++++++-- bulkredditdownloader/tests/test_archiver.py | 32 +++++++++++++++++---- requirements.txt | 2 ++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py index a29aaee..d7ad086 100644 --- a/bulkredditdownloader/archiver.py +++ b/bulkredditdownloader/archiver.py @@ -4,7 +4,9 @@ import json import logging +import dict2xml import praw.models +import yaml from 
bulkredditdownloader.archive_entry import ArchiveEntry from bulkredditdownloader.configuration import Configuration @@ -45,7 +47,16 @@ class Archiver(RedditDownloader): json.dump(entry.compile(), file) def _write_submission_xml(self, entry: ArchiveEntry): - raise NotImplementedError + resource = Resource(entry.submission, '', '.xml') + file_path = self.file_name_formatter.format_path(resource, self.download_directory) + with open(file_path, 'w') as file: + logger.debug(f'Writing submission {entry.submission.id} to file in XML format at {file_path}') + xml_entry = dict2xml.dict2xml(entry.compile(), wrap='root') + file.write(xml_entry) def _write_submission_yaml(self, entry: ArchiveEntry): - raise NotImplementedError + resource = Resource(entry.submission, '', '.yaml') + file_path = self.file_name_formatter.format_path(resource, self.download_directory) + with open(file_path, 'w') as file: + logger.debug(f'Writing submission {entry.submission.id} to file in YAML format at {file_path}') + yaml.dump(entry.compile(), file) diff --git a/bulkredditdownloader/tests/test_archiver.py b/bulkredditdownloader/tests/test_archiver.py index 7c497ff..a2da7c5 100644 --- a/bulkredditdownloader/tests/test_archiver.py +++ b/bulkredditdownloader/tests/test_archiver.py @@ -26,11 +26,31 @@ def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_i assert test_path.exists() -@pytest.mark.skip -def test_write_submission_xml(): - raise NotImplementedError +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_submission_id', ( + 'm3reby', +)) +def test_write_submission_xml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): + archiver_mock = MagicMock() + test_path = Path(tmp_path, 'test.xml') + test_submission = reddit_instance.submission(id=test_submission_id) + archiver_mock.file_name_formatter.format_path.return_value = test_path + test_entry = ArchiveEntry(test_submission) + Archiver._write_submission_xml(archiver_mock, test_entry) + assert test_path.exists() -@pytest.mark.skip -def test_write_submission_yaml(): - raise NotImplementedError +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize('test_submission_id', ( + 'm3reby', +)) +def test_write_submission_yaml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): + archiver_mock = MagicMock() + test_path = Path(tmp_path, 'test.yaml') + test_submission = reddit_instance.submission(id=test_submission_id) + archiver_mock.file_name_formatter.format_path.return_value = test_path + test_entry = ArchiveEntry(test_submission) + Archiver._write_submission_yaml(archiver_mock, test_entry) + assert test_path.exists() diff --git a/requirements.txt b/requirements.txt index 814a67c..291cf96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ appdirs bs4 click +dict2xml ffmpeg-python praw +pyyaml requests youtube-dl \ No newline at end of file From c2d3cfd50f0865bccdd4c4b104fcd22d61405d63 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:10:26 +1000 Subject: [PATCH 130/276] Add more tests for file name formatter --- .../tests/test_file_name_formatter.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 3b79904..b376e9d 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -92,6 +92,22 @@ def test_format_full( assert str(result) == 
expected +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('format_string_directory', 'format_string_file'), ( + ('{SUBREDDIT}', '{POSTID}'), + ('{SUBREDDIT}', '{UPVOTES}'), + ('{SUBREDDIT}', '{UPVOTES}{POSTID}'), +)) +def test_format_full_conform( + format_string_directory: str, + format_string_file: str, + reddit_submission: praw.models.Submission): + test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') + test_formatter = FileNameFormatter(format_string_file, format_string_directory) + test_formatter.format_path(test_resource, Path('test')) + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'index', 'expected'), From b08c31a1dba9bb6568d7289a9f69844301b4ed33 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:11:37 +1000 Subject: [PATCH 131/276] Add integration tests for archiver --- bulkredditdownloader/__main__.py | 65 +++++++++++++------ bulkredditdownloader/archiver.py | 3 + .../tests/test_integration.py | 36 ++++++++++ 3 files changed, 83 insertions(+), 21 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index f4a9d32..ce60885 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -5,12 +5,37 @@ import sys import click +from bulkredditdownloader.archiver import Archiver from bulkredditdownloader.configuration import Configuration from bulkredditdownloader.downloader import RedditDownloader -from bulkredditdownloader.exceptions import BulkDownloaderException logger = logging.getLogger() +_common_options = [ + click.argument('directory', type=str), + click.option('--config', type=str, default=None), + click.option('-v', '--verbose', default=None, count=True), + click.option('-l', '--link', multiple=True, default=None, type=str), + click.option('-s', '--subreddit', multiple=True, default=None, type=str), + click.option('-m', '--multireddit', multiple=True, default=None, type=str), + click.option('-L', '--limit', default=None, type=int), + click.option('--authenticate', is_flag=True, default=None), + click.option('--submitted', is_flag=True, default=None), + click.option('--upvoted', is_flag=True, default=None), + click.option('--saved', is_flag=True, default=None), + click.option('--search', default=None, type=str), + click.option('-u', '--user', type=str, default=None), + click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None), + click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', + 'controversial', 'rising', 'relevance')), default=None), +] + + +def _add_common_options(func): + for opt in _common_options: + func = opt(func) + return func + @click.group() def cli(): @@ -18,28 +43,13 @@ def cli(): @cli.command('download') -@click.argument('directory', type=str) -@click.option('-v', '--verbose', default=None, count=True) -@click.option('-l', '--link', multiple=True, default=None, type=str) -@click.option('-s', '--subreddit', multiple=True, default=None, type=str) -@click.option('-m', '--multireddit', multiple=True, default=None, type=str) -@click.option('-L', '--limit', default=None, type=int) -@click.option('--authenticate', is_flag=True, default=None) -@click.option('--submitted', is_flag=True, default=None) -@click.option('--upvoted', is_flag=True, default=None) -@click.option('--saved', is_flag=True, default=None) -@click.option('--search', default=None, type=str) -@click.option('-u', '--user', type=str, default=None) 
-@click.option('-t', '--time', type=click.Choice(('all', 'hour', 'day', 'week', 'month', 'year')), default=None) -@click.option('-S', '--sort', type=click.Choice(('hot', 'top', 'new', - 'controversial', 'rising', 'relevance')), default=None) -@click.option('--skip', default=None, multiple=True) -@click.option('--skip-domain', default=None, multiple=True) +@click.option('--no-dupes', is_flag=True, default=None) +@click.option('--search-existing', is_flag=True, default=None) @click.option('--set-file-scheme', default=None, type=str) @click.option('--set-folder-scheme', default=None, type=str) -@click.option('--no-dupes', is_flag=True, default=None) -@click.option('--config', type=str, default=None) -@click.option('--search-existing', is_flag=True, default=None) +@click.option('--skip', default=None, multiple=True) +@click.option('--skip-domain', default=None, multiple=True) +@_add_common_options @click.pass_context def cli_download(context: click.Context, **_): config = Configuration() @@ -50,6 +60,19 @@ def cli_download(context: click.Context, **_): logger.info('Program complete') +@cli.command('archive') +@_add_common_options +@click.option('-f,', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None) +@click.pass_context +def cli_archive(context: click.Context, **_): + config = Configuration() + config.process_click_arguments(context) + _setup_logging(config.verbose) + reddit_archiver = Archiver(config) + reddit_archiver.download() + logger.info('Program complete') + + def _setup_logging(verbosity: int): logger.setLevel(1) stream = logging.StreamHandler(sys.stdout) diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py index d7ad086..0d0df66 100644 --- a/bulkredditdownloader/archiver.py +++ b/bulkredditdownloader/archiver.py @@ -42,6 +42,7 @@ class Archiver(RedditDownloader): def _write_submission_json(self, entry: ArchiveEntry): resource = Resource(entry.submission, '', '.json') file_path = self.file_name_formatter.format_path(resource, self.download_directory) + file_path.parent.mkdir(exist_ok=True, parents=True) with open(file_path, 'w') as file: logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}') json.dump(entry.compile(), file) @@ -49,6 +50,7 @@ class Archiver(RedditDownloader): def _write_submission_xml(self, entry: ArchiveEntry): resource = Resource(entry.submission, '', '.xml') file_path = self.file_name_formatter.format_path(resource, self.download_directory) + file_path.parent.mkdir(exist_ok=True, parents=True) with open(file_path, 'w') as file: logger.debug(f'Writing submission {entry.submission.id} to file in XML format at {file_path}') xml_entry = dict2xml.dict2xml(entry.compile(), wrap='root') @@ -57,6 +59,7 @@ class Archiver(RedditDownloader): def _write_submission_yaml(self, entry: ArchiveEntry): resource = Resource(entry.submission, '', '.yaml') file_path = self.file_name_formatter.format_path(resource, self.download_directory) + file_path.parent.mkdir(exist_ok=True, parents=True) with open(file_path, 'w') as file: logger.debug(f'Writing submission {entry.submission.id} to file in YAML format at {file_path}') yaml.dump(entry.compile(), file) diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index ca62766..644dd07 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # coding=utf-8 +import re from pathlib import Path import pytest @@ 
-163,3 +164,38 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = runner.invoke(cli, test_args) assert result.exit_code == 0 + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'Mindustry', '-L', 25], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], + ['--subreddit', 'Mindustry', '-L', 25, '--format', 'yaml'], + ['--subreddit', 'Mindustry', '-L', 25, '--sort', 'new'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day'], + ['--subreddit', 'Mindustry', '-L', 25, '--time', 'day', '--sort', 'new'], +)) +def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing submission .*? to file in .*? format', result.output) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.slow +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--subreddit', 'all', '-L', 100], + ['--subreddit', 'all', '-L', 100, '--sort', 'new'], +)) +def test_cli_archive_long(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing submission .*? to file in .*? format', result.output) From 447855cb749405d0fcff665d175abc7aff36c09c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:41:52 +1000 Subject: [PATCH 132/276] Fix config directory name --- bulkredditdownloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 1900ed3..c1c2b5b 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -49,7 +49,7 @@ class RedditTypes: class RedditDownloader: def __init__(self, args: Configuration): self.args = args - self.config_directories = appdirs.AppDirs('bulk_reddit_downloader', 'BDFR') + self.config_directories = appdirs.AppDirs('bulkredditdownloader', 'BDFR') self.run_time = datetime.now().isoformat() self._setup_internal_objects() From 9e546a67181d3964059fa7c46845c4b450b1c770 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:49:47 +1000 Subject: [PATCH 133/276] Add more to README --- README.md | 74 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9b3daac..68d92d7 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,15 @@ This is a tool to download data from Reddit. -## Usage +# Usage The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user. -Many websites and links are supported: +There are two modes to the BDFR: download, and archive. 
Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, videos, etc. linked. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
+
+Many websites and links are supported for the downloader:
 
  - Direct Links (links leading to a file)
  - Erome
  - Gfycat
  - Gif Delivery Network
  - Imgur
  - Reddit Galleries
  - Reddit Text Posts
  - Reddit Videos
  - Redgifs
  - Youtube
 
-## Options
+# Options
 
 The following options are common to both the `archive` and `download` commands of the BDFR. An invocation sketch follows the list.
 
 - `directory`
   - This is the directory to which the BDFR will download and place all files
 - `--authenticate`
   - This flag will make the BDFR attempt to use an authenticated Reddit session
   - See [Authentication](#authentication) for more details
 - `--config`
   - If the path to a configuration file is supplied with this option, the BDFR will use the specified config
-  - See [Configuration Files](#configuration-files) for more details
+  - See [Configuration Files](#configuration) for more details
 - `--saved`
   - This option will make the BDFR use the supplied user's saved posts list as a download source
   - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
 - `--search`
   - This will apply the specified search term to specific lists when scraping submissions
   - A search term can only be applied to subreddits and multireddits, supplied with the `-s` and `-m` flags respectively
 - `--submitted`
   - This will use a user's submissions as a source
   - A user must be specified with `--user`
 - `--upvoted`
   - This will use a user's upvoted posts as a source of posts to scrape
   - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
 - `-L, --limit`
   - This is the limit on the number of submissions retrieved
+  - Default is max possible
   - Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
   - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.**
 - `-S, --sort`
   - This is the sort type for each applicable submission source supplied to the BDFR
   - This option does not apply to upvoted or saved posts when scraping from these sources
   - The following options are available:
     - `controversial`
-    - `hot`
+    - `hot` (default)
     - `new`
     - `relevance` (only available when using `--search`)
     - `rising`
     - `top`
 - `-l, --link`
   - This is a direct link to a submission to download, either as a URL or an ID
   - Can be specified multiple times
 - `-m, --multireddit`
   - This is the name of a multireddit to add as a source
   - Can be specified multiple times
   - The specified multireddits must all belong to the user specified with the `--user` option
 - `-s, --subreddit`
   - This adds a subreddit as a source
   - Can be used multiple times
 - `-t, --time`
   - This is the time filter that will be applied to all applicable sources
   - This option does not apply to upvoted or saved posts when scraping from these sources
   - The following options are available:
-    - `all`
+    - `all` (default)
     - `hour`
     - `day`
     - `week`
     - `month`
     - `year`
 - `-u, --user`
   - This specifies the user to scrape in concert with other options
   - When using `--authenticate`, `--user me` can be used to refer to the authenticated user
 - `-v, --verbose`
   - Increases the verbosity of the program
   - Can be specified multiple times
 
-### Downloader Options
+# Downloader Options
 
 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 - `--no-dupes`
-  - This flag will not redownload files if they already exist somewhere in the root folder
+  - This flag will not redownload files if they already exist somewhere in the root folder tree
   - This is calculated by MD5 hash
 - `--search-existing`
   - This will make the BDFR compile the hashes for every file in `directory` and store them to remove duplicates if `--no-dupes` is also supplied
 - `--set-file-scheme`
   - Sets the scheme for files
+  - Default is `{REDDITOR}_{TITLE}_{POSTID}`
   - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
 - `--set-folder-scheme`
   - Sets the scheme for folders
+  - Default is `{SUBREDDIT}`
   - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
 - `--skip-domain`
   - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
   - Can be specified multiple times
 - `--skip`
   - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
   - Can be specified multiple times
 
-## Authentication
+# Archiver Options
+
+The following options are for the `archive` command specifically.
+
+- `-f, --format`
+  - This specifies the format of the data file saved to disk
+  - The following formats are available:
+    - `json` (default)
+    - `xml`
+    - `yaml`
+
+# Authentication
 
 The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
 
 To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on.
 
-## Changing Permissions
+# Changing Permissions
 
 Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc. are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it is allowed by the BDFR.
 
 The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need.
 
-For more details on the configuration file and the values therein, see[Configuration Files](#configuration-files).
+For more details on the configuration file and the values therein, see [Configuration Files](#configuration).
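To make the role of the API key, secret, and scopes more concrete, here is a minimal sketch of how a script-type praw `Reddit` instance is constructed from such credentials. The argument names follow praw's documented constructor; the credential values and the user agent string are placeholders, not anything shipped with the BDFR.

```python
# A hedged sketch only: constructing a read-only praw.Reddit instance from a
# client ID and secret like those the BDFR stores in its configuration file.
# The credential values and the user agent string are placeholders.
import praw

reddit = praw.Reddit(
    client_id='<client id from the config file>',
    client_secret='<client secret from the config file>',
    user_agent='<descriptive user agent>',
)

# Read-only access suffices for public sources; private resources such as
# saved or upvoted posts need the OAuth2 user flow described above instead.
for submission in reddit.subreddit('Mindustry').hot(limit=5):
    print(submission.id, submission.title)
```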
-## Folder and File Name Schemes
-
-## Configuration Files
+# Folder and File Name Schemes
+
+The naming and folder schemes for the BDFR are both completely customisable. A number of different fields can be given which will be replaced with properties from a submission when downloading it. The scheme format takes the form of `{KEY}`, where `KEY` is a string from the below list.
+
+ - `DATE`
+ - `FLAIR`
+ - `POSTID`
+ - `REDDITOR`
+ - `SUBREDDIT`
+ - `TITLE`
+ - `UPVOTES`
+
+Each of these can be enclosed in curly brackets, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as `download_{POSTID}` which will not change from submission to submission.
+
+At least one key * must * be included in the file scheme, otherwise an error will be thrown. The folder scheme, however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name.
+
+# Configuration
+
+The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bulkredditdownloader` or `C:\Documents and Settings\\Application Data\BDFR\bulkredditdownloader`. On Mac OSX, this will be `~/Library/Application Support/bulkredditdownloader`. Lastly, on a Linux system, this will be `~/.local/share/bulkredditdownloader`.
+
+The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report.
+
+# Configuration File
+
+The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied.
+
+ - `client_id`
+ - `client_secret`
+ - `scopes`
+
+All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default. 
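As a minimal sketch of what reading this file looks like (the Linux path and the section name below are assumptions; only the three key names above are confirmed), the standard library's `configparser`, which the downloader itself imports, can parse it directly:

```python
# A minimal sketch with assumptions flagged in comments: reads the three
# required keys using configparser, which the BDFR itself imports.
import configparser
from pathlib import Path

# Assumption: the Linux default location named above; adjust per OS.
config_path = Path('~/.local/share/bulkredditdownloader/config.cfg').expanduser()

parser = configparser.ConfigParser()
parser.read(config_path)

# Assumption: the section name; check the shipped default_config.cfg.
section = parser['DEFAULT']
client_id = section.get('client_id')
client_secret = section.get('client_secret')
scopes = section.get('scopes')
print(client_id, scopes)
```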
From 7c1b2da05c013efe375ca587a3b88e4353c42bca Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:51:22 +1000 Subject: [PATCH 134/276] Remove unused files --- Dockerfile | 27 --------------------------- _config.yml | 1 - docker-compose.yml | 17 ----------------- docs/INTERPRET_FROM_SOURCE.md | 35 ----------------------------------- 4 files changed, 80 deletions(-) delete mode 100644 Dockerfile delete mode 100644 _config.yml delete mode 100644 docker-compose.yml delete mode 100644 docs/INTERPRET_FROM_SOURCE.md diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index eb6a0a8..0000000 --- a/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM python:3.9 -LABEL Description="This image enables running Buld Downloader for Reddit with in a container environment" Version="0.0.1" - -ENV PYTHONUNBUFFERED 1 -ENV PYTHONDONTWRITEBYTECODE 1 - -EXPOSE 8080 -EXPOSE 7634 - -# Install dependencies -RUN apt-get update \ - && apt-get install -y build-essential \ - && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ - && rm -rf /var/lib/apt/lists/* - -# Python requirements -COPY requirements.txt /requirements.txt -RUN pip install --no-cache-dir -r /requirements.txt \ - && rm -rf /requirements.txt - -# Copy over project files -COPY . /bdfr -WORKDIR /bdfr - -# Useful so the image doubles as reference to the binary -ENTRYPOINT ["python", "script.py"] -CMD ["python", "script.py", "-d", "downloads"] diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c419263..0000000 --- a/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-cayman \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index da3afb4..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,17 +0,0 @@ -version: "3" - -services: - - bdfr: - build: - context: . - dockerfile: ./Dockerfile - image: bdfr - container_name: bdfr - ports: - - "8080:8080" - - "7634:7634" - volumes: - - .:/bdfr:z - container_name: bdfr_container - network_mode: bridge diff --git a/docs/INTERPRET_FROM_SOURCE.md b/docs/INTERPRET_FROM_SOURCE.md deleted file mode 100644 index f0a20e3..0000000 --- a/docs/INTERPRET_FROM_SOURCE.md +++ /dev/null @@ -1,35 +0,0 @@ -# Interpret from source code -## Requirements -### 🐍 Python 3 Interpreter -- Python 3 is required. See if it is already installed, [here](#finding-the-correct-keyword-for-python). -- If not, download the matching release for your platform [here](https://www.python.org/downloads/) and install it. If you are a *Windows* user, selecting **Add Python 3 to PATH** option when installing the software is **mandatory**. - -### 📃 Source Code -[Download the repository](https://github.com/aliparlakci/bulk-downloader-for-reddit/archive/master.zip) and extract the zip into a folder. - -## 💻 Using the command line -Open the [Command Promt](https://youtu.be/bgSSJQolR0E?t=18), [Powershell](https://youtu.be/bgSSJQolR0E?t=18) or [Terminal](https://youtu.be/Pz4yHAB3G8w?t=31) in the folder that contains the script.py file (click on the links to see how) - -### Finding the correct keyword for Python -Enter these lines to the terminal window until it prints out the a version starting with **`3.`**: - -- `python --version` -- `python3 --version` -- `py --version` -- `py -3 --version` - -Once it does, your keyword is without the `--version` part. 
- -## 📦 Installing dependencies -Enter the line below to terminal window when you are in the directory where script.py is, use your keyword instead of `python`: -```console -python -m pip install -r requirements.txt -``` - -## 🏃‍♂️ Running the code -Type below code into command line inside the program folder, use your keyword instead of `python`: -```console -python script.py -``` - -The program should guide you through. **However**, you can also use custom options. See [Options](../README.md#⚙-Options) \ No newline at end of file From 61ba21639d9dc5cefcedb6e975d0699d805c7a0a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 11:52:40 +1000 Subject: [PATCH 135/276] Move docs to doc directory --- ARCHITECTURE.md => docs/ARCHITECTURE.md | 0 docs/CONTRIBUTING.md | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename ARCHITECTURE.md => docs/ARCHITECTURE.md (100%) create mode 100644 docs/CONTRIBUTING.md diff --git a/ARCHITECTURE.md b/docs/ARCHITECTURE.md similarity index 100% rename from ARCHITECTURE.md rename to docs/ARCHITECTURE.md diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 From 06261cc5cd6f34936b8ee2daa89fc759b29a9c31 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 14 Mar 2021 12:03:43 +1000 Subject: [PATCH 136/276] Add some more integration tests --- bulkredditdownloader/downloader.py | 14 +++++---- .../tests/test_integration.py | 31 +++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index c1c2b5b..d4b47f2 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -137,8 +137,9 @@ class RedditDownloader: else: logger.error(f'Could not find config file at {self.args.config}, attempting to find elsewhere') possible_paths = [Path('./config.cfg'), - Path(self.config_directory, 'config.cfg'), Path('./default_config.cfg'), + Path(self.config_directory, 'config.cfg'), + Path(self.config_directory, 'default_config.cfg'), ] self.config_location = None for path in possible_paths: @@ -243,17 +244,17 @@ class RedditDownloader: if any([self.args.submitted, self.args.upvoted, self.args.saved]): if self.args.user: if not self._check_user_existence(self.args.user): - raise errors.RedditUserError(f'User {self.args.user} does not exist') + logger.error(f'User {self.args.user} does not exist') + return [] generators = [] sort_function = self._determine_sort_function() if self.args.submitted: logger.debug(f'Retrieving submitted posts of user {self.args.user}') generators.append( sort_function( - self.reddit_instance.redditor(self.args.user).submissions, - limit=self.args.limit)) + self.reddit_instance.redditor(self.args.user).submissions, limit=self.args.limit)) if not self.authenticated and any((self.args.upvoted, self.args.saved)): - raise errors.RedditAuthenticationError('Accessing user lists requires authentication') + logger.error('Accessing user lists requires authentication') else: if self.args.upvoted: logger.debug(f'Retrieving upvoted posts of user {self.args.user}') @@ -263,7 +264,8 @@ class RedditDownloader: generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) return generators else: - raise errors.BulkDownloaderException('A user must be supplied to download user data') + logger.error('A user must be supplied to download user data') + return [] else: return [] diff --git a/bulkredditdownloader/tests/test_integration.py 
b/bulkredditdownloader/tests/test_integration.py
index 644dd07..d025828 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -199,3 +199,34 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path):
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
     assert re.search(r'Writing submission .*? to file in .*? format', result.output)
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.slow
+@pytest.mark.skipif(not Path('test_config.cfg').exists(), reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['--user', 'sdclhgsolgjeroij', '--submitted', '-L', 10],
+    ['--user', 'me', '--upvoted', '-L', 10],
+    ['--user', 'sdclhgsolgjeroij', '--upvoted', '-L', 10],
+))
+def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.slow
+@pytest.mark.skipif(not Path('test_config.cfg').exists(), reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['--time', 'random'],
+    ['--sort', 'random'],
+))
+def test_cli_download_hard_fail(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code != 0

From e042d985b5475b41229f72f70cead507e5cf1328 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 14 Mar 2021 20:38:03 +1000
Subject: [PATCH 137/276] Load default config from package directory

---
 .../default_config.cfg                         | 0
 bulkredditdownloader/downloader.py             | 2 ++
 bulkredditdownloader/tests/test_integration.py | 7 +++++++
 3 files changed, 9 insertions(+)
 rename default_config.cfg => bulkredditdownloader/default_config.cfg (100%)

diff --git a/default_config.cfg b/bulkredditdownloader/default_config.cfg
similarity index 100%
rename from default_config.cfg
rename to bulkredditdownloader/default_config.cfg
diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index d4b47f2..3cf1f78 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -3,6 +3,7 @@
 
 import configparser
 import hashlib
+import importlib.resources
 import logging
 import os
 import re
@@ -140,6 +141,7 @@ class RedditDownloader:
                 Path('./default_config.cfg'),
                 Path(self.config_directory, 'config.cfg'),
                 Path(self.config_directory, 'default_config.cfg'),
+                list(importlib.resources.path('bulkredditdownloader', 'default_config.cfg').gen)[0],
             ]
             self.config_location = None
             for path in possible_paths:
diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py
index d025828..dab80c8 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -230,3 +230,10 @@ def test_cli_download_hard_fail(test_args: list[str], tmp_path: Path):
     test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
     result = runner.invoke(cli, test_args)
     assert result.exit_code != 0
+
+
+def test_cli_download_use_default_config(tmp_path: Path):
+    runner = CliRunner()
+    test_args = ['download', '-vv', str(tmp_path)]
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
From 
4cd25bcad9c8941e2423a49722cad5fa3c842b3f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Mar 2021 10:44:29 +1000 Subject: [PATCH 138/276] Add basic setup files --- setup.cfg | 7 +++++++ setup.py | 52 ++++------------------------------------------------ 2 files changed, 11 insertions(+), 48 deletions(-) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..eb3ac3d --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[metadata] +name = Bulk Downloader for Reddit +author = Ali Parlakci +author-email = parlakciali@gmail.com + +[files] +packages = bulkredditdownloader diff --git a/setup.py b/setup.py index a8c413f..b571f29 100644 --- a/setup.py +++ b/setup.py @@ -1,50 +1,6 @@ -#!C:\Users\Ali\AppData\Local\Programs\Python\Python36\python.exe +#!/usr/bin/env python3 +# encoding=utf-8 -# python setup.py build -import sys +from setuptools import setup -from cx_Freeze import Executable, setup - -from bulkredditdownloader.__main__ import __version__ - -options = { - "build_exe": { - "packages": [ - "idna", "praw", "requests", "multiprocessing" - ] - } -} - -if sys.platform == "win32": - executables = [Executable( - "script.py", - targetName="bulk-downloader-for-reddit.exe", - shortcutName="Bulk Downloader for Reddit", - shortcutDir="DesktopFolder" - )] - -elif sys.platform == "linux": - executables = [Executable( - "script.py", - targetName="bulk-downloader-for-reddit", - shortcutName="Bulk Downloader for Reddit", - shortcutDir="DesktopFolder" - )] - -setup( - name="Bulk Downloader for Reddit", - version=__version__, - description="Bulk Downloader for Reddit", - author="Ali Parlakci", - author_email="parlakciali@gmail.com", - url="https://github.com/aliparlakci/bulk-downloader-for-reddit", - classifiers=( - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)" - "Natural Language :: English", - "Environment :: Console", - "Operating System :: OS Independent", - ), - executables=executables, - options=options -) +setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bulkredditdownloader/default_config.cfg'])]) From 28b7deb6d3c22a80c2f7d21bb695e69973b6ceea Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Mar 2021 12:37:37 +1000 Subject: [PATCH 139/276] Rename function --- bulkredditdownloader/__main__.py | 6 +++--- bulkredditdownloader/tests/test_downloader.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index ce60885..6d24303 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -54,7 +54,7 @@ def cli(): def cli_download(context: click.Context, **_): config = Configuration() config.process_click_arguments(context) - _setup_logging(config.verbose) + setup_logging(config.verbose) reddit_downloader = RedditDownloader(config) reddit_downloader.download() logger.info('Program complete') @@ -67,13 +67,13 @@ def cli_download(context: click.Context, **_): def cli_archive(context: click.Context, **_): config = Configuration() config.process_click_arguments(context) - _setup_logging(config.verbose) + setup_logging(config.verbose) reddit_archiver = Archiver(config) reddit_archiver.download() logger.info('Program complete') -def _setup_logging(verbosity: int): +def setup_logging(verbosity: int): logger.setLevel(1) stream = logging.StreamHandler(sys.stdout) formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') diff --git 
a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 4721ede..aae921e 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -11,7 +11,7 @@ import praw import praw.models import pytest -from bulkredditdownloader.__main__ import _setup_logging +from bulkredditdownloader.__main__ import setup_logging from bulkredditdownloader.configuration import Configuration from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.downloader import RedditDownloader, RedditTypes @@ -333,7 +333,7 @@ def test_download_submission_file_exists( reddit_instance: praw.Reddit, tmp_path: Path, capsys: pytest.CaptureFixture): - _setup_logging(3) + setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True downloader_mock.args.set_folder_scheme = '' @@ -356,7 +356,7 @@ def test_download_submission_hash_exists( reddit_instance: praw.Reddit, tmp_path: Path, capsys: pytest.CaptureFixture): - _setup_logging(3) + setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True downloader_mock.args.set_folder_scheme = '' From 6aab009204993df351bf57d120dc2bbc53468f9e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Mar 2021 13:22:41 +1000 Subject: [PATCH 140/276] Update RedditDownloader tests --- bulkredditdownloader/downloader.py | 17 +++-- bulkredditdownloader/tests/test_downloader.py | 66 +++++-------------- 2 files changed, 30 insertions(+), 53 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 3cf1f78..67c481a 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -167,16 +167,24 @@ class RedditDownloader: pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)(?:/)?$') match = re.match(pattern, subreddit) if not match: - raise errors.RedditAuthenticationError('') + raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}') return match.group(1) + @staticmethod + def _split_args_input(subreddit_entries: list[str]) -> set[str]: + all_subreddits = [] + split_pattern = re.compile(r'[,;]\s?') + for entry in subreddit_entries: + results = re.split(split_pattern, entry) + all_subreddits.extend([RedditDownloader._sanitise_subreddit_name(name) for name in results]) + return set(all_subreddits) + def _get_subreddits(self) -> list[praw.models.ListingGenerator]: if self.args.subreddit: out = [] sort_function = self._determine_sort_function() - for reddit in self.args.subreddit: + for reddit in self._split_args_input(self.args.subreddit): try: - reddit = self._sanitise_subreddit_name(reddit) reddit = self.reddit_instance.subreddit(reddit) if self.args.search: out.append( @@ -228,9 +236,8 @@ class RedditDownloader: if self.args.multireddit: out = [] sort_function = self._determine_sort_function() - for multi in self.args.multireddit: + for multi in self._split_args_input(self.args.multireddit): try: - multi = self._sanitise_subreddit_name(multi) multi = self.reddit_instance.multireddit(self.args.user, multi) if not multi.subreddits: raise errors.BulkDownloaderException diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index aae921e..d4a2e03 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -30,6 +30,8 @@ 
def args() -> Configuration: def downloader_mock(args: argparse.Namespace): mock_downloader = MagicMock() mock_downloader.args = args + mock_downloader._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name + mock_downloader._split_args_input = RedditDownloader._split_args_input return mock_downloader @@ -146,6 +148,7 @@ def test_get_submissions_from_link( @pytest.mark.reddit @pytest.mark.parametrize(('test_subreddits', 'limit'), ( (('Futurology',), 10), + (('Futurology', 'Mindustry, Python'), 10), (('Futurology',), 20), (('Futurology', 'Python'), 10), (('Futurology',), 100), @@ -157,13 +160,14 @@ def test_get_subreddit_normal( downloader_mock: MagicMock, reddit_instance: praw.Reddit): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.args.limit = limit downloader_mock.args.subreddit = test_subreddits downloader_mock.reddit_instance = reddit_instance downloader_mock.sort_filter = RedditTypes.SortType.HOT results = RedditDownloader._get_subreddits(downloader_mock) - results = assert_all_results_are_submissions((limit * len(test_subreddits)) if limit else None, results) + test_subreddits = downloader_mock._split_args_input(test_subreddits) + results = assert_all_results_are_submissions( + (limit * len(test_subreddits)) if limit else None, results) assert all([res.subreddit.display_name in test_subreddits for res in results]) @@ -181,7 +185,6 @@ def test_get_subreddit_search( downloader_mock: MagicMock, reddit_instance: praw.Reddit): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.args.limit = limit downloader_mock.args.search = search_term downloader_mock.args.subreddit = test_subreddits @@ -207,7 +210,6 @@ def test_get_multireddits_public( reddit_instance: praw.Reddit, downloader_mock: MagicMock): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.limit = limit downloader_mock.args.multireddit = test_multireddits @@ -237,30 +239,6 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic assert all([res.author.name == test_user for res in results]) -@pytest.mark.online -@pytest.mark.reddit -def test_get_user_no_user(downloader_mock: MagicMock): - downloader_mock.args.upvoted = True - with pytest.raises(BulkDownloaderException): - RedditDownloader._get_user_data(downloader_mock) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize('test_user', ( - 'rockcanopicjartheme', - 'exceptionalcatfishracecarbatter', -)) -def test_get_user_nonexistent_user(test_user: str, downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.reddit_instance = reddit_instance - downloader_mock.args.user = test_user - downloader_mock.args.upvoted = True - downloader_mock._check_user_existence.return_value = RedditDownloader._check_user_existence( - downloader_mock, test_user) - with pytest.raises(RedditUserError): - RedditDownloader._get_user_data(downloader_mock) - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated @@ -276,16 +254,6 @@ def test_get_user_upvoted(downloader_mock: MagicMock, authenticated_reddit_insta assert_all_results_are_submissions(10, results) 
-@pytest.mark.online -@pytest.mark.reddit -def test_get_user_upvoted_unauthenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.args.user = 'random' - downloader_mock.args.upvoted = True - downloader_mock.authenticated = False - with pytest.raises(RedditAuthenticationError): - RedditDownloader._get_user_data(downloader_mock) - - @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated @@ -301,16 +269,6 @@ def test_get_user_saved(downloader_mock: MagicMock, authenticated_reddit_instanc assert_all_results_are_submissions(10, results) -@pytest.mark.online -@pytest.mark.reddit -def test_get_user_saved_unauthenticated(downloader_mock: MagicMock, reddit_instance: praw.Reddit): - downloader_mock.args.user = 'random' - downloader_mock.args.saved = True - downloader_mock.authenticated = False - with pytest.raises(RedditAuthenticationError): - RedditDownloader._get_user_data(downloader_mock) - - @pytest.mark.online @pytest.mark.reddit def test_download_submission(downloader_mock: MagicMock, reddit_instance: praw.Reddit, tmp_path: Path): @@ -392,3 +350,15 @@ def test_search_existing_files(): results = RedditDownloader.scan_existing_files(Path('.')) assert all([isinstance(result, str) for result in results]) assert len(results) >= 40 + + +@pytest.mark.parametrize(('test_subreddit_entries', 'expected'), ( + (['test1', 'test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1,test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1, test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1; test2', 'test3'], {'test1', 'test2', 'test3'}), + (['test1, test2', 'test1,test2,test3', 'test4'], {'test1', 'test2', 'test3', 'test4'}) +)) +def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]): + results = RedditDownloader._split_args_input(test_subreddit_entries) + assert results == expected From 3a093d08443df2a1cdeabc6a2b37ade524ce82c4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 15 Mar 2021 14:00:21 +1000 Subject: [PATCH 141/276] Add shortcut in download for certain errors --- bulkredditdownloader/downloader.py | 7 ++++++- bulkredditdownloader/resource.py | 3 +++ bulkredditdownloader/tests/test_integration.py | 4 +++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 67c481a..de1e921 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -335,7 +335,12 @@ class RedditDownloader: if destination.exists(): logger.warning(f'File already exists: {destination}') else: - res.download() + try: + res.download() + except errors.BulkDownloaderException: + logger.error( + f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}') + return if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') else: diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index a93cc0c..af39554 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -32,6 +32,9 @@ class Resource: response = requests.get(url) if response.status_code == 200: return response.content + elif response.status_code in (301, 401, 403, 404): + logger.error(f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') + return None else: raise requests.exceptions.ConnectionError except requests.exceptions.ConnectionError: diff --git 
a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py
index dab80c8..600e983 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -22,6 +22,8 @@ from bulkredditdownloader.__main__ import cli
     ['-s', 'r/TrollXChromosomes/', '-L', 1],
     ['-s', 'TrollXChromosomes/', '-L', 1],
     ['-s', 'trollxchromosomes', '-L', 1],
+    ['-s', 'trollxchromosomes,mindustry,python', '-L', 1],
+    ['-s', 'trollxchromosomes, mindustry, python', '-L', 1],
     ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day'],
     ['-s', 'trollxchromosomes', '-L', 1, '--sort', 'new'],
     ['-s', 'trollxchromosomes', '-L', 1, '--time', 'day', '--sort', 'new'],
@@ -46,13 +48,13 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path):
     ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'],
     ['-l', 'm3hxzd'],  # Really long title used to overflow filename limit
     ['-l', 'm3kua3'],  # Has a deleted user
+    ['-l', 'm5bqkf'],  # Resource leading to a 404
 ))
 def test_cli_download_links(test_args: list[str], tmp_path: Path):
     runner = CliRunner()
     test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
-    assert len(list(tmp_path.iterdir())) == 1

From 0d72bf6431d6962adf514fb41f358380a21d4b11 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Tue, 16 Mar 2021 10:01:51 +1000
Subject: [PATCH 142/276] Update documentation

---
 README.md               | 32 ++++++++++------
 docs/ARCHITECTURE.md    | 51 +++++++++++++++++----------
 docs/CODE_OF_CONDUCT.md | 76 +++++++++++++++++++++++++++++++++++++++++
 docs/CONTRIBUTING.md    | 29 ++++++++++++++++
 4 files changed, 157 insertions(+), 31 deletions(-)
 create mode 100644 docs/CODE_OF_CONDUCT.md

diff --git a/README.md b/README.md
index 68d92d7..8f694e7 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Bulk Downloader for Reddit
 
-This is a tool to download data from Reddit.
+This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface.
 
-# Usage
+## Usage
 
 The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
 
@@ -10,7 +10,7 @@ There are two modes to the BDFR: download, and archive. Each one has a command t
 
 Many websites and links are supported for the downloader:
 
-  - Direct Links(links leading to a file)
+  - Direct Links (links leading to a file)
   - Erome
   - Gfycat
   - Gif Delivery Network
@@ -21,7 +21,7 @@ Many websites and links are supported for the downloader:
   - Redgifs
   - Youtube
 
-# Options
+### Options
 
 The following options are common between both the `archive` and `download` commands of the BDFR.
 
@@ -48,7 +48,7 @@ The following options are common between both the `archive` and `download` comma
 - `-L, --limit`
   - This is the limit on the number of submissions retrieved
   - Default is max possible
-  - Note that this limit applies to ** each source individually ** e.g.
 if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
+  - Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
   - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.**
 - `-S, --sort`
   - This is the sort type for each applicable submission source supplied to the BDFR
@@ -87,7 +87,7 @@ The following options are common between both the `archive` and `download` comma
   - Increases the verbosity of the program
   - Can be specified multiple times
 
-# Downloader Options
+#### Downloader Options
 
 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 
@@ -111,7 +111,7 @@ The following options apply only to the `download` command. This command downloa
   - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
   - Can be specified multiple times
 
-# Archiver Options
+#### Archiver Options
 
 The following options are for the `archive` command specifically.
 
@@ -122,13 +122,13 @@ The following options are for the `archive` command specifically.
     - `xml`
     - `yaml`
 
-# Authentication
+## Authentication
 
 The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
 
 To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on.
 
-# Changing Permissions
+## Changing Permissions
 
 Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc. are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it is allowed by the BDFR.
 
@@ -136,7 +136,7 @@ The configuration file for the BDFR contains the API secret and key, as well as 
 
 For more details on the configuration file and the values therein, see [Configuration Files](#configuration).
 
-# Folder and File Name Schemes
+## Folder and File Name Schemes
 
@@ -150,15 +150,15 @@ The naming and folder schemes for the BDFR are both completely customisable. 
A n
 Each of these can be enclosed in curly brackets, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as `download_{POSTID}` which will not change from submission to submission.
 
-At least one key * must * be included in the file scheme, otherwise an error will be thrown. The folder scheme however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name.
+At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme, however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name.
 
-# Configuration
+## Configuration
 
 The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bulkredditdownloader` or `C:\Documents and Settings\\Application Data\BDFR\bulkredditdownloader`. On Mac OSX, this will be `~/Library/Application Support/bulkredditdownloader`. Lastly, on a Linux system, this will be `~/.local/share/bulkredditdownloader`.
 
 The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report.
 
-# Configuration File
+### Configuration File
 
 The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied.
 
@@ -167,3 +167,9 @@ The `config.cfg` is the file that supplies the BDFR with the configuration to us
  - `scopes`
 
 All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
+
+## Contributing
+
+If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.
+
+When reporting any issues or interacting with the developers, please follow the [Code of Conduct](docs/CODE_OF_CONDUCT.md).
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 8f1bb5e..9634cad 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1,22 +1,37 @@
 # Architecture
 
- 1. Arguments are passed to an instance of RedditDownloader
- 2. Internal objects are created
-
-   - Formatter created
-   - Filter created
-   - Configuration loaded
-   - Reddit instance created
-
- 3. Reddit lists scraped
+When the project was rewritten for v2, the goal was to make the codebase easily extensible and much easier to read and modify. To that end, this document provides a step-by-step look through the process that the BDFR goes through, so that any prospective developers can more easily grasp the way the code works.
+
+## The Download Process
+
+The BDFR is organised around a central object, the RedditDownloader class. The Archiver object extends and inherits from this class.
+
+ 1. 
The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
-To actually download, the following happens:
+
+ 2. The RedditDownloader scrapes raw submissions from Reddit via several methods relating to different sources. A source is defined as a single stream of submissions from a subreddit, multireddit, or user list.
+
+ 3. These raw submissions are passed to the DownloadFactory class to select the specialised downloader class to use. Each of these is for a specific website or link type, with some catch-all classes like Direct.
+
+ 4. The BaseDownloader child, spawned by DownloadFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
+
+ 5. This is returned to the RedditDownloader in the form of a Resource object. This holds the URL and some other information for the final resource.
+
+ 6. The Resource is passed through the DownloadFilter instantiated in step 1.
 
- 1. RedditDownloader uses DownloadFactory to find the right module for a submission
- 2. Downloader instance created
- 3. Downloader returns a list of Resource objects (lists may have one objects)
- 4. RedditDownloader checks if it already exists
- 5. RedditDownloader checks against the DownloadFilter created earlier
- 6. RedditDownloader creates a formatted file path base on the Resource with FileNameFormatter
- 7. Resource content is written to disk
- \ No newline at end of file
+ 7. The destination file name for the Resource is calculated. If it already exists, then the Resource will be discarded.
+
+ 8. Here the actual data is downloaded to the Resource and a hash is calculated, which is used to find duplicates.
+
+ 9. Only then is the Resource written to the disk.
+
+This is the step-by-step process that the BDFR goes through to download a Reddit post.
+
+## Adding another Supported Site
+
+This is one of the easiest changes to make to the code. First, any new class must inherit from the BaseDownloader class, which provides an abstract parent to implement. However, take note of the other classes as well. Many downloaders can inherit from one another instead of just the BaseDownloader. For example, the VReddit class, used for downloading video from Reddit, inherits almost all of its code from the YouTube class. **Minimise code duplication wherever possible**.
+
+Once the downloader class has been written **and tests added** for it as well, then the regex string for the site's URLs can be added to the DownloadFactory. Then additional tests must be added for the DownloadFactory to ensure that the appropriate classes are called when the right URLs are passed to the factory.
+
+## Adding Other Features
+
+ 
diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..26edfa9
--- /dev/null
+++ b/docs/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+education, socio-economic status, nationality, personal appearance, race,
+religion, or sexual identity and orientation.
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team via Discord. All complaints will +be reviewed and investigated and will result in a response that is deemed +necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an +incident. Further details of specific enforcement policies may be posted +separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + + diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index e69de29..dbbe8e2 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# Contributing + +When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised. + +If you have a question, **please don't open an issue on GitHub**. There is a subreddit specifically for the BDFR where questions can be asked. If you believe that something is a bug, or that a feature should be added, then by all means open an issue. + +All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). 
It's not that hard to stay respectful.
+
+## Pull Requests
+
+Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) below before actually writing any code.
+
+Once you have done both of these, the below list shows the path that should be followed when writing a PR.
+
+ 1. If an issue does not already exist, open one that will relate to the PR.
+ 2. Ensure that any changes fit into the architecture specified above.
+ 3. Ensure that you have written tests that cover the new code.
+ 4. Ensure that no existing tests fail, unless there is a good reason for them to do so. If there is, note why in the PR.
+ 5. If needed, update any documentation with changes.
+ 6. Open a pull request that references the relevant issue.
+ 7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
+
+Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
+
+## Style Guide
+
+The BDFR must conform to the PEP8 standard wherever there is Python code, with one exception. Line lengths may extend to 120 characters, but all other PEP8 standards must be followed.
+
+It's easy to format your code without any manual work via a variety of tools. Autopep8 is a good one, and can be used with `autopep8 --max-line-length 120` which will format the code according to the style in use with the BDFR.

From 99412156438ba29786e94d21d16cef7f91dc243d Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 17 Mar 2021 15:49:07 +1000
Subject: [PATCH 143/276] Fix index being added to single resources

---
 bulkredditdownloader/file_name_formatter.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index 1950306..f36284e 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -68,9 +68,12 @@ class FileNameFormatter:
     def format_resource_paths(self, resources: list[Resource],
                               destination_directory: Path) -> list[tuple[Path, Resource]]:
         out = []
-        for i, res in enumerate(resources, start=1):
-            logger.log(9, f'Formatting filename with index {i}')
-            out.append((self.format_path(res, destination_directory, i), res))
+        if len(resources) == 1:
+            out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
+        else:
+            for i, res in enumerate(resources, start=1):
+                logger.log(9, f'Formatting filename with index {i}')
+                out.append((self.format_path(res, destination_directory, i), res))
         return out
 
 @ staticmethod

From 8eddaeaff46c6ed31521fdd126a5693e48dc170f Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 17 Mar 2021 15:51:42 +1000
Subject: [PATCH 144/276] Change logging message

---
 bulkredditdownloader/downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index de1e921..e810086 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -351,7 +351,7 @@ class RedditDownloader:
                     logger.debug(f'Written file to {destination}')
                 self.master_hash_list.append(res.hash.hexdigest())
                 logger.debug(f'Hash added to master list: {res.hash.hexdigest()}')
-        logger.info(f'Downloaded submission {submission.name}')
+        
logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') @staticmethod def scan_existing_files(directory: Path) -> list[str]: From 29441e72446debebc69a351a7fe269cefebb335c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 16:10:36 +1000 Subject: [PATCH 145/276] Change Erome tests to valid links --- .../tests/downloaders/test_erome.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/downloaders/test_erome.py index a0cba97..6c8d5e0 100644 --- a/bulkredditdownloader/tests/downloaders/test_erome.py +++ b/bulkredditdownloader/tests/downloaders/test_erome.py @@ -1,18 +1,19 @@ #!/usr/bin/env python3 # coding=utf-8 -from unittest.mock import Mock +from unittest.mock import MagicMock import pytest -from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( - ('https://www.erome.com/a/hzLCb2c5', - ('https://s2.erome.com/353/hzLCb2c5/8FNh4qa8.jpg', 'https://s2.erome.com/353/hzLCb2c5/8FNh4qa8_480p.mp4') + ('https://www.erome.com/a/vqtPuLXh', ( + 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99.jpg', + 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', + ) ), ('https://www.erome.com/a/ORhX0FZz', ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9.jpg', @@ -39,12 +40,10 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_number_of_resources', 'expected_hashes'), ( - ('https://www.erome.com/a/hzLCb2c5', 2, - ('1b4b1703f81f2ad6a622f7319a4651c2', 'f24388a0f3443c1a27594e4af41c3e83') - ), + ('https://www.erome.com/a/vqtPuLXh', 2, ('5da2a8d60d87bed279431fdec8e7d72f', '243d17b52a728911b022829badbc524e')), )) def test_download_resource(test_url: str, expected_number_of_resources: int, expected_hashes: tuple[str]): - mock_submission = Mock + mock_submission = MagicMock() mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() From bc7ccc0964ac9cac770d7f787e08f070f2037534 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 16:23:00 +1000 Subject: [PATCH 146/276] Refactor Erome downloader --- .../site_downloaders/erome.py | 70 ++++++------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 762e8f9..8dab973 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -2,16 +2,15 @@ import logging import re -import urllib.error -import urllib.request -from html.parser import HTMLParser from typing import Optional +import bs4 +import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.exceptions import NotADownloadableLinkError from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -22,63 +21,36 @@ class Erome(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - try: - images = set(self._get_links(self.post.url)) - except urllib.error.HTTPError: - 
raise NotADownloadableLinkError("Not a downloadable link") + images = self._get_links(self.post.url) + if not images: + raise NotADownloadableLinkError('Erome parser could not find any links') if len(images) == 1: - image = images.pop() - if not re.match(r'https?://.*', image): - image = "https://" + image + image = self._validate_url(image) return [Resource(self.post, image)] else: out = [] for i, image in enumerate(images): - if not re.match(r'https?://.*', image): - image = "https://" + image + image = self._validate_url(image) out.append(Resource(self.post, image)) return out @staticmethod - def _get_links(url: str) -> list[str]: - content = [] - line_number = None + def _validate_url(image): + if not re.match(r'https?://.*', image): + image = "https://" + image + return image - # TODO: move to bs4 and requests - class EromeParser(HTMLParser): - tag = None + @staticmethod + def _get_links(url: str) -> set[str]: + page = requests.get(url) + soup = bs4.BeautifulSoup(page.text) + front_images = soup.find_all('img', attrs={'class': 'img-front'}) + out = [im.get('src') for im in front_images] - def handle_starttag(self, tag, attrs): - self.tag = {tag: {attr[0]: attr[1] for attr in attrs}} + videos = soup.find_all('source') + out.extend([vid.get('src') for vid in videos]) - page_source = (urllib.request.urlopen(url).read().decode().split('\n')) - - """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS""" - for i in range(len(page_source)): - obj = EromeParser() - obj.feed(page_source[i]) - tag = obj.tag - - if tag is not None: - if "div" in tag: - if "id" in tag["div"]: - if tag["div"]["id"] == "album": - line_number = i - break - - for line in page_source[line_number:]: - obj = EromeParser() - obj.feed(line) - tag = obj.tag - if tag is not None: - if "img" in tag: - if "class" in tag["img"]: - if tag["img"]["class"] == "img-front": - content.append(tag["img"]["src"]) - elif "source" in tag: - content.append(tag["source"]["src"]) - - return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")] + return set(out) From 3e9a846e2e5ecc36e049e32c57a8032ffd9d1282 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 16:27:26 +1000 Subject: [PATCH 147/276] Refactor Reddit Gallery tests --- .../tests/downloaders/test_gallery.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 2b29a4b..222bb52 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -1,22 +1,22 @@ #!/usr/bin/env python3 # coding=utf-8 -import praw.models +import praw import pytest from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gallery import Gallery -@pytest.fixture() -def reddit_submission(reddit_instance) -> praw.models.Submission: - return reddit_instance.submission(id='ljyy27') - - @pytest.mark.online @pytest.mark.reddit -def test_gallery(reddit_submission: praw.models.Submission): - gallery = Gallery(reddit_submission) +@pytest.mark.parametrize(('test_submission_id', 'expected_len'), ( + ('ljyy27', 4), + ('m6lvrh', 4), +)) +def test_gallery(test_submission_id: str, expected_len: int, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + gallery = Gallery(test_submission) results = gallery.find_resources() - assert len(results) == 4 + assert len(results) == 
expected_len assert all([isinstance(result, Resource) for result in results]) From ed26907e0d75c4cfe2a6d8c932f8391d6627e3f8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 16:58:29 +1000 Subject: [PATCH 148/276] Refactor Gallery downloader --- .../site_downloaders/gallery.py | 59 ++++++------------- .../tests/downloaders/test_gallery.py | 2 +- 2 files changed, 18 insertions(+), 43 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 449d853..2ba4af2 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 -import json import logging +import re from typing import Optional +import bs4 import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound +from bulkredditdownloader.exceptions import ResourceNotFound from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) @@ -18,51 +19,25 @@ logger = logging.getLogger(__name__) class Gallery(BaseDownloader): def __init__(self, post: Submission): super().__init__(post) - link = self.post.url - self.raw_data = self._get_data(link) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - images = {} - count = 0 - for model in self.raw_data['posts']['models']: - try: - for item in self.raw_data['posts']['models'][model]['media']['gallery']['items']: - try: - images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts'] - ['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']} - count += 1 - except KeyError: - continue - except KeyError: - continue - - return self._download_album(images) + image_urls = self._get_links(self.post.url) + if not image_urls: + raise ResourceNotFound('No images found in Reddit gallery') + return [Resource(self.post, url) for url in image_urls] @staticmethod - def _get_data(link: str) -> dict: - headers = { + def _get_links(url: str) -> list[str]: + page = requests.get(url, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", } - res = requests.get(link, headers=headers) - if res.status_code != 200: - raise ResourceNotFound(f"Server responded with {res.status_code} to {link}") - page_source = res.text + ) + soup = bs4.BeautifulSoup(page.text) - starting_string = "_r = {" - ending_string = "" - - starting_string_lenght = len(starting_string) - try: - start_index = page_source.index(starting_string) + starting_string_lenght - end_index = page_source.index(ending_string, start_index) - except ValueError: - raise NotADownloadableLinkError(f"Could not read the page source on {link}") - - data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1]) - return data - - def _download_album(self, images: dict): - out = [Resource(self.post, images[image_key]['url']) for image_key in images.keys()] - return out + links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) + links = 
[link.get('href') for link in links] + pattern = re.compile(r'(.*?)\?.*$') + links = [re.search(pattern, link).group(1) for link in links] + return links diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 222bb52..9694964 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -11,8 +11,8 @@ from bulkredditdownloader.site_downloaders.gallery import Gallery @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected_len'), ( - ('ljyy27', 4), ('m6lvrh', 4), + ('ljyy27', 4), )) def test_gallery(test_submission_id: str, expected_len: int, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From 423155a846923c377ff7eac192a8e4358fb6d0f2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 20:42:35 +1000 Subject: [PATCH 149/276] Add some more Gallery tests --- .../tests/downloaders/test_gallery.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 9694964..49e3e83 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -8,6 +8,26 @@ from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gallery import Gallery +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('https://www.reddit.com/gallery/m6lvrh', { + 'https://preview.redd.it/18nzv9ch0hn61.jpg', + 'https://preview.redd.it/jqkizcch0hn61.jpg', + 'https://preview.redd.it/k0fnqzbh0hn61.jpg', + 'https://preview.redd.it/m3gamzbh0hn61.jpg' + }), + ('https://www.reddit.com/gallery/ljyy27', { + 'https://preview.redd.it/04vxj25uqih61.png', + 'https://preview.redd.it/0fnx83kpqih61.png', + 'https://preview.redd.it/7zkmr1wqqih61.png', + 'https://preview.redd.it/u37k5gxrqih61.png' + }), +)) +def test_gallery_get_links(test_url: str, expected: set[str]): + results = Gallery._get_links(test_url) + assert set(results) == expected + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected_len'), ( From a54a1c6dfa53fd6eaf568d0a444e335a39ab0e33 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 21:02:25 +1000 Subject: [PATCH 150/276] Update GifDeliveryNetwork tests --- .../downloaders/test_gif_delivery_network.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py b/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py index e790fa7..e4bec87 100644 --- a/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py +++ b/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py @@ -11,10 +11,12 @@ from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDelive @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( - ('https://www.gifdeliverynetwork.com/handyunsightlydesertpupfish', - 'https://thumbs2.redgifs.com/HandyUnsightlyDesertpupfish.mp4'), - ('https://www.gifdeliverynetwork.com/lamelikelyhamadryad', - 'https://thumbs2.redgifs.com/LameLikelyHamadryad.mp4') + ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', + 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), + 
('https://www.gifdeliverynetwork.com/maturenexthippopotamus', + 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), + ('https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4', + 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), )) def test_get_link(test_url: str, expected: str): result = GifDeliveryNetwork._get_link(test_url) @@ -23,11 +25,11 @@ def test_get_link(test_url: str, expected: str): @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.gifdeliverynetwork.com/handyunsightlydesertpupfish', 'd941460dcf4e0d09dd33abaa32e2d270'), - ('https://www.gifdeliverynetwork.com/lamelikelyhamadryad', '4806fe15f4991bb73581338793488daf'), + ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', '9bec0a9e4163a43781368ed5d70471df'), + ('https://www.gifdeliverynetwork.com/regalshoddyhorsechestnutleafminer', '8afb4e2c090a87140230f2352bf8beba'), )) def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock + mock_submission = Mock() mock_submission.url = test_url test_site = GifDeliveryNetwork(mock_submission) resources = test_site.find_resources() From f13e029e478c9fcd1f241ec6d580cc6e917dd273 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 21:04:37 +1000 Subject: [PATCH 151/276] Refactor downloader --- .../site_downloaders/gif_delivery_network.py | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index de627a7..15ee76f 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import re from typing import Optional import requests @@ -17,30 +18,20 @@ class GifDeliveryNetwork(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - try: - media_url = self._get_link(self.post.url) - except IndexError: - raise NotADownloadableLinkError("Could not read the page source") - + media_url = self._get_link(self.post.url) return [Resource(self.post, media_url, '.mp4')] @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source and return it""" - if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]: + if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): return url - if url[-1:] == '/': - url = url[:-1] - - url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1] page_source = requests.get(url).text - soup = BeautifulSoup(page_source, "html.parser") - attributes = {"id": "mp4Source", "type": "video/mp4"} - content = soup.find("source", attrs=attributes) + soup = BeautifulSoup(page_source, 'html.parser') + content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) - if content is None: - raise NotADownloadableLinkError("Could not read the page source") + if content is None or content.get('src') is None: + raise NotADownloadableLinkError('Could not read the page source') - return content["src"] + return content.get('src') From a9028434c40d4bde428b175640ba0ebd6a9756ce Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 Mar 2021 22:28:31 +1000 Subject: [PATCH 152/276] Generalise test --- bulkredditdownloader/tests/test_downloader.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git 
a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index d4a2e03..cf5c4fa 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -271,17 +271,25 @@ def test_get_user_saved(downloader_mock: MagicMock, authenticated_reddit_instanc @pytest.mark.online @pytest.mark.reddit -def test_download_submission(downloader_mock: MagicMock, reddit_instance: praw.Reddit, tmp_path: Path): +@pytest.mark.parametrize(('test_submission_id', 'expected_files_len'), ( + ('ljyy27', 4), +)) +def test_download_submission( + test_submission_id: str, + expected_files_len: int, + downloader_mock: MagicMock, + reddit_instance: praw.Reddit, + tmp_path: Path): downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True downloader_mock.args.set_folder_scheme = '' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path downloader_mock.master_hash_list = [] - submission = downloader_mock.reddit_instance.submission(id='ljyy27') + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) - assert len(folder_contents) == 4 + assert len(folder_contents) == expected_files_len @pytest.mark.online From 0929469befeb78a128933e0248d893560cc2866c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 18 Mar 2021 11:01:34 +1000 Subject: [PATCH 153/276] Fix Gallery --- .../site_downloaders/gallery.py | 2 - .../tests/downloaders/test_gallery.py | 44 ++++++++++++------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 2ba4af2..814564d 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -38,6 +38,4 @@ class Gallery(BaseDownloader): links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) links = [link.get('href') for link in links] - pattern = re.compile(r'(.*?)\?.*$') - links = [re.search(pattern, link).group(1) for link in links] return links diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/downloaders/test_gallery.py index 49e3e83..93326a0 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/downloaders/test_gallery.py @@ -4,23 +4,30 @@ import praw import pytest -from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_downloaders.gallery import Gallery @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected'), ( ('https://www.reddit.com/gallery/m6lvrh', { - 'https://preview.redd.it/18nzv9ch0hn61.jpg', - 'https://preview.redd.it/jqkizcch0hn61.jpg', - 'https://preview.redd.it/k0fnqzbh0hn61.jpg', - 'https://preview.redd.it/m3gamzbh0hn61.jpg' + 'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&' + 'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8', + 'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&' + 'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805', + 'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&' + 'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76', + 'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&' + 
'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444' }), ('https://www.reddit.com/gallery/ljyy27', { - 'https://preview.redd.it/04vxj25uqih61.png', - 'https://preview.redd.it/0fnx83kpqih61.png', - 'https://preview.redd.it/7zkmr1wqqih61.png', - 'https://preview.redd.it/u37k5gxrqih61.png' + 'https://preview.redd.it/04vxj25uqih61.png?width=92&' + 'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4', + 'https://preview.redd.it/0fnx83kpqih61.png?width=241&' + 'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a', + 'https://preview.redd.it/7zkmr1wqqih61.png?width=237&' + 'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0', + 'https://preview.redd.it/u37k5gxrqih61.png?width=443&' + 'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862' }), )) def test_gallery_get_links(test_url: str, expected: set[str]): @@ -30,13 +37,20 @@ def test_gallery_get_links(test_url: str, expected: set[str]): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'expected_len'), ( - ('m6lvrh', 4), - ('ljyy27', 4), +@pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), ( + ('m6lvrh', {'6c8a892ae8066cbe119218bcaac731e1', + '93ce177f8cb7994906795f4615114d13', + '9a293adf19354f14582608cf22124574', + 'b73e2c3daee02f99404644ea02f1ae65'}), + ('ljyy27', {'1bc38bed88f9c4770e22a37122d5c941', + '2539a92b78f3968a069df2dffe2279f9', + '37dea50281c219b905e46edeefc1a18d', + 'ec4924cf40549728dcf53dd40bc7a73c'}), )) -def test_gallery(test_submission_id: str, expected_len: int, reddit_instance: praw.Reddit): +def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) gallery = Gallery(test_submission) results = gallery.find_resources() - assert len(results) == expected_len - assert all([isinstance(result, Resource) for result in results]) + [res.download() for res in results] + hashes = [res.hash.hexdigest() for res in results] + assert set(hashes) == expected_hashes From 540b237da6bd0b9c04489d109355d774715e93bc Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 18 Mar 2021 19:10:27 +1000 Subject: [PATCH 154/276] Fix Erome downloader --- .../site_downloaders/erome.py | 22 +++++++++---------- .../tests/downloaders/test_erome.py | 21 +++++++++++++----- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 8dab973..bb4373b 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -21,20 +21,20 @@ class Erome(BaseDownloader): super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - images = self._get_links(self.post.url) - if not images: + links = self._get_links(self.post.url) + if not links: raise NotADownloadableLinkError('Erome parser could not find any links') - if len(images) == 1: - image = images.pop() - image = self._validate_url(image) - return [Resource(self.post, image)] + if len(links) == 1: + link = links.pop() + link = self._validate_url(link) + return [Resource(self.post, link)] else: out = [] - for i, image in enumerate(images): - image = self._validate_url(image) - out.append(Resource(self.post, image)) + for i, link in enumerate(links): + link = self._validate_url(link) + out.append(Resource(self.post, link)) return out @staticmethod @@ -47,8 +47,8 @@ class 
Erome(BaseDownloader): def _get_links(url: str) -> set[str]: page = requests.get(url) soup = bs4.BeautifulSoup(page.text) - front_images = soup.find_all('img', attrs={'class': 'img-front'}) - out = [im.get('src') for im in front_images] + front_images = soup.find_all('img', attrs={'class': 'lasyload'}) + out = [im.get('data-src') for im in front_images] videos = soup.find_all('source') out.extend([vid.get('src') for vid in videos]) diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/downloaders/test_erome.py index 6c8d5e0..b6d1d54 100644 --- a/bulkredditdownloader/tests/downloaders/test_erome.py +++ b/bulkredditdownloader/tests/downloaders/test_erome.py @@ -14,7 +14,7 @@ from bulkredditdownloader.site_downloaders.erome import Erome 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99.jpg', 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', ) - ), + ), ('https://www.erome.com/a/ORhX0FZz', ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9.jpg', 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', @@ -39,15 +39,24 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): @pytest.mark.online @pytest.mark.slow -@pytest.mark.parametrize(('test_url', 'expected_number_of_resources', 'expected_hashes'), ( - ('https://www.erome.com/a/vqtPuLXh', 2, ('5da2a8d60d87bed279431fdec8e7d72f', '243d17b52a728911b022829badbc524e')), +@pytest.mark.parametrize(('test_url', 'expected_hashes'), ( + ('https://www.erome.com/a/vqtPuLXh', {'5da2a8d60d87bed279431fdec8e7d72f'}), + ('https://www.erome.com/i/ItASD33e', {'b0d73fedc9ce6995c2f2c4fdb6f11eff'}), + ('https://www.erome.com/a/lGrcFxmb', { + '0e98f9f527a911dcedde4f846bb5b69f', + '25696ae364750a5303fc7d7dc78b35c1', + '63775689f438bd393cde7db6d46187de', + 'a1abf398cfd4ef9cfaf093ceb10c746a', + 'bd9e1a4ea5ef0d6ba47fb90e337c2d14' + }), )) -def test_download_resource(test_url: str, expected_number_of_resources: int, expected_hashes: tuple[str]): +def test_download_resource(test_url: str, expected_hashes: tuple[str]): + # Can't compare hashes for this test, Erome doesn't return the exact same file from request to request so the hash + # will change back and forth randomly mock_submission = MagicMock() mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - assert len(resources) == expected_number_of_resources [res.download() for res in resources] resource_hashes = [res.hash.hexdigest() for res in resources] - assert set(resource_hashes) == set(expected_hashes) + assert len(resource_hashes) == len(expected_hashes) From 03c8d524a4b767ceb4b18522ab95ed5b86082697 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 19 Mar 2021 22:13:56 +1000 Subject: [PATCH 155/276] Refactor Gfycat class --- .../site_downloaders/gfycat.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index af1f45b..8eaf03b 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -22,22 +22,15 @@ class Gfycat(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source and return it """ if re.match(r'\.(webm|mp4|gif)$', url): return url - if url.endswith('/'): - url = url[:-1] - - url = "https://gfycat.com/" + url.split('/')[-1] + gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) + url = 'https://gfycat.com/' + gfycat_id page_source = requests.get(url).text + 
soup = BeautifulSoup(page_source, 'html.parser') + content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) - soup = BeautifulSoup(page_source, "html.parser") - attributes = {"data-react-helmet": "true", "type": "application/ld+json"} - content = soup.find("script", attrs=attributes) - - if content is None: - return super()._get_link(url) - - return json.loads(content.contents[0])["video"]["contentUrl"] + out = json.loads(content.contents[0]).get('video').get('contentUrl') + return out From 6a1e652628189c943ea33d59dcff738d06d45217 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 19 Mar 2021 22:28:41 +1000 Subject: [PATCH 156/276] Refactor redgifs --- .../site_downloaders/redgifs.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 426378a..e4ee567 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json +import re from typing import Optional import requests @@ -22,14 +23,11 @@ class Redgifs(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - """Extract direct link to the video from page's source and return it""" - if '.webm' in url or '.mp4' in url or '.gif' in url: + if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): return url - if url[-1:] == '/': - url = url[:-1] - - url = "https://redgifs.com/watch/" + url.split('/')[-1] + redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) + url = 'https://redgifs.com/watch/' + redgif_id headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' @@ -38,11 +36,11 @@ class Redgifs(GifDeliveryNetwork): page_source = requests.get(url, headers=headers).text - soup = BeautifulSoup(page_source, "html.parser") - attributes = {"data-react-helmet": "true", "type": "application/ld+json"} - content = soup.find("script", attrs=attributes) + soup = BeautifulSoup(page_source, 'html.parser') + content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) if content is None: - raise NotADownloadableLinkError("Could not read the page source") + raise NotADownloadableLinkError('Could not read the page source') - return json.loads(content.contents[0])["video"]["contentUrl"] + out = json.loads(content.contents[0])['video']['contentUrl'] + return out From 902f796178d14b138bfa2b39795f91877b69a2b2 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 19 Mar 2021 22:49:25 +1000 Subject: [PATCH 157/276] Refactor Erome class --- .../site_downloaders/erome.py | 23 +++++-------------- .../tests/downloaders/test_erome.py | 15 ++---------- 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index bb4373b..6d6e76d 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -25,23 +25,12 @@ class Erome(BaseDownloader): if not links: raise NotADownloadableLinkError('Erome parser could not find any links') - if len(links) == 1: - link = links.pop() - link = self._validate_url(link) - return [Resource(self.post, link)] - - else: - out = [] - for i, link in enumerate(links): - link = self._validate_url(link) - out.append(Resource(self.post, link)) - return out - - @staticmethod - def _validate_url(image): - if not 
re.match(r'https?://.*', image): - image = "https://" + image - return image + out = [] + for link in links: + if not re.match(r'https?://.*', link): + link = 'https://' + link + out.append(Resource(self.post, link)) + return out @staticmethod def _get_links(url: str) -> set[str]: diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/downloaders/test_erome.py index b6d1d54..9500cf8 100644 --- a/bulkredditdownloader/tests/downloaders/test_erome.py +++ b/bulkredditdownloader/tests/downloaders/test_erome.py @@ -10,25 +10,14 @@ from bulkredditdownloader.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( - ('https://www.erome.com/a/vqtPuLXh', ( - 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99.jpg', - 'https://s6.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', - ) - ), + ('https://www.erome.com/a/vqtPuLXh', ('https://s6.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',)), ('https://www.erome.com/a/ORhX0FZz', - ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9.jpg', - 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm.jpg', + ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp.jpg', 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/LruobtMs.jpg', 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5.jpg', 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z.jpg', 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7.jpg', 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4') ), )) From 2b885451e79c7d09c29729601ecbe51c0e61f862 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 20 Mar 2021 12:03:53 +1000 Subject: [PATCH 158/276] Add ability to make hard links for duplicates --- bulkredditdownloader/__main__.py | 1 + bulkredditdownloader/configuration.py | 1 + bulkredditdownloader/downloader.py | 39 ++++++++------- bulkredditdownloader/tests/test_downloader.py | 47 ++++++++++++++----- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 6d24303..1e40bb0 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -43,6 +43,7 @@ def cli(): @cli.command('download') +@click.option('--make-hard-links', is_flag=True, default=None) @click.option('--no-dupes', is_flag=True, default=None) @click.option('--search-existing', is_flag=True, default=None) @click.option('--set-file-scheme', default=None, type=str) diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index 09d1b8a..7c298b4 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -31,6 +31,7 @@ class Configuration(Namespace): self.upvoted: bool = False self.user: Optional[str] = None self.verbose: int = 0 + self.make_hard_links = False # Archiver-specific options self.format = 'json' diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index e810086..dc9c301 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -74,9 +74,10 @@ class RedditDownloader: self._create_reddit_instance() self._resolve_user_name() - self.master_hash_list = [] if self.args.search_existing: - 
self.master_hash_list.extend(self.scan_existing_files(self.download_directory)) + self.master_hash_list = self.scan_existing_files(self.download_directory) + else: + self.master_hash_list = {} self.authenticator = self._create_authenticator() logger.log(9, 'Created site authenticator') @@ -341,27 +342,33 @@ class RedditDownloader: logger.error( f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}') return - if res.hash.hexdigest() in self.master_hash_list and self.args.no_dupes: - logger.warning(f'Resource from "{res.url}" and hash "{res.hash.hexdigest()}" downloaded elsewhere') - else: - # TODO: consider making a hard link/symlink here - destination.parent.mkdir(parents=True, exist_ok=True) - with open(destination, 'wb') as file: - file.write(res.content) - logger.debug(f'Written file to {destination}') - self.master_hash_list.append(res.hash.hexdigest()) - logger.debug(f'Hash added to master list: {res.hash.hexdigest()}') - logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') + resource_hash = res.hash.hexdigest() + if resource_hash in self.master_hash_list: + if self.args.no_dupes: + logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere') + return + elif self.args.make_hard_links: + self.master_hash_list[resource_hash].link_to(destination) + logger.debug( + f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}') + return + destination.parent.mkdir(parents=True, exist_ok=True) + with open(destination, 'wb') as file: + file.write(res.content) + logger.debug(f'Written file to {destination}') + self.master_hash_list[resource_hash] = destination + logger.debug(f'Hash added to master list: {resource_hash}') + logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}') @staticmethod - def scan_existing_files(directory: Path) -> list[str]: + def scan_existing_files(directory: Path) -> dict[str, Path]: files = [] for (dirpath, dirnames, filenames) in os.walk(directory): files.extend([Path(dirpath, file) for file in filenames]) logger.info(f'Calculating hashes for {len(files)} files') - hash_list = [] + hash_list = {} for existing_file in files: with open(existing_file, 'rb') as file: - hash_list.append(hashlib.md5(file.read()).hexdigest()) + hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file logger.log(9, f'Hash calculated for file at {existing_file}') return hash_list diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index cf5c4fa..e263a5d 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -27,12 +27,13 @@ def args() -> Configuration: @pytest.fixture() -def downloader_mock(args: argparse.Namespace): - mock_downloader = MagicMock() - mock_downloader.args = args - mock_downloader._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name - mock_downloader._split_args_input = RedditDownloader._split_args_input - return mock_downloader +def downloader_mock(args: Configuration): + downloader_mock = MagicMock() + downloader_mock.args = args + downloader_mock._sanitise_subreddit_name = RedditDownloader._sanitise_subreddit_name + downloader_mock._split_args_input = RedditDownloader._split_args_input + downloader_mock.master_hash_list = {} + return downloader_mock def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]): @@ -285,7 +286,6 @@ def 
test_download_submission( downloader_mock.args.set_folder_scheme = '' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path - downloader_mock.master_hash_list = [] submission = downloader_mock.reddit_instance.submission(id=test_submission_id) RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) @@ -305,9 +305,8 @@ def test_download_submission_file_exists( downloader_mock.args.set_folder_scheme = '' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path - downloader_mock.master_hash_list = [] submission = downloader_mock.reddit_instance.submission(id='m1hqw6') - Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6_1.png').touch() + Path(tmp_path, 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png').touch() RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) output = capsys.readouterr() @@ -329,7 +328,7 @@ def test_download_submission_hash_exists( downloader_mock.args.no_dupes = True downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path - downloader_mock.master_hash_list = ['a912af8905ae468e0121e9940f797ad7'] + downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None} submission = downloader_mock.reddit_instance.submission(id='m1hqw6') RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) @@ -356,8 +355,7 @@ def test_sanitise_subreddit_name(test_name: str, expected: str): def test_search_existing_files(): results = RedditDownloader.scan_existing_files(Path('.')) - assert all([isinstance(result, str) for result in results]) - assert len(results) >= 40 + assert len(results.keys()) >= 40 @pytest.mark.parametrize(('test_subreddit_entries', 'expected'), ( @@ -370,3 +368,28 @@ def test_search_existing_files(): def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: set[str]): results = RedditDownloader._split_args_input(test_subreddit_entries) assert results == expected + + +@pytest.mark.online +@pytest.mark.reddit +def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit): + downloader_mock.reddit_instance = reddit_instance + downloader_mock.args.make_hard_links = True + downloader_mock.download_directory = tmp_path + downloader_mock.args.set_folder_scheme = '' + downloader_mock.args.set_file_scheme = '{POSTID}' + downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) + submission = downloader_mock.reddit_instance.submission(id='m1hqw6') + original = Path(tmp_path, 'm1hqw6.png') + + RedditDownloader._download_submission(downloader_mock, submission) + assert original.exists() + + downloader_mock.args.set_file_scheme = 'test2_{POSTID}' + downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) + RedditDownloader._download_submission(downloader_mock, submission) + test_file_1_stats = original.stat() + test_file_2_inode = Path(tmp_path, 'test2_m1hqw6.png').stat().st_ino + + assert test_file_1_stats.st_nlink == 2 + assert test_file_1_stats.st_ino == test_file_2_inode From c9d2a23a5f96b643ce3386c7e5891fdd5daae6e9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 20 Mar 
2021 14:01:36 +1000 Subject: [PATCH 159/276] Update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 8f694e7..90fed77 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,9 @@ The following options are common between both the `archive` and `download` comma The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory. +- `--make-hard-links` + - This flag will create hard links to an existing file when a duplicate is downloaded + - This will make the file appear in multiple directories while only taking the space of a single instance - `--no-dupes` - This flag will not redownload files if they already exist somewhere in the root folder tree - This is calculated by MD5 hash From ba2ab25c2c4dce46618a8c7b4e60c5773d902a37 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 20 Mar 2021 14:05:07 +1000 Subject: [PATCH 160/276] Rename options --- README.md | 4 ++-- bulkredditdownloader/__main__.py | 4 ++-- bulkredditdownloader/configuration.py | 4 ++-- bulkredditdownloader/downloader.py | 2 +- bulkredditdownloader/tests/test_downloader.py | 20 +++++++++---------- .../tests/test_integration.py | 2 +- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 90fed77..7c825d5 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,11 @@ The following options apply only to the `download` command. This command downloa - This is calculated by MD5 hash - `--search-existing` - This will make the BDFR compile the hashes for every file in `directory` and store them to remove duplicates if `--no-dupes` is also supplied -- `--set-file-scheme` +- `--file-scheme` - Sets the scheme for files - Default is `{REDDITOR}_{TITLE}_{POSTID}` - See[Folder and File Name Schemes](#folder-and-file-name-schemes) for more details -- `--set-folder-scheme` +- `--folder-scheme` - Sets the scheme for folders - Default is `{SUBREDDIT}` - See[Folder and File Name Schemes](#folder-and-file-name-schemes) for more details diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 1e40bb0..3d87a72 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -46,8 +46,8 @@ def cli(): @click.option('--make-hard-links', is_flag=True, default=None) @click.option('--no-dupes', is_flag=True, default=None) @click.option('--search-existing', is_flag=True, default=None) -@click.option('--set-file-scheme', default=None, type=str) -@click.option('--set-folder-scheme', default=None, type=str) +@click.option('--file-scheme', default=None, type=str) +@click.option('--folder-scheme', default=None, type=str) @click.option('--skip', default=None, multiple=True) @click.option('--skip-domain', default=None, multiple=True) @_add_common_options diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index 7c298b4..1227590 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -20,8 +20,8 @@ class Configuration(Namespace): self.saved: bool = False self.search: Optional[str] = None self.search_existing: bool = False - self.set_file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' - self.set_folder_scheme: str = '{SUBREDDIT}' + self.file_scheme: str = '{REDDITOR}_{TITLE}_{POSTID}' + self.folder_scheme: str = '{SUBREDDIT}' self.skip: list[str] = [] self.skip_domain: list[str] = [] self.sort: str = 'hot' diff --git 
a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index dc9c301..f536adb 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -289,7 +289,7 @@ class RedditDownloader: return True def _create_file_name_formatter(self) -> FileNameFormatter: - return FileNameFormatter(self.args.set_file_scheme, self.args.set_folder_scheme) + return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme) def _create_time_filter(self) -> RedditTypes.TimeType: try: diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index e263a5d..9b05c9e 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -101,8 +101,8 @@ def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: Magi ('{POSTID}', ''), )) def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): - downloader_mock.args.set_file_scheme = test_file_scheme - downloader_mock.args.set_folder_scheme = test_folder_scheme + downloader_mock.args.file_scheme = test_file_scheme + downloader_mock.args.folder_scheme = test_folder_scheme result = RedditDownloader._create_file_name_formatter(downloader_mock) assert isinstance(result, FileNameFormatter) @@ -116,8 +116,8 @@ def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: s ('test', '{SUBREDDIT}'), )) def test_create_file_name_formatter_bad(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): - downloader_mock.args.set_file_scheme = test_file_scheme - downloader_mock.args.set_folder_scheme = test_folder_scheme + downloader_mock.args.file_scheme = test_file_scheme + downloader_mock.args.folder_scheme = test_folder_scheme with pytest.raises(BulkDownloaderException): RedditDownloader._create_file_name_formatter(downloader_mock) @@ -283,7 +283,7 @@ def test_download_submission( tmp_path: Path): downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.set_folder_scheme = '' + downloader_mock.args.folder_scheme = '' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path submission = downloader_mock.reddit_instance.submission(id=test_submission_id) @@ -302,7 +302,7 @@ def test_download_submission_file_exists( setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.set_folder_scheme = '' + downloader_mock.args.folder_scheme = '' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path submission = downloader_mock.reddit_instance.submission(id='m1hqw6') @@ -324,7 +324,7 @@ def test_download_submission_hash_exists( setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True - downloader_mock.args.set_folder_scheme = '' + downloader_mock.args.folder_scheme = '' downloader_mock.args.no_dupes = True downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path @@ -376,8 +376,8 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta downloader_mock.reddit_instance = reddit_instance 
downloader_mock.args.make_hard_links = True downloader_mock.download_directory = tmp_path - downloader_mock.args.set_folder_scheme = '' - downloader_mock.args.set_file_scheme = '{POSTID}' + downloader_mock.args.folder_scheme = '' + downloader_mock.args.file_scheme = '{POSTID}' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) submission = downloader_mock.reddit_instance.submission(id='m1hqw6') original = Path(tmp_path, 'm1hqw6.png') @@ -385,7 +385,7 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta RedditDownloader._download_submission(downloader_mock, submission) assert original.exists() - downloader_mock.args.set_file_scheme = 'test2_{POSTID}' + downloader_mock.args.file_scheme = 'test2_{POSTID}' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) RedditDownloader._download_submission(downloader_mock, submission) test_file_1_stats = original.stat() diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index 600e983..a50d7f7 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -115,7 +115,7 @@ def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path): @pytest.mark.authenticated @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( - ['--user', 'me', '-L', 10, '--set-folder-scheme', ''], + ['--user', 'me', '-L', 10, '--folder-scheme', ''], )) def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp_path: Path): runner = CliRunner() From 1215bc69de45a0ccb9329b847c16d5b07289ed97 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 11:10:06 +1000 Subject: [PATCH 161/276] Refactor Imgur class to be hardier --- .../site_downloaders/imgur.py | 98 +++++++------------ 1 file changed, 36 insertions(+), 62 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index af09c3f..73abba5 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -1,99 +1,73 @@ #!/usr/bin/env python3 import json -import logging +import re from typing import Optional +import bs4 import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.site_downloaders.direct import Direct - -logger = logging.getLogger(__name__) class Imgur(BaseDownloader): - imgur_image_domain = "https://i.imgur.com/" def __init__(self, post: Submission): super().__init__(post) self.raw_data = {} def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - link = self.post.url - - if link.endswith(".gifv"): - direct_thing = Direct(self.post) - return direct_thing.find_resources(authenticator) - - self.raw_data = self._get_data(link) - - if self._is_album(): - if self.raw_data["album_images"]["count"] != 1: - out = self._download_album(self.raw_data["album_images"]) - else: - out = 
self._download_image(self.raw_data["album_images"]["images"][0]) - else: - out = self._download_image(self.raw_data) - return out - - def _download_album(self, images: dict): - images_length = images["count"] + self.raw_data = self._get_data(self.post.url) out = [] - - for i in range(images_length): - extension = self._validate_extension(images["images"][i]["ext"]) - image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension - out.append(Resource(self.post, image_url)) + if 'album_images' in self.raw_data: + images = self.raw_data['album_images'] + for image in images['images']: + out.append(self._download_image(image)) + else: + out.append(self._download_image(self.raw_data)) return out - def _download_image(self, image: dict): - extension = self._validate_extension(image["ext"]) - image_url = self.imgur_image_domain + image["hash"] + extension - return [Resource(self.post, image_url)] - - def _is_album(self) -> bool: - return "album_images" in self.raw_data + def _download_image(self, image: dict) -> Resource: + image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext']) + return Resource(self.post, image_url) @staticmethod def _get_data(link: str) -> dict: - cookies = {"over18": "1", "postpagebeta": "0"} - res = requests.get(link, cookies=cookies) + res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'}) + if res.status_code != 200: - raise ResourceNotFound(f"Server responded with {res.status_code} to {link}") - page_source = requests.get(link, cookies=cookies).text + raise ResourceNotFound(f'Server responded with {res.status_code} to {link}') - starting_string = "image : " - ending_string = "group :" + soup = bs4.BeautifulSoup(res.text) + scripts = soup.find_all('script', attrs={'type': 'text/javascript'}) + scripts = [script.string.replace('\n', '') for script in scripts if script.string] - starting_string_lenght = len(starting_string) - try: - start_index = page_source.index(starting_string) + starting_string_lenght - end_index = page_source.index(ending_string, start_index) - except ValueError: - raise NotADownloadableLinkError( - f"Could not read the page source on {link}") + script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'') + chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts)) + if len(chosen_script) != 1: + raise NotADownloadableLinkError(f'Could not read page source from {link}') + else: + chosen_script = chosen_script[0] - while page_source[end_index] != "}": - end_index -= 1 - try: - data = page_source[start_index:end_index + 2].strip()[:-1] - except IndexError: - page_source[end_index + 1] = '}' - data = page_source[start_index:end_index + 3].strip()[:-1] + outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);') + image_dict = re.search(outer_regex, chosen_script).group(1) - return json.loads(data) + inner_regex = re.compile(r'image\s*:(.*),\s*group') + image_dict = re.search(inner_regex, image_dict).group(1) + + image_dict = json.loads(image_dict) + return image_dict @staticmethod def _validate_extension(extension_suffix: str) -> str: - possible_extensions = [".jpg", ".png", ".mp4", ".gif"] - for extension in possible_extensions: - if extension in extension_suffix: - return extension + possible_extensions = ('.jpg', '.png', '.mp4', '.gif') + selection = [ext for ext in possible_extensions if ext == extension_suffix] + if len(selection) == 1: + return selection[0] else: raise 
SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur') From 3ca2df067fce715e4f3e122ff91480b44909e408 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 18:58:32 +1000 Subject: [PATCH 162/276] Fix bs4 warning by specifying parser --- bulkredditdownloader/site_downloaders/erome.py | 2 +- bulkredditdownloader/site_downloaders/gallery.py | 2 +- bulkredditdownloader/site_downloaders/imgur.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index 6d6e76d..d9b48a3 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -35,7 +35,7 @@ class Erome(BaseDownloader): @staticmethod def _get_links(url: str) -> set[str]: page = requests.get(url) - soup = bs4.BeautifulSoup(page.text) + soup = bs4.BeautifulSoup(page.text, 'html.parser') front_images = soup.find_all('img', attrs={'class': 'lasyload'}) out = [im.get('data-src') for im in front_images] diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 814564d..22afc76 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -34,7 +34,7 @@ class Gallery(BaseDownloader): "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", } ) - soup = bs4.BeautifulSoup(page.text) + soup = bs4.BeautifulSoup(page.text, 'html.parser') links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) links = [link.get('href') for link in links] diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 73abba5..1d54feb 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -43,7 +43,7 @@ class Imgur(BaseDownloader): if res.status_code != 200: raise ResourceNotFound(f'Server responded with {res.status_code} to {link}') - soup = bs4.BeautifulSoup(res.text) + soup = bs4.BeautifulSoup(res.text, 'html.parser') scripts = soup.find_all('script', attrs={'type': 'text/javascript'}) scripts = [script.string.replace('\n', '') for script in scripts if script.string] From 72b2e30e90febe964233eb9db52f748142b6351c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 19:16:36 +1000 Subject: [PATCH 163/276] Add some things to README --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 7c825d5..b9251d6 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,13 @@ This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. +Some quick reference commands are: + + - `python3 -m bulkredditdownloader download --subreddit Python -L 10` + - `python3 -m bulkredditdownloader download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'` + - `python3 -m bulkredditdownloader download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links` + - `python3 -m bulkredditdownloader archive --subreddit all --format yaml -L 500 --folder-scheme ''` + ## Usage The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. 
These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user. @@ -66,10 +73,14 @@ The following options are common between both the `archive` and `download` comma - `-m, --multireddit` - This is the name of a multireddit to add as a source - Can be specified multiple times + - This can be done by using `-m` multiple times + - Multireddits can also be used to provide CSV multireddits e.g. `-m 'chess, favourites` - The specified multireddits must all belong to the user specified with the `--user` option - `-s, --subreddit` - This adds a subreddit as a source - Can be used mutliple times + - This can be done by using `-s` multiple times + - Subreddits can also be used to provide CSV subreddits e.g. `-m 'all, python, mindustry` - `-t, --time` - This is the time filter that will be applied to all applicable sources - This option does not apply to upvoted or saved posts when scraping from these sources From faa3c207133e5a0d48ddabf9b35752eaafefbd89 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 19:52:01 +1000 Subject: [PATCH 164/276] Fix bug where folder is created too late --- bulkredditdownloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index f536adb..db9fe66 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -343,6 +343,7 @@ class RedditDownloader: f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}') return resource_hash = res.hash.hexdigest() + destination.parent.mkdir(parents=True, exist_ok=True) if resource_hash in self.master_hash_list: if self.args.no_dupes: logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere') @@ -352,7 +353,6 @@ class RedditDownloader: logger.debug( f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}') return - destination.parent.mkdir(parents=True, exist_ok=True) with open(destination, 'wb') as file: file.write(res.content) logger.debug(f'Written file to {destination}') From 0d78e16b2d9e713fdcd9b7c59bd785c3ca73f615 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 20:09:21 +1000 Subject: [PATCH 165/276] Make sure to log all exceptions to file --- bulkredditdownloader/__main__.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 3d87a72..78932b7 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -56,9 +56,14 @@ def cli_download(context: click.Context, **_): config = Configuration() config.process_click_arguments(context) setup_logging(config.verbose) - reddit_downloader = RedditDownloader(config) - reddit_downloader.download() - logger.info('Program complete') + try: + reddit_downloader = RedditDownloader(config) + reddit_downloader.download() + except Exception: + logger.exception('Downloader exited unexpectedly') + raise + else: + logger.info('Program complete') @cli.command('archive') @@ -69,9 +74,14 @@ def cli_archive(context: click.Context, **_): config = Configuration() config.process_click_arguments(context) setup_logging(config.verbose) - reddit_archiver = Archiver(config) - reddit_archiver.download() - logger.info('Program complete') + try: + reddit_archiver = Archiver(config) + 
reddit_archiver.download() + except Exception: + logger.exception('Downloader exited unexpectedly') + raise + else: + logger.info('Program complete') def setup_logging(verbosity: int): From c13f2806fa47549438528f2efa6d15d7ae9f2d30 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 21 Mar 2021 20:31:18 +1000 Subject: [PATCH 166/276] Fix case where Gfycat fails after redirect --- bulkredditdownloader/site_downloaders/gfycat.py | 7 ++++++- bulkredditdownloader/tests/downloaders/test_gfycat.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index 8eaf03b..a5051ca 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -28,7 +28,12 @@ class Gfycat(GifDeliveryNetwork): gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id - page_source = requests.get(url).text + response = requests.get(url) + page_source = response.text + + if 'gifdeliverynetwork' in response.url: + return GifDeliveryNetwork._get_link(url) + soup = BeautifulSoup(page_source, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) diff --git a/bulkredditdownloader/tests/downloaders/test_gfycat.py b/bulkredditdownloader/tests/downloaders/test_gfycat.py index cca2f4a..a1b2a6d 100644 --- a/bulkredditdownloader/tests/downloaders/test_gfycat.py +++ b/bulkredditdownloader/tests/downloaders/test_gfycat.py @@ -13,6 +13,7 @@ from bulkredditdownloader.site_downloaders.gfycat import Gfycat @pytest.mark.parametrize(('test_url', 'expected_url'), ( ('https://gfycat.com/definitivecaninecrayfish', 'https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4'), ('https://gfycat.com/dazzlingsilkyiguana', 'https://giant.gfycat.com/DazzlingSilkyIguana.mp4'), + ('https://gfycat.com/webbedimpurebutterfly', 'https://thumbs2.redgifs.com/WebbedImpureButterfly.mp4'), )) def test_get_link(test_url: str, expected_url: str): result = Gfycat._get_link(test_url) From 7174121af25eb9f2b8e9a0c5fc1c269de9f4957c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sun, 21 Mar 2021 22:40:45 +0300 Subject: [PATCH 167/276] Add v2-beta to title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b9251d6..e16d1a3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Bulk Downloader for Reddit +# Bulk Downloader for Reddit v2-beta This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. 
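The redirect handling added in PATCH 166 above leans on two facts about the `requests` library: it follows HTTP redirects by default, and it records the final address on `response.url`. A minimal standalone sketch of that detection pattern follows; the example URL is a placeholder rather than one of the project's test links, and the prints stand in for the real fallback to `GifDeliveryNetwork._get_link`.

import requests

def final_url(url: str) -> str:
    # requests follows 3xx redirects automatically; .url holds the final address
    return requests.get(url).url

resolved = final_url('https://gfycat.com/someexamplegif')  # placeholder URL
if 'gifdeliverynetwork' in resolved:
    # Gfycat now serves this gif from gifdeliverynetwork.com; parse it there
    print('fall back to GifDeliveryNetwork._get_link')
else:
    print('parse the ld+json script tag on the Gfycat page')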
From 5b23f6688aa40a98be5193e0d812ee090e940013 Mon Sep 17 00:00:00 2001
From: Serene <33189705+Serene-Arc@users.noreply.github.com>
Date: Thu, 25 Mar 2021 16:28:08 +1000
Subject: [PATCH 168/276] Filter non-submissions (#212)

---
 bulkredditdownloader/downloader.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index db9fe66..8865720 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -316,6 +316,9 @@ class RedditDownloader:
             self._download_submission(submission)
 
     def _download_submission(self, submission: praw.models.Submission):
+        if not isinstance(submission, praw.models.Submission):
+            logger.warning(f'{submission.id} is not a submission')
+            return
         if not self.download_filter.check_url(submission.url):
             logger.debug(f'Download filter removed submission {submission.id} with URL {submission.url}')
             return

From 82860a33416c3ed3a92472804bcf54d01f3aefb7 Mon Sep 17 00:00:00 2001
From: Serene <33189705+Serene-Arc@users.noreply.github.com>
Date: Thu, 25 Mar 2021 16:28:22 +1000
Subject: [PATCH 169/276] Add specific requirements (#211)

---
 requirements.txt | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 291cf96..e7b5ff1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-appdirs
-bs4
-click
-dict2xml
-ffmpeg-python
-praw
-pyyaml
-requests
-youtube-dl
\ No newline at end of file
+appdirs>=1.4.4
+bs4>=0.0.1
+click>=7.1.2
+dict2xml>=1.7.0
+ffmpeg-python>=0.2.0
+praw>=7.2.0
+pyyaml>=5.4.1
+requests>=2.25.1
+youtube-dl>=2021.3.14

From d8b741191dc85800507ba56cf2d9963b8e0ecafc Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 22 Mar 2021 11:09:00 +1000
Subject: [PATCH 170/276] Fix typos

---
 README.md | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index e16d1a3..b6583cb 100644
--- a/README.md
+++ b/README.md
@@ -13,11 +13,11 @@ Some quick reference commands are:
 The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
 
-There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc linked. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
+There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
Many websites and links are supported for the downloader:
 
-  - Direct Links (links leading to a file)
+  - Direct links (links leading to a file)
   - Erome
   - Gfycat
   - Gif Delivery Network
@@ -36,10 +36,10 @@ The following options are common between both the `archive` and `download` comma
   - This is the directory to which the BDFR will download and place all files
 - `--authenticate`
   - This flag will make the BDFR attempt to use an authenticated Reddit session
-  - See[Authentication](#authentication) for more details
+  - See [Authentication](#authentication) for more details
 - `--config`
   - If the path to a configuration file is supplied with this option, the BDFR will use the specified config
-  - See[Configuration Files](#configuration) for more details
+  - See [Configuration Files](#configuration) for more details
 - `--saved`
   - This option will make the BDFR use the supplied user's saved posts list as a download source
   - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
@@ -74,13 +74,13 @@ The following options are common between both the `archive` and `download` comma
   - This is the name of a multireddit to add as a source
   - Can be specified multiple times
   - This can be done by using `-m` multiple times
-    - Multireddits can also be used to provide CSV multireddits e.g. `-m 'chess, favourites`
+    - Multireddits can also be used to provide CSV multireddits e.g. `-m 'chess, favourites'`
   - The specified multireddits must all belong to the user specified with the `--user` option
 - `-s, --subreddit`
   - This adds a subreddit as a source
   - Can be used multiple times
   - This can be done by using `-s` multiple times
-    - Subreddits can also be used to provide CSV subreddits e.g. `-s 'all, python, mindustry`
+    - Subreddits can also be used to provide CSV subreddits e.g. `-s 'all, python, mindustry'`
 - `-t, --time`
   - This is the time filter that will be applied to all applicable sources
   - This option does not apply to upvoted or saved posts when scraping from these sources
@@ -113,11 +113,11 @@ The following options apply only to the `download` command. This command downloa
 - `--file-scheme`
   - Sets the scheme for files
   - Default is `{REDDITOR}_{TITLE}_{POSTID}`
-  - See[Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
+  - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
 - `--folder-scheme`
   - Sets the scheme for folders
   - Default is `{SUBREDDIT}`
-  - See[Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
+  - See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
 - `--skip-domain`
   - This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
   - Can be specified multiple times
@@ -136,19 +136,21 @@ The following options are for the `archive` command specifically.
   - `xml`
   - `yaml`
 
-## Authentication
+## Authentication and Security
 
-The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token - based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
+The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required.
This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits. -To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on. +To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved or upvoted submissions and identify as you. + +If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on. ## Changing Permissions -Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it is allowed by the BDFR. +Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it *is* allowed by the BDFR. -The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is . Never grant more permissions than you absolutely need. +The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need. -For more details on the configuration file and the values therein, see[Configuration Files](#configuration). 
+For more details on the configuration file and the values therein, see [Configuration Files](#configuration).
 
 ## Folder and File Name Schemes
 
@@ -162,19 +164,30 @@ The naming and folder schemes for the BDFR are both completely customisable. A n
 - `TITLE`
 - `UPVOTES`
 
-Each of these can be enclosed in curly bracket, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Statis strings can also be included, such as `download_{POSTID}` which will not change from submission to submission.
+Each of these can be enclosed in curly brackets, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as `download_{POSTID}` which will not change from submission to submission. For example, the previous string will result in the following submission file names:
 
-At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name.
+  - `download_aaaaaa.png`
+  - `download_bbbbbb.png`
+
+At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme, however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name. In both cases, there will be no separation between all submissions.
 
 ## Configuration
 
-The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bulkredditdownloader` or `C:\Documents and Settings\\Application Data\BDFR\bulkredditdownloader`. On Mac OSX, this will be `~/Library/Application Support/bulkredditdownloader`. Lastly, on a Linux system, this will be `~/.local/share/bulkredditdownloader`.
+The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be:
+  - `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bulkredditdownloader` or
+  - `C:\Documents and Settings\\Application Data\BDFR\bulkredditdownloader`
+
+On Mac OSX, this will be:
+  - `~/Library/Application Support/bulkredditdownloader`.
+
+Lastly, on a Linux system, this will be:
+  - `~/.local/share/bulkredditdownloader`
 
 The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report.
 
 ### Configuration File
 
-The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys ** must ** be included in the configuration file supplied.
+The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied.
- `client_id` - `client_secret` From b3c51f49ce3bb9bb5b0cb2cc72c89e91b37428e9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 22 Mar 2021 11:17:08 +1000 Subject: [PATCH 171/276] Copy config file to config directory first time --- bulkredditdownloader/downloader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 8865720..058d258 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -7,6 +7,7 @@ import importlib.resources import logging import os import re +import shutil import socket from datetime import datetime from enum import Enum, auto @@ -142,7 +143,6 @@ class RedditDownloader: Path('./default_config.cfg'), Path(self.config_directory, 'config.cfg'), Path(self.config_directory, 'default_config.cfg'), - list(importlib.resources.path('bulkredditdownloader', 'default_config.cfg').gen)[0], ] self.config_location = None for path in possible_paths: @@ -150,6 +150,10 @@ class RedditDownloader: self.config_location = path logger.debug(f'Loading configuration from {path}') break + if not self.config_location: + self.config_location = list(importlib.resources.path('bulkredditdownloader', 'default_config.cfg').gen)[0] + shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) + logger.debug('Copied default config file from module to config folder') if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') self.cfg_parser.read(self.config_location) From 0007912ad57df399a7120fa34d37deccbd883c0e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 22 Mar 2021 14:21:56 +1000 Subject: [PATCH 172/276] Scrub windows paths for invalid characters --- bulkredditdownloader/file_name_formatter.py | 15 ++++++++++++++- .../tests/test_file_name_formatter.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index f36284e..dbb8d08 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -2,6 +2,7 @@ # coding=utf-8 import logging +import platform import re from pathlib import Path from typing import Optional @@ -41,6 +42,10 @@ class FileNameFormatter: logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') + + if platform.system() == 'Windows': + result = FileNameFormatter._format_for_windows(result) + return result def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: @@ -51,6 +56,7 @@ class FileNameFormatter: ending = index + resource.extension file_name = str(self._format_name(resource.source_submission, self.file_format_string)) file_name = self._limit_file_name_length(file_name, ending) + try: file_path = Path(subfolder, file_name) except TypeError: @@ -76,8 +82,15 @@ class FileNameFormatter: out.append((self.format_path(res, destination_directory, i), res)) return out - @ staticmethod + @staticmethod def validate_string(test_string: str) -> bool: if not test_string: return False return any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms]) + + @staticmethod + def _format_for_windows(input_string: str) -> str: + invalid_characters = r'<>:"\/|?*' + for char in invalid_characters: + input_string = input_string.replace(char, '') + return input_string diff --git 
a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index b376e9d..55909df 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -169,3 +169,16 @@ def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path): result = test_formatter.format_path(test_resource, tmp_path) result.parent.mkdir(parents=True) result.touch() + + +@pytest.mark.parametrize(('test_string', 'expected'), ( + ('test', 'test'), + ('test.png', 'test.png'), + ('test*', 'test'), + ('test**', 'test'), + ('test?*', 'test'), + ('test_???.png', 'test_.png'), +)) +def test_format_file_name_for_windows(test_string: str, expected: str): + result = FileNameFormatter._format_for_windows(test_string) + assert result == expected From 79105f9f84ff38dff6f38bd179a0116818f9665b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 25 Mar 2021 21:51:31 +1000 Subject: [PATCH 173/276] Fix typo --- bulkredditdownloader/tests/downloaders/test_direct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/downloaders/test_direct.py b/bulkredditdownloader/tests/downloaders/test_direct.py index 32cc483..9f8163a 100644 --- a/bulkredditdownloader/tests/downloaders/test_direct.py +++ b/bulkredditdownloader/tests/downloaders/test_direct.py @@ -15,7 +15,7 @@ from bulkredditdownloader.site_downloaders.direct import Direct ('https://giant.gfycat.com/DazzlingSilkyIguana.mp4', '808941b48fc1e28713d36dd7ed9dc648'), )) def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock + mock_submission = Mock() mock_submission.url = test_url test_site = Direct(mock_submission) resources = test_site.find_resources() From 771cc711e42ea17c49500a4572fb913967eadedf Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Mar 2021 10:42:51 +1000 Subject: [PATCH 174/276] Calculate existing hashes in parallel --- bulkredditdownloader/downloader.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 058d258..03c468c 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -11,6 +11,7 @@ import shutil import socket from datetime import datetime from enum import Enum, auto +from multiprocessing import Pool from pathlib import Path from typing import Iterator @@ -31,6 +32,12 @@ from bulkredditdownloader.site_downloaders.download_factory import DownloadFacto logger = logging.getLogger(__name__) +def _calc_hash(existing_file: Path): + with open(existing_file, 'rb') as file: + file_hash = hashlib.md5(file.read()).hexdigest() + return existing_file, file_hash + + class RedditTypes: class SortType(Enum): HOT = auto() @@ -373,9 +380,10 @@ class RedditDownloader: for (dirpath, dirnames, filenames) in os.walk(directory): files.extend([Path(dirpath, file) for file in filenames]) logger.info(f'Calculating hashes for {len(files)} files') - hash_list = {} - for existing_file in files: - with open(existing_file, 'rb') as file: - hash_list[hashlib.md5(file.read()).hexdigest()] = existing_file - logger.log(9, f'Hash calculated for file at {existing_file}') + + pool = Pool(15) + results = pool.map(_calc_hash, files) + pool.close() + + hash_list = {res[1]: res[0] for res in results} return hash_list From 1cb8240eb61dcc64a9727899470d4e0003474c79 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Mar 2021 21:12:52 +1000 Subject: 
[PATCH 175/276] Remove incorrect test marks --- bulkredditdownloader/tests/test_file_name_formatter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 55909df..faa418f 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -156,9 +156,7 @@ def test_limit_filename_length(test_filename: str, test_ending: str): assert isinstance(result, str) -@pytest.mark.online -@pytest.mark.reddit -def test_shorten_filenames(reddit_instance: praw.Reddit, tmp_path: Path): +def test_shorten_filenames(tmp_path: Path): test_submission = MagicMock() test_submission.title = 'A' * 300 test_submission.author.name = 'test' From 3cb51e638b6c9db5fd2c6afa21488a75b6133b55 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Mar 2021 21:38:04 +1000 Subject: [PATCH 176/276] Preserve appended ID when shortening file names --- bulkredditdownloader/file_name_formatter.py | 3 +++ .../tests/test_file_name_formatter.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index dbb8d08..78af753 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -65,6 +65,9 @@ class FileNameFormatter: @staticmethod def _limit_file_name_length(filename: str, ending: str) -> str: + possible_id = re.search(r'((?:_\w{6})?$)', filename).group(1) + ending = possible_id + ending + filename = filename.strip(possible_id) max_length_chars = 255 - len(ending) max_length_bytes = 255 - len(ending.encode('utf-8')) while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes: diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index faa418f..a5676a5 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -156,6 +156,23 @@ def test_limit_filename_length(test_filename: str, test_ending: str): assert isinstance(result, str) +@pytest.mark.parametrize(('test_filename', 'test_ending', 'expected_end'), ( + ('test_aaaaaa', '_1.png', 'test_aaaaaa_1.png'), + ('test_aaaaaa', '.png', 'test_aaaaaa.png'), + ('test', '_1.png', 'test_1.png'), + ('test_m1hqw6', '_1.png', 'test_m1hqw6_1.png'), + ('A' * 300 + '_bbbccc', '.png', '_bbbccc.png'), + ('A' * 300 + '_bbbccc', '_1000.jpeg', '_bbbccc_1000.jpeg'), + ('😍💕✨' * 100 + '_aaa1aa', '_1.png', '_aaa1aa_1.png'), +)) +def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str, expected_end: str): + result = FileNameFormatter._limit_file_name_length(test_filename, test_ending) + assert len(result) <= 255 + assert len(result.encode('utf-8')) <= 255 + assert isinstance(result, str) + assert result.endswith(expected_end) + + def test_shorten_filenames(tmp_path: Path): test_submission = MagicMock() test_submission.title = 'A' * 300 From 56347da07e863ccf4d63cd45f8cf13da653898a9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Fri, 26 Mar 2021 21:42:47 +1000 Subject: [PATCH 177/276] Fix test creating unwanted folders --- bulkredditdownloader/tests/test_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 9b05c9e..3d1bba7 100644 --- 
a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -46,6 +46,7 @@ def assert_all_results_are_submissions(result_limit: int, results: list[Iterator def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): downloader_mock.args.directory = tmp_path / 'test' + downloader_mock.config_directories.user_config_dir = tmp_path RedditDownloader._determine_directories(downloader_mock) assert Path(tmp_path / 'test').exists() From 21e8f0f8b96d07f90b7bdf927fa8ea15eca90ead Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sat, 27 Mar 2021 17:58:43 +1000 Subject: [PATCH 178/276] Add option to exclude submission IDs (#220) * Add option to exclude submission IDs * Update README * Update logging message --- README.md | 7 +++++ bulkredditdownloader/__main__.py | 6 ++-- bulkredditdownloader/configuration.py | 2 ++ bulkredditdownloader/downloader.py | 29 ++++++++++++++++--- bulkredditdownloader/tests/test_downloader.py | 26 +++++++++++++++++ .../tests/test_integration.py | 15 ++++++++++ 6 files changed, 79 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b6583cb..0c3cdf9 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,13 @@ The following options are common between both the `archive` and `download` comma The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory. +- `--exclude-id` + - This will skip the download of any submission with the ID provided + - Can be specified multiple times +- `--exclude-id-file` + - This will skip the download of any submission with any of the IDs in the files provided + - Can be specified multiple times + - Format is one ID per line - `--make-hard-links` - This flag will create hard links to an existing file when a duplicate is downloaded - This will make the file appear in multiple directories while only taking the space of a single instance diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 78932b7..a3574e1 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -43,11 +43,13 @@ def cli(): @cli.command('download') +@click.option('--exclude-id', default=None, multiple=True) +@click.option('--exclude-id-file', default=None, multiple=True) +@click.option('--file-scheme', default=None, type=str) +@click.option('--folder-scheme', default=None, type=str) @click.option('--make-hard-links', is_flag=True, default=None) @click.option('--no-dupes', is_flag=True, default=None) @click.option('--search-existing', is_flag=True, default=None) -@click.option('--file-scheme', default=None, type=str) -@click.option('--folder-scheme', default=None, type=str) @click.option('--skip', default=None, multiple=True) @click.option('--skip-domain', default=None, multiple=True) @_add_common_options diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index 1227590..5cb23b3 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -13,6 +13,8 @@ class Configuration(Namespace): self.authenticate = False self.config = None self.directory: str = '.' 
+ self.exclude_id = [] + self.exclude_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] self.multireddit: list[str] = [] diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 03c468c..58efeb5 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -82,6 +82,8 @@ class RedditDownloader: self._create_reddit_instance() self._resolve_user_name() + self.excluded_submission_ids = self._read_excluded_ids() + if self.args.search_existing: self.master_hash_list = self.scan_existing_files(self.download_directory) else: @@ -323,8 +325,12 @@ class RedditDownloader: def download(self): for generator in self.reddit_lists: for submission in generator: - logger.debug(f'Attempting to download submission {submission.id}') - self._download_submission(submission) + if submission.id in self.excluded_submission_ids: + logger.debug(f'Submission {submission.id} in exclusion list, skipping') + continue + else: + logger.debug(f'Attempting to download submission {submission.id}') + self._download_submission(submission) def _download_submission(self, submission: praw.models.Submission): if not isinstance(submission, praw.models.Submission): @@ -354,13 +360,15 @@ class RedditDownloader: res.download() except errors.BulkDownloaderException: logger.error( - f'Failed to download resource from {res.url} with downloader {downloader_class.__name__}') + f'Failed to download resource {res.url} with downloader {downloader_class.__name__}') return resource_hash = res.hash.hexdigest() destination.parent.mkdir(parents=True, exist_ok=True) if resource_hash in self.master_hash_list: if self.args.no_dupes: - logger.warning(f'Resource from "{res.url}" and hash "{resource_hash}" downloaded elsewhere') + logger.warning( + f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}' + ' downloaded elsewhere') return elif self.args.make_hard_links: self.master_hash_list[resource_hash].link_to(destination) @@ -387,3 +395,16 @@ class RedditDownloader: hash_list = {res[1]: res[0] for res in results} return hash_list + + def _read_excluded_ids(self) -> set[str]: + out = [] + out.extend(self.args.exclude_id) + for id_file in self.args.exclude_id_file: + id_file = Path(id_file).resolve().expanduser() + if not id_file.exists(): + logger.error(f'ID exclusion file at {id_file} does not exist') + continue + with open(id_file, 'r') as file: + for line in file: + out.append(line.strip()) + return set(out) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 3d1bba7..1d43521 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -394,3 +394,29 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta assert test_file_1_stats.st_nlink == 2 assert test_file_1_stats.st_ino == test_file_2_inode + + +@pytest.mark.parametrize(('test_ids', 'test_excluded', 'expected_len'), ( + (('aaaaaa',), (), 1), + (('aaaaaa',), ('aaaaaa',), 0), + ((), ('aaaaaa',), 0), + (('aaaaaa', 'bbbbbb'), ('aaaaaa',), 1), +)) +def test_excluded_ids(test_ids: tuple[str], test_excluded: tuple[str], expected_len: int, downloader_mock: MagicMock): + downloader_mock.excluded_submission_ids = test_excluded + test_submissions = [] + for test_id in test_ids: + m = MagicMock() + m.id = test_id + test_submissions.append(m) + downloader_mock.reddit_lists = [test_submissions] + RedditDownloader.download(downloader_mock) + assert 
downloader_mock._download_submission.call_count == expected_len
+
+
+def test_read_excluded_submission_ids_from_file(downloader_mock: MagicMock, tmp_path: Path):
+    test_file = tmp_path / 'test.txt'
+    test_file.write_text('aaaaaa\nbbbbbb')
+    downloader_mock.args.exclude_id_file = [test_file]
+    results = RedditDownloader._read_excluded_ids(downloader_mock)
+    assert results == {'aaaaaa', 'bbbbbb'}
diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py
index a50d7f7..4daebad 100644
--- a/bulkredditdownloader/tests/test_integration.py
+++ b/bulkredditdownloader/tests/test_integration.py
@@ -239,3 +239,18 @@ def test_cli_download_use_default_config(tmp_path: Path):
     test_args = ['download', '-vv', str(tmp_path)]
     result = runner.invoke(cli, test_args)
     assert result.exit_code == 0
+
+
+@pytest.mark.online
+@pytest.mark.reddit
+@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests')
+@pytest.mark.parametrize('test_args', (
+    ['-l', 'm2601g', '--exclude-id', 'm2601g'],
+))
+def test_cli_download_links(test_args: list[str], tmp_path: Path):
+    runner = CliRunner()
+    test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args
+    result = runner.invoke(cli, test_args)
+    assert result.exit_code == 0
+    assert 'in exclusion list' in result.output
+    assert 'Downloaded submission ' not in result.output

From 91bebe5f68f94f6a6ab18cb5cf95857acca91370 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 27 Mar 2021 20:39:36 +1000
Subject: [PATCH 179/276] Add some to ARCHITECTURE

---
 docs/ARCHITECTURE.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 9634cad..7b69f99 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -2,6 +2,18 @@
 
 When the project was rewritten for v2, the goal was to make the codebase easily extensible and much easier to read and modify. However, this document provides a step-by-step look through the process that the BDFR goes through, so that any prospective developers can more easily grasp the way the code works.
 
+## Design Ethos
+
+The BDFR is designed to be a stateless downloader. This means that the state of the program is forgotten between each run of the program. There are no central lists, databases, or indices that the BDFR uses, only the actual files on disk. There are several advantages to this approach:
+
+  1. There is no chance of the database being corrupted or changed by something other than the BDFR, rendering the BDFR's "idea" of the archive wrong or incomplete.
+  2. Any information about the archive is contained by the archive itself i.e. for a list of all submission IDs in the archive, this can be extracted from the names of the files in said archive, assuming an appropriate naming scheme was used.
+  3. Archives can be merged, split, or edited without worrying about having to update a central database
+  4. There are no versioning issues between updates of the BDFR, where old versions are stuck with a worse form of the database
+  5. An archive can be put on a USB, moved to another computer with possibly a very different BDFR version, and work completely fine
+
+Another major part of the ethos of the design is DOTADIW, Do One Thing And Do It Well. It's a major part of Unix philosophy and states that each tool should have a well-defined, limited purpose. To this end, the BDFR is, as the name implies, a *downloader*.
That is the scope of the tool. Managing the files downloaded can be left to better-suited programs, since the BDFR is not a file manager. Nor does the BDFR concern itself with how any of the data downloaded is displayed, changed, parsed, or analysed. This makes the BDFR suitable for data science-related tasks, archiving, personal downloads, or analysis of various Reddit sources as the BDFR is completely agnostic on how the data is used.
+
 ## The Download Process
 
 The BDFR is organised around a central object, the RedditDownloader class. The Archiver object extends and inherits from this class.
@@ -34,4 +46,4 @@ Once the downloader class has been written **and tests added** for it as well, t
 
 ## Adding Other Features
 
-
+For a fundamentally different form of execution path for the program, such as the difference between the `archive` and `download` commands, it is best to inherit from the RedditDownloader class and override or add functionality as needed.

From 20f525bd0db7e05e51c4e6e22efd26bb6c4d5b7f Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sat, 27 Mar 2021 20:50:09 +1000
Subject: [PATCH 180/276] Update some logging levels

---
 bulkredditdownloader/downloader.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index 58efeb5..d1b75e8 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -354,7 +354,7 @@ class RedditDownloader:
             return
         for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
             if destination.exists():
-                logger.warning(f'File already exists: {destination}')
+                logger.debug(f'File {destination} already exists, continuing')
             else:
                 try:
                     res.download()
@@ -366,13 +366,12 @@ class RedditDownloader:
                 resource_hash = res.hash.hexdigest()
                 destination.parent.mkdir(parents=True, exist_ok=True)
                 if resource_hash in self.master_hash_list:
                     if self.args.no_dupes:
-                        logger.warning(
-                            f'Resource from "{res.url}" and hash "{resource_hash}" from submission {submission.id}'
-                            ' downloaded elsewhere')
+                        logger.info(
+                            f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                         return
                     elif self.args.make_hard_links:
                         self.master_hash_list[resource_hash].link_to(destination)
-                        logger.debug(
+                        logger.info(
                             f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
                         return

From f2946c0a8750a9f16571057665185ae39cdd1f66 Mon Sep 17 00:00:00 2001
From: Serene <33189705+Serene-Arc@users.noreply.github.com>
Date: Sat, 27 Mar 2021 21:14:08 +1000
Subject: [PATCH 181/276] Strip emojis from filenames on Windows (#222)

---
 bulkredditdownloader/file_name_formatter.py   |  6 ++++++
 .../tests/test_file_name_formatter.py         | 12 ++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py
index 78af753..852f661 100644
--- a/bulkredditdownloader/file_name_formatter.py
+++ b/bulkredditdownloader/file_name_formatter.py
@@ -96,4 +96,10 @@ class FileNameFormatter:
         invalid_characters = r'<>:"\/|?*'
         for char in invalid_characters:
             input_string = input_string.replace(char, '')
+        input_string = FileNameFormatter._strip_emojis(input_string)
         return input_string
+
+    @staticmethod
+    def _strip_emojis(input_string: str) -> str:
+        result = input_string.encode('ascii', errors='ignore').decode('utf-8')
+        return result
diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py
b/bulkredditdownloader/tests/test_file_name_formatter.py index a5676a5..7a433ef 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -188,12 +188,24 @@ def test_shorten_filenames(tmp_path: Path): @pytest.mark.parametrize(('test_string', 'expected'), ( ('test', 'test'), + ('test😍', 'test'), ('test.png', 'test.png'), ('test*', 'test'), ('test**', 'test'), ('test?*', 'test'), ('test_???.png', 'test_.png'), + ('test_???😍.png', 'test_.png'), )) def test_format_file_name_for_windows(test_string: str, expected: str): result = FileNameFormatter._format_for_windows(test_string) assert result == expected + + +@pytest.mark.parametrize(('test_string', 'expected'), ( + ('test', 'test'), + ('test😍', 'test'), + ('😍', ''), +)) +def test_strip_emojies(test_string: str, expected: str): + result = FileNameFormatter._strip_emojis(test_string) + assert result == expected From 443cf1af01a0e5d96a95c7163c9455ff73be9375 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 17:13:13 +0300 Subject: [PATCH 182/276] Bug fix on deleted self posts --- bulkredditdownloader/site_downloaders/self_post.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index 4773c65..5b98000 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -36,8 +36,8 @@ class SelfPost(BaseDownloader): + "](https://www.reddit.com/r/" + self.post.subreddit.title + ") by [u/" - + self.post.author.name + + self.post.author.name if self.post.author else "DELETED" + "](https://www.reddit.com/user/" - + self.post.author.name + + self.post.author.name if self.post.author else "DELETED" + ")") return content From 58150570a3a00671ea1b0071a3885e99cacf7b84 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 23:55:17 +0300 Subject: [PATCH 183/276] Tests added --- bulkredditdownloader/tests/downloaders/test_self_post.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/tests/downloaders/test_self_post.py b/bulkredditdownloader/tests/downloaders/test_self_post.py index d6a45f4..c7e8b4e 100644 --- a/bulkredditdownloader/tests/downloaders/test_self_post.py +++ b/bulkredditdownloader/tests/downloaders/test_self_post.py @@ -13,6 +13,7 @@ from bulkredditdownloader.site_downloaders.self_post import SelfPost @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( ('ltmivt', '7d2c9e4e989e5cf2dca2e55a06b1c4f6'), ('ltoaan', '221606386b614d6780c2585a59bd333f'), + ('d3sc8o', 'b675974cd6964246c6e97a10a385c080'), )) def test_find_resource(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): submission = reddit_instance.submission(id=test_submission_id) From 247fe3e6f72edcfca753f882d6b1c4e2a91d172a Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 23:00:47 +0300 Subject: [PATCH 184/276] Fix downloading .gifv instead of .mp4 (#228) --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- bulkredditdownloader/site_downloaders/imgur.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 253e13b..a4d96cf 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -30,7 +30,7 @@ class 
DownloadFactory: return Gfycat elif re.match(url_beginning + r'gifdeliverynetwork', url): return GifDeliveryNetwork - elif re.match(url_beginning + r'imgur.*', url): + elif re.match(url_beginning + r'imgur.*', url) or re.match(url_beginning + r'i\.imgur.*\.gifv$', url): return Imgur elif re.match(url_beginning + r'redgifs.com', url): return Redgifs diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 1d54feb..bf8a0e9 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -38,6 +38,8 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: + if re.match(r".*i\.imgur\.com.*\.gifv$", link): + link = link.replace("i.imgur", "imgur")[:-5] res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'}) if res.status_code != 200: From 49b0fac7c7921bd989e423170d056cfd46b2cd4d Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 23:50:23 +0300 Subject: [PATCH 185/276] Add necessary tests --- .../tests/downloaders/test_download_factory.py | 2 +- bulkredditdownloader/tests/downloaders/test_imgur.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 9d6624f..5e1b1d0 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -26,7 +26,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('lu29zn', SelfPost), ('lu2ykk', Direct), # Imgur direct link ('luh2pd', Direct), # Reddit direct link - ('luo9eo', Direct), # Imgur direct link gif + ('luo9eo', Imgur), # Imgur .gifv link ('lumulo', Direct), # Imgur direct link gif ('lui5t3', Imgur), ('lu93m7', Gallery), diff --git a/bulkredditdownloader/tests/downloaders/test_imgur.py b/bulkredditdownloader/tests/downloaders/test_imgur.py index 6f6a9f4..66cbbd0 100644 --- a/bulkredditdownloader/tests/downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/downloaders/test_imgur.py @@ -28,7 +28,7 @@ from bulkredditdownloader.site_downloaders.imgur import Imgur {'hash': 'TSAkikk', 'ext': '.jpg', 'title': ''}, ]), )) -def test_get_data(test_url: str, expected_gen_dict: dict, expected_image_dict: list[dict]): +def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_dict: list[dict]): result = Imgur._get_data(test_url) assert all([result.get(key) == expected_gen_dict[key] for key in expected_gen_dict.keys()]) @@ -36,6 +36,15 @@ def test_get_data(test_url: str, expected_gen_dict: dict, expected_image_dict: l assert any([all([image.get(key) == image_dict[key] for key in image_dict.keys()]) for image_dict in expected_image_dict for image in result['album_images']['images']]) +@pytest.mark.online +@pytest.mark.parametrize(('test_url', 'expected_image_dict'), ( + ('https://i.imgur.com/dLk3FGY.gifv', + {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True}), +)) +def test_get_data_image(test_url: str, expected_image_dict: dict): + result = Imgur._get_data(test_url) + assert all([result.get(key) == expected_image_dict[key] for key in expected_image_dict.keys()]) + @pytest.mark.parametrize('test_extension', ('.gif', '.png', '.jpg', '.mp4') From 8753fa0e459b8e839347da51884e030300f8f222 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Mar 2021 10:05:45 +1000 Subject: [PATCH 186/276] Add additional test --- 
bulkredditdownloader/tests/downloaders/test_imgur.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_imgur.py b/bulkredditdownloader/tests/downloaders/test_imgur.py index 66cbbd0..6645737 100644 --- a/bulkredditdownloader/tests/downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/downloaders/test_imgur.py @@ -36,12 +36,17 @@ def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_d assert any([all([image.get(key) == image_dict[key] for key in image_dict.keys()]) for image_dict in expected_image_dict for image in result['album_images']['images']]) + @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_image_dict'), ( ('https://i.imgur.com/dLk3FGY.gifv', - {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True}), + {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} + ), + ('https://i.imgur.com/dLk3FGY.gifv', + {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4'} + ), )) -def test_get_data_image(test_url: str, expected_image_dict: dict): +def test_get_data_gif(test_url: str, expected_image_dict: dict): result = Imgur._get_data(test_url) assert all([result.get(key) == expected_image_dict[key] for key in expected_image_dict.keys()]) From 8bd4b8b3a975328990bad935b2404c109f0806b8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Mar 2021 10:10:46 +1000 Subject: [PATCH 187/276] Split regex for download factory --- bulkredditdownloader/site_downloaders/download_factory.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index a4d96cf..466dc3d 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -30,7 +30,9 @@ class DownloadFactory: return Gfycat elif re.match(url_beginning + r'gifdeliverynetwork', url): return GifDeliveryNetwork - elif re.match(url_beginning + r'imgur.*', url) or re.match(url_beginning + r'i\.imgur.*\.gifv$', url): + elif re.match(url_beginning + r'imgur.*', url): + return Imgur + elif re.match(url_beginning + r'i\.imgur.*\.gifv$', url): return Imgur elif re.match(url_beginning + r'redgifs.com', url): return Redgifs @@ -40,7 +42,9 @@ class DownloadFactory: return VReddit elif re.match(url_beginning + r'youtu\.?be', url): return Youtube - elif re.match(url_beginning + r'i\.redd\.it.*', url) or re.match(url_beginning + r'.*\..{3,4}$', url): + elif re.match(url_beginning + r'i\.redd\.it.*', url): + return Direct + elif re.match(url_beginning + r'.*\..{3,4}$', url): return Direct else: raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) From f0d3cfefc068ee41dd80e362520d34b94e576fb0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Mar 2021 10:15:21 +1000 Subject: [PATCH 188/276] Remove splice and fix quotes --- bulkredditdownloader/site_downloaders/imgur.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index bf8a0e9..9e311d6 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -38,8 +38,10 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: - if re.match(r".*i\.imgur\.com.*\.gifv$", link): - link = link.replace("i.imgur", "imgur")[:-5] + if 
re.match(r'.*i\.imgur\.com.*\.gifv$', link): + link = link.replace('i.imgur', 'imgur') + link = link.rstrip('.gifv') + res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'}) if res.status_code != 200: From cbcbf400bb578da87cf0ee49c9ce3196eae3f600 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 28 Mar 2021 10:23:51 +1000 Subject: [PATCH 189/276] Add test for nonexistent key --- bulkredditdownloader/tests/test_file_name_formatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 7a433ef..125bea1 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -36,7 +36,8 @@ def reddit_submission(reddit_instance) -> praw.models.Submission: ('{UPVOTES}', '1000'), ('{FLAIR}', 'test_flair'), ('{DATE}', '123456789'), - ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345') + ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'), + ('{RANDOM}', '{RANDOM}'), )) def test_format_name_mock(format_string: str, expected: str, submission: MagicMock): result = FileNameFormatter._format_name(submission, format_string) From 5e6159ade398272f8230fa10e5eb725403f35095 Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Sun, 28 Mar 2021 16:31:03 -0400 Subject: [PATCH 190/276] Add created date to archive output --- bulkredditdownloader/archive_entry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/archive_entry.py b/bulkredditdownloader/archive_entry.py index a223c66..c9668e7 100644 --- a/bulkredditdownloader/archive_entry.py +++ b/bulkredditdownloader/archive_entry.py @@ -38,6 +38,7 @@ class ArchiveEntry: 'link_flair_text': self.submission.link_flair_text, 'num_comments': self.submission.num_comments, 'over_18': self.submission.over_18, + 'created_utc': self.submission.created_utc, } def _get_comments(self): From 265505efc71a0495aadbeead40a7dd5448ba0c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Mon, 29 Mar 2021 11:21:47 +0300 Subject: [PATCH 191/276] Fix missing links in self post footer (#235) --- bulkredditdownloader/site_downloaders/self_post.py | 4 ++-- bulkredditdownloader/tests/downloaders/test_self_post.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bulkredditdownloader/site_downloaders/self_post.py index 5b98000..a141fbb 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bulkredditdownloader/site_downloaders/self_post.py @@ -36,8 +36,8 @@ class SelfPost(BaseDownloader): + "](https://www.reddit.com/r/" + self.post.subreddit.title + ") by [u/" - + self.post.author.name if self.post.author else "DELETED" + + (self.post.author.name if self.post.author else "DELETED") + "](https://www.reddit.com/user/" - + self.post.author.name if self.post.author else "DELETED" + + (self.post.author.name if self.post.author else "DELETED") + ")") return content diff --git a/bulkredditdownloader/tests/downloaders/test_self_post.py b/bulkredditdownloader/tests/downloaders/test_self_post.py index c7e8b4e..315388f 100644 --- a/bulkredditdownloader/tests/downloaders/test_self_post.py +++ b/bulkredditdownloader/tests/downloaders/test_self_post.py @@ -13,7 +13,7 @@ from bulkredditdownloader.site_downloaders.self_post import SelfPost @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( ('ltmivt', '7d2c9e4e989e5cf2dca2e55a06b1c4f6'), ('ltoaan', 
'221606386b614d6780c2585a59bd333f'), - ('d3sc8o', 'b675974cd6964246c6e97a10a385c080'), + ('d3sc8o', 'c1ff2b6bd3f6b91381dcd18dfc4ca35f'), )) def test_find_resource(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): submission = reddit_instance.submission(id=test_submission_id) From 2e879949f55bbfe2e44c118bf7037035bf481b5c Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 23:17:37 +0300 Subject: [PATCH 192/276] Send HTTP200 when oauth2 is successful --- bulkredditdownloader/oauth2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/oauth2.py b/bulkredditdownloader/oauth2.py index 9678b45..b8ec2c4 100644 --- a/bulkredditdownloader/oauth2.py +++ b/bulkredditdownloader/oauth2.py @@ -62,6 +62,7 @@ class OAuth2Authenticator: self.send_message(client) raise RedditAuthenticationError(f'Error in OAuth2: {params["error"]}') + self.send_message(client, "") refresh_token = reddit.auth.authorize(params["code"]) return refresh_token @@ -80,8 +81,8 @@ class OAuth2Authenticator: return client @staticmethod - def send_message(client: socket.socket): - client.send('HTTP/1.1 200 OK'.encode('utf-8')) + def send_message(client: socket.socket, message: str): + client.send(f'HTTP/1.1 200 OK\r\n\r\n{message}'.encode('utf-8')) client.close() From 5d86c2d400d20edf5dff44f4486ce350f981ecfa Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 30 Mar 2021 09:05:09 +1000 Subject: [PATCH 193/276] Update out-of-date tests --- bulkredditdownloader/tests/test_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 1d43521..db97f7d 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -312,7 +312,7 @@ def test_download_submission_file_exists( folder_contents = list(tmp_path.iterdir()) output = capsys.readouterr() assert len(folder_contents) == 1 - assert 'File already exists: ' in output.out + assert 'Arneeman_Metagaming isn\'t always a bad thing_m1hqw6.png already exists' in output.out @pytest.mark.online @@ -335,7 +335,7 @@ def test_download_submission_hash_exists( folder_contents = list(tmp_path.iterdir()) output = capsys.readouterr() assert len(folder_contents) == 0 - assert re.search(r'Resource from .*? downloaded elsewhere', output.out) + assert re.search(r'Resource hash .*? 
downloaded elsewhere', output.out) @pytest.mark.parametrize(('test_name', 'expected'), ( From 44889d5264962e0899ba4e295deb7d7e99d744e1 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Tue, 30 Mar 2021 17:08:08 +1000 Subject: [PATCH 194/276] Tweak regex to fix incorrect match (#237) --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- .../tests/downloaders/test_download_factory.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 466dc3d..0431d30 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -44,7 +44,7 @@ class DownloadFactory: return Youtube elif re.match(url_beginning + r'i\.redd\.it.*', url): return Direct - elif re.match(url_beginning + r'.*\..{3,4}$', url): + elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): return Direct else: raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 5e1b1d0..6f7cba7 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -46,6 +46,10 @@ def test_factory_lever_good(test_submission_id: str, expected_class: BaseDownloa @pytest.mark.parametrize('test_url', ( 'random.com', 'bad', + 'https://www.google.com/', + 'https://www.google.com', + 'https://www.google.com/test', + 'https://www.google.com/test/', )) def test_factory_lever_bad(test_url: str): with pytest.raises(NotADownloadableLinkError): From 3d2e11dc1d7120365479cb464a7813590f93a04b Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Tue, 30 Mar 2021 17:20:05 +1000 Subject: [PATCH 195/276] Add warning for non-unique file name schemes (#233) * Add warning for non-unique file name schemes * Update README with warning --- README.md | 2 ++ bulkredditdownloader/file_name_formatter.py | 9 ++++++++- bulkredditdownloader/tests/test_integration.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c3cdf9..605f1a2 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,8 @@ Each of these can be enclosed in curly bracket, `{}`, and included in the name. At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name. In both cases, there will be no separation between all submissions. +It is highly recommended that the file name scheme contain the parameter `{POSTID}` as this is **the only parameter guaranteed to be unique**. No combination of other keys will necessarily be unique and may result in posts being skipped as the BDFR will see files by the same name and skip the download, assuming that they are already downloaded. + ## Configuration The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. 
For Windows, this will be: diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 852f661..fae4f66 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -89,7 +89,14 @@ class FileNameFormatter: def validate_string(test_string: str) -> bool: if not test_string: return False - return any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms]) + result = any([f'{{{key}}}' in test_string.lower() for key in FileNameFormatter.key_terms]) + if result: + if 'POSTID' not in test_string: + logger.warning( + f'Post ID not included in this file scheme, so file names are not guaranteed to be unique') + return True + else: + return False @staticmethod def _format_for_windows(input_string: str) -> str: diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index 4daebad..a69a155 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -254,3 +254,18 @@ def test_cli_download_links(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 assert 'in exclusion list' in result.output assert 'Downloaded submission ' not in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--file-scheme', '{TITLE}'], + ['--file-scheme', '{TITLE}_test_{SUBREDDIT}'], +)) +def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'Post ID not included in this file scheme' in result.output From 7a436d04815a29d4253c9d2d0a23eeb22a82acc2 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 19:21:02 +0300 Subject: [PATCH 196/276] Use .format() instead of regular expression --- bulkredditdownloader/file_name_formatter.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index fae4f66..5c5144a 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -27,20 +27,15 @@ class FileNameFormatter: @staticmethod def _format_name(submission: praw.models.Submission, format_string: str) -> str: submission_attributes = { - 'title': submission.title, - 'subreddit': submission.subreddit.display_name, - 'redditor': submission.author.name if submission.author else 'DELETED', - 'postid': submission.id, - 'upvotes': submission.score, - 'flair': submission.link_flair_text, - 'date': submission.created_utc + 'TITLE': submission.title, + 'SUBREDDIT': submission.subreddit.display_name, + 'REDDITOR': submission.author.name if submission.author else 'DELETED', + 'POSTID': submission.id, + 'UPVOTES': submission.score, + 'FLAIR': submission.link_flair_text, + 'DATE': submission.created_utc } - result = format_string - for key in submission_attributes.keys(): - if re.search(r'(?i).*{{{}}}.*'.format(key), result): - result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result) - logger.log(9, f'Found key string {key} in name') - + result = format_string.format(**submission_attributes) result = 
result.replace('/', '') if platform.system() == 'Windows': From a32dd6d0fe027bfc7f423e7d291d5eed40c54df8 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 27 Mar 2021 19:31:34 +0300 Subject: [PATCH 197/276] Use regex to strip id --- bulkredditdownloader/file_name_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 5c5144a..b092242 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -62,7 +62,7 @@ class FileNameFormatter: def _limit_file_name_length(filename: str, ending: str) -> str: possible_id = re.search(r'((?:_\w{6})?$)', filename).group(1) ending = possible_id + ending - filename = filename.strip(possible_id) + filename = re.sub(rf"^{possible_id}|{possible_id}$", "", filename) max_length_chars = 255 - len(ending) max_length_bytes = 255 - len(ending.encode('utf-8')) while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes: From 0fe28deee9d1f8be8bd2e346ac4373270c20ec9e Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 30 Mar 2021 10:42:15 +0300 Subject: [PATCH 198/276] file_name_formatter: added tests (#227) --- bulkredditdownloader/tests/test_file_name_formatter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 125bea1..41e9d40 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -159,6 +159,7 @@ def test_limit_filename_length(test_filename: str, test_ending: str): @pytest.mark.parametrize(('test_filename', 'test_ending', 'expected_end'), ( ('test_aaaaaa', '_1.png', 'test_aaaaaa_1.png'), + ('test_aataaa', '_1.png', 'test_aataaa_1.png'), ('test_aaaaaa', '.png', 'test_aaaaaa.png'), ('test', '_1.png', 'test_1.png'), ('test_m1hqw6', '_1.png', 'test_m1hqw6_1.png'), From f06e8f3ac44952cf3a7168ad694f422e824ced99 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 30 Mar 2021 10:43:32 +0300 Subject: [PATCH 199/276] Revert "Use .format() instead of regular expression" This reverts commit 8e8225283214927b461b71c247d2bcf8adcf4b34. 
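
One plausible reason to prefer the regex substitution over bare str.format()
(sketched below as an illustration, not repository code): format() raises
KeyError the moment a user-supplied scheme contains a brace token that is not
in the attribute mapping, and it cannot match keys case-insensitively the way
the restored `(?i)` regex substitution does.

```python
# Illustration only: why bare str.format() is fragile for user-supplied
# file name schemes. The attribute values here are made-up examples.
attributes = {'TITLE': 'example', 'POSTID': 'abc123'}

# Fine while every brace token in the scheme is a known key:
print('{TITLE}_{POSTID}'.format(**attributes))  # example_abc123

# An unknown token (or any stray brace) raises immediately:
try:
    print('{TITLE}_{SUBREDDIT}'.format(**attributes))
except KeyError as error:
    print(f'unknown key in scheme: {error}')  # unknown key in scheme: 'SUBREDDIT'
```
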
--- bulkredditdownloader/file_name_formatter.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index b092242..c7be757 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -27,15 +27,20 @@ class FileNameFormatter: @staticmethod def _format_name(submission: praw.models.Submission, format_string: str) -> str: submission_attributes = { - 'TITLE': submission.title, - 'SUBREDDIT': submission.subreddit.display_name, - 'REDDITOR': submission.author.name if submission.author else 'DELETED', - 'POSTID': submission.id, - 'UPVOTES': submission.score, - 'FLAIR': submission.link_flair_text, - 'DATE': submission.created_utc + 'title': submission.title, + 'subreddit': submission.subreddit.display_name, + 'redditor': submission.author.name if submission.author else 'DELETED', + 'postid': submission.id, + 'upvotes': submission.score, + 'flair': submission.link_flair_text, + 'date': submission.created_utc } - result = format_string.format(**submission_attributes) + result = format_string + for key in submission_attributes.keys(): + if re.search(r'(?i).*{{{}}}.*'.format(key), result): + result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result) + logger.log(9, f'Found key string {key} in name') + result = result.replace('/', '') if platform.system() == 'Windows': From a1703567387928a47c4ef5850b7091c2a4a31ba5 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 30 Mar 2021 18:22:11 +1000 Subject: [PATCH 200/276] Use slice to shorten name --- bulkredditdownloader/file_name_formatter.py | 7 ++++--- bulkredditdownloader/tests/test_file_name_formatter.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index c7be757..cd42e08 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -65,9 +65,10 @@ class FileNameFormatter: @staticmethod def _limit_file_name_length(filename: str, ending: str) -> str: - possible_id = re.search(r'((?:_\w{6})?$)', filename).group(1) - ending = possible_id + ending - filename = re.sub(rf"^{possible_id}|{possible_id}$", "", filename) + possible_id = re.search(r'((?:_\w{6})?$)', filename) + if possible_id: + ending = possible_id.group(1) + ending + filename = filename[:possible_id.start()] max_length_chars = 255 - len(ending) max_length_bytes = 255 - len(ending.encode('utf-8')) while len(filename) > max_length_chars or len(filename.encode('utf-8')) > max_length_bytes: diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 41e9d40..db8e61b 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -160,6 +160,7 @@ def test_limit_filename_length(test_filename: str, test_ending: str): @pytest.mark.parametrize(('test_filename', 'test_ending', 'expected_end'), ( ('test_aaaaaa', '_1.png', 'test_aaaaaa_1.png'), ('test_aataaa', '_1.png', 'test_aataaa_1.png'), + ('test_abcdef', '_1.png', 'test_abcdef_1.png'), ('test_aaaaaa', '.png', 'test_aaaaaa.png'), ('test', '_1.png', 'test_1.png'), ('test_m1hqw6', '_1.png', 'test_m1hqw6_1.png'), From 7d69c9a7afb6644cf35b28792cda7feef6ff0112 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 31 Mar 2021 11:07:02 +1000 Subject: 
[PATCH 201/276] Fix typos --- bulkredditdownloader/tests/downloaders/test_gfycat.py | 2 +- bulkredditdownloader/tests/downloaders/test_redgifs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_gfycat.py b/bulkredditdownloader/tests/downloaders/test_gfycat.py index a1b2a6d..5babe8c 100644 --- a/bulkredditdownloader/tests/downloaders/test_gfycat.py +++ b/bulkredditdownloader/tests/downloaders/test_gfycat.py @@ -26,7 +26,7 @@ def test_get_link(test_url: str, expected_url: str): ('https://gfycat.com/dazzlingsilkyiguana', '808941b48fc1e28713d36dd7ed9dc648'), )) def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock + mock_submission = Mock() mock_submission.url = test_url test_site = Gfycat(mock_submission) resources = test_site.find_resources() diff --git a/bulkredditdownloader/tests/downloaders/test_redgifs.py b/bulkredditdownloader/tests/downloaders/test_redgifs.py index 4c330d1..2aa4227 100644 --- a/bulkredditdownloader/tests/downloaders/test_redgifs.py +++ b/bulkredditdownloader/tests/downloaders/test_redgifs.py @@ -27,7 +27,7 @@ def test_get_link(test_url: str, expected: str): ('https://redgifs.com/watch/springgreendecisivetaruca', '8dac487ac49a1f18cc1b4dabe23f0869'), )) def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock + mock_submission = Mock() mock_submission.url = test_url test_site = Redgifs(mock_submission) resources = test_site.find_resources() From ab9dea0347c5990162874fa07a3bed10629f756e Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 30 Mar 2021 17:35:13 +0300 Subject: [PATCH 202/276] test_download_factory.py: hardcode submission links --- .../downloaders/test_download_factory.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 6f7cba7..6fc6cfd 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -21,25 +21,24 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'expected_class'), ( - ('lu8l8g', VReddit), - ('lu29zn', SelfPost), - ('lu2ykk', Direct), # Imgur direct link - ('luh2pd', Direct), # Reddit direct link - ('luo9eo', Imgur), # Imgur .gifv link - ('lumulo', Direct), # Imgur direct link gif - ('lui5t3', Imgur), - ('lu93m7', Gallery), - ('luf1nu', Gfycat), - ('luxmgx', Erome), - ('lupb4r', Youtube), - ('lul6l7', Redgifs), - ('luu376', GifDeliveryNetwork), - ('m2l5oo', Youtube), +@pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( + ('https://v.redd.it/9z1dnk3xr5k61', VReddit), + ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life_in_anything_but_comfort/', SelfPost), + ('https://i.imgur.com/bZx1SJQ.jpg', Direct), + ('https://i.redd.it/affyv0axd5k61.png', Direct), + ('https://i.imgur.com/BuzvZwb.gifv', Imgur), + ('https://i.imgur.com/6fNdLst.gif', Direct), + ('https://imgur.com/a/MkxAzeg', Imgur), + ('https://www.reddit.com/gallery/lu93m7', Gallery), + ('https://gfycat.com/concretecheerfulfinwhale', Gfycat), + ('https://www.erome.com/a/NWGw0F09', Erome), + ('https://youtube.com/watch?v=Gv8Wz74FjVA', Youtube), + ('https://redgifs.com/watch/courageousimpeccablecanvasback', Redgifs), + 
('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', GifDeliveryNetwork), + ('https://youtu.be/DevfjHOhuFc', Youtube), )) -def test_factory_lever_good(test_submission_id: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): - submission = reddit_instance.submission(id=test_submission_id) - result = DownloadFactory.pull_lever(submission.url) +def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): + result = DownloadFactory.pull_lever(test_submission_url) assert result is expected_class From 75d74a536295253d643ef5a5df29014e022adc1d Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Wed, 31 Mar 2021 12:16:27 +0300 Subject: [PATCH 203/276] test_download_factory: remove outdated marks --- bulkredditdownloader/tests/downloaders/test_download_factory.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 6fc6cfd..3f648a1 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -19,8 +19,6 @@ from bulkredditdownloader.site_downloaders.vreddit import VReddit from bulkredditdownloader.site_downloaders.youtube import Youtube -@pytest.mark.online -@pytest.mark.reddit @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( ('https://v.redd.it/9z1dnk3xr5k61', VReddit), ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life_in_anything_but_comfort/', SelfPost), From 32c9d6184cf55019c466e016fc769f8a897e82fb Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Thu, 1 Apr 2021 18:37:20 +1000 Subject: [PATCH 204/276] Archiver is smarter for comments (#242) * Add comment name generation to file name formatter * Refactor to reduce duplication * Refactor archive entry classes * Refactor archiver class a bit * Refactor method * Fix comment retrieval * Add comment-downloading to archiver * Update test * Update test --- bulkredditdownloader/archive_entry.py | 68 ---------------- .../archive_entry/__init__.py | 2 + .../archive_entry/base_archive_entry.py | 36 +++++++++ .../archive_entry/comment_archive_entry.py | 21 +++++ .../archive_entry/submission_archive_entry.py | 47 +++++++++++ bulkredditdownloader/archiver.py | 80 ++++++++++++------- bulkredditdownloader/file_name_formatter.py | 55 +++++++++---- .../tests/archive_entry/__init__.py | 2 + .../test_comment_archive_entry.py | 38 +++++++++ .../test_submission_archive_entry.py} | 12 +-- .../tests/downloaders/test_erome.py | 2 +- bulkredditdownloader/tests/test_archiver.py | 21 ++--- .../tests/test_file_name_formatter.py | 64 +++++++++++++-- .../tests/test_integration.py | 19 ++++- 14 files changed, 329 insertions(+), 138 deletions(-) delete mode 100644 bulkredditdownloader/archive_entry.py create mode 100644 bulkredditdownloader/archive_entry/__init__.py create mode 100644 bulkredditdownloader/archive_entry/base_archive_entry.py create mode 100644 bulkredditdownloader/archive_entry/comment_archive_entry.py create mode 100644 bulkredditdownloader/archive_entry/submission_archive_entry.py create mode 100644 bulkredditdownloader/tests/archive_entry/__init__.py create mode 100644 bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py rename bulkredditdownloader/tests/{test_archive_entry.py => archive_entry/test_submission_archive_entry.py} (65%) diff --git 
a/bulkredditdownloader/archive_entry.py b/bulkredditdownloader/archive_entry.py deleted file mode 100644 index c9668e7..0000000 --- a/bulkredditdownloader/archive_entry.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -import logging - -import praw.models - -logger = logging.getLogger(__name__) - - -class ArchiveEntry: - def __init__(self, submission: praw.models.Submission): - self.submission = submission - self.comments: list[dict] = [] - self.post_details: dict = {} - - def compile(self) -> dict: - self._fill_entry() - out = self.post_details - out['comments'] = self.comments - return out - - def _fill_entry(self): - self._get_comments() - self._get_post_details() - - def _get_post_details(self): - self.post_details = { - 'title': self.submission.title, - 'name': self.submission.name, - 'url': self.submission.url, - 'selftext': self.submission.selftext, - 'score': self.submission.score, - 'upvote_ratio': self.submission.upvote_ratio, - 'permalink': self.submission.permalink, - 'id': self.submission.id, - 'author': self.submission.author.name if self.submission.author else 'DELETED', - 'link_flair_text': self.submission.link_flair_text, - 'num_comments': self.submission.num_comments, - 'over_18': self.submission.over_18, - 'created_utc': self.submission.created_utc, - } - - def _get_comments(self): - logger.debug(f'Retrieving full comment tree for submission {self.submission.id}') - self.submission.comments.replace_more(0) - for top_level_comment in self.submission.comments: - self.comments.append(self._convert_comment_to_dict(top_level_comment)) - - @staticmethod - def _convert_comment_to_dict(in_comment: praw.models.Comment) -> dict: - out_dict = { - 'author': in_comment.author.name if in_comment.author else 'DELETED', - 'id': in_comment.id, - 'score': in_comment.score, - 'subreddit': in_comment.subreddit.display_name, - 'submission': in_comment.submission.id, - 'stickied': in_comment.stickied, - 'body': in_comment.body, - 'is_submitter': in_comment.is_submitter, - 'created_utc': in_comment.created_utc, - 'parent_id': in_comment.parent_id, - 'replies': [], - } - in_comment.replies.replace_more(0) - for reply in in_comment.replies: - out_dict['replies'].append(ArchiveEntry._convert_comment_to_dict(reply)) - return out_dict diff --git a/bulkredditdownloader/archive_entry/__init__.py b/bulkredditdownloader/archive_entry/__init__.py new file mode 100644 index 0000000..d4c1799 --- /dev/null +++ b/bulkredditdownloader/archive_entry/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# coding=utf-8 diff --git a/bulkredditdownloader/archive_entry/base_archive_entry.py b/bulkredditdownloader/archive_entry/base_archive_entry.py new file mode 100644 index 0000000..775ed68 --- /dev/null +++ b/bulkredditdownloader/archive_entry/base_archive_entry.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +from abc import ABC, abstractmethod + +from praw.models import Comment, Submission + + +class BaseArchiveEntry(ABC): + def __init__(self, source: (Comment, Submission)): + self.source = source + self.post_details: dict = {} + + @abstractmethod + def compile(self) -> dict: + raise NotImplementedError + + @staticmethod + def _convert_comment_to_dict(in_comment: Comment) -> dict: + out_dict = { + 'author': in_comment.author.name if in_comment.author else 'DELETED', + 'id': in_comment.id, + 'score': in_comment.score, + 'subreddit': in_comment.subreddit.display_name, + 'submission': in_comment.submission.id, + 'stickied': in_comment.stickied, + 'body': in_comment.body, + 
'is_submitter': in_comment.is_submitter, + 'created_utc': in_comment.created_utc, + 'parent_id': in_comment.parent_id, + 'replies': [], + } + in_comment.replies.replace_more(0) + for reply in in_comment.replies: + out_dict['replies'].append(BaseArchiveEntry._convert_comment_to_dict(reply)) + return out_dict diff --git a/bulkredditdownloader/archive_entry/comment_archive_entry.py b/bulkredditdownloader/archive_entry/comment_archive_entry.py new file mode 100644 index 0000000..51a0947 --- /dev/null +++ b/bulkredditdownloader/archive_entry/comment_archive_entry.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import logging + +import praw.models + +from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry + +logger = logging.getLogger(__name__) + + +class CommentArchiveEntry(BaseArchiveEntry): + def __init__(self, comment: praw.models.Comment): + super(CommentArchiveEntry, self).__init__(comment) + + def compile(self) -> dict: + self.source.refresh() + self.post_details = self._convert_comment_to_dict(self.source) + self.post_details['submission_title'] = self.source.submission.title + return self.post_details diff --git a/bulkredditdownloader/archive_entry/submission_archive_entry.py b/bulkredditdownloader/archive_entry/submission_archive_entry.py new file mode 100644 index 0000000..90da7bc --- /dev/null +++ b/bulkredditdownloader/archive_entry/submission_archive_entry.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import logging + +import praw.models + +from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry + +logger = logging.getLogger(__name__) + + +class SubmissionArchiveEntry(BaseArchiveEntry): + def __init__(self, submission: praw.models.Submission): + super(SubmissionArchiveEntry, self).__init__(submission) + + def compile(self) -> dict: + comments = self._get_comments() + self._get_post_details() + out = self.post_details + out['comments'] = comments + return out + + def _get_post_details(self): + self.post_details = { + 'title': self.source.title, + 'name': self.source.name, + 'url': self.source.url, + 'selftext': self.source.selftext, + 'score': self.source.score, + 'upvote_ratio': self.source.upvote_ratio, + 'permalink': self.source.permalink, + 'id': self.source.id, + 'author': self.source.author.name if self.source.author else 'DELETED', + 'link_flair_text': self.source.link_flair_text, + 'num_comments': self.source.num_comments, + 'over_18': self.source.over_18, + 'created_utc': self.source.created_utc, + } + + def _get_comments(self) -> list[dict]: + logger.debug(f'Retrieving full comment tree for submission {self.source.id}') + comments = [] + self.source.comments.replace_more(0) + for top_level_comment in self.source.comments: + comments.append(self._convert_comment_to_dict(top_level_comment)) + return comments diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py index 0d0df66..db4ee92 100644 --- a/bulkredditdownloader/archiver.py +++ b/bulkredditdownloader/archiver.py @@ -3,12 +3,15 @@ import json import logging +import re import dict2xml import praw.models import yaml -from bulkredditdownloader.archive_entry import ArchiveEntry +from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry +from bulkredditdownloader.archive_entry.comment_archive_entry import CommentArchiveEntry +from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry from bulkredditdownloader.configuration import Configuration from 
bulkredditdownloader.downloader import RedditDownloader from bulkredditdownloader.exceptions import ArchiverError @@ -25,41 +28,60 @@ class Archiver(RedditDownloader): for generator in self.reddit_lists: for submission in generator: logger.debug(f'Attempting to archive submission {submission.id}') - self._write_submission(submission) + self._write_entry(submission) - def _write_submission(self, submission: praw.models.Submission): - archive_entry = ArchiveEntry(submission) + def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: + supplied_submissions = [] + for sub_id in self.args.link: + if len(sub_id) == 6: + supplied_submissions.append(self.reddit_instance.submission(id=sub_id)) + elif re.match(r'^\w{7}$', sub_id): + supplied_submissions.append(self.reddit_instance.comment(id=sub_id)) + else: + supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) + return [supplied_submissions] + + @staticmethod + def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Comment)) -> BaseArchiveEntry: + if isinstance(praw_item, praw.models.Submission): + return SubmissionArchiveEntry(praw_item) + elif isinstance(praw_item, praw.models.Comment): + return CommentArchiveEntry(praw_item) + else: + raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}') + + def _write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)): + archive_entry = self._pull_lever_entry_factory(praw_item) if self.args.format == 'json': - self._write_submission_json(archive_entry) + self._write_entry_json(archive_entry) elif self.args.format == 'xml': - self._write_submission_xml(archive_entry) + self._write_entry_xml(archive_entry) elif self.args.format == 'yaml': - self._write_submission_yaml(archive_entry) + self._write_entry_yaml(archive_entry) else: raise ArchiverError(f'Unknown format {self.args.format} given') - logger.info(f'Record for submission {submission.id} written to disk') + logger.info(f'Record for entry item {praw_item.id} written to disk') - def _write_submission_json(self, entry: ArchiveEntry): - resource = Resource(entry.submission, '', '.json') + def _write_entry_json(self, entry: BaseArchiveEntry): + resource = Resource(entry.source, '', '.json') + content = json.dumps(entry.compile()) + self._write_content_to_disk(resource, content) + + def _write_entry_xml(self, entry: BaseArchiveEntry): + resource = Resource(entry.source, '', '.xml') + content = dict2xml.dict2xml(entry.compile(), wrap='root') + self._write_content_to_disk(resource, content) + + def _write_entry_yaml(self, entry: BaseArchiveEntry): + resource = Resource(entry.source, '', '.yaml') + content = yaml.dump(entry.compile()) + self._write_content_to_disk(resource, content) + + def _write_content_to_disk(self, resource: Resource, content: str): file_path = self.file_name_formatter.format_path(resource, self.download_directory) file_path.parent.mkdir(exist_ok=True, parents=True) with open(file_path, 'w') as file: - logger.debug(f'Writing submission {entry.submission.id} to file in JSON format at {file_path}') - json.dump(entry.compile(), file) - - def _write_submission_xml(self, entry: ArchiveEntry): - resource = Resource(entry.submission, '', '.xml') - file_path = self.file_name_formatter.format_path(resource, self.download_directory) - file_path.parent.mkdir(exist_ok=True, parents=True) - with open(file_path, 'w') as file: - logger.debug(f'Writing submission {entry.submission.id} to file in XML format at {file_path}') - xml_entry = 
dict2xml.dict2xml(entry.compile(), wrap='root') - file.write(xml_entry) - - def _write_submission_yaml(self, entry: ArchiveEntry): - resource = Resource(entry.submission, '', '.yaml') - file_path = self.file_name_formatter.format_path(resource, self.download_directory) - file_path.parent.mkdir(exist_ok=True, parents=True) - with open(file_path, 'w') as file: - logger.debug(f'Writing submission {entry.submission.id} to file in YAML format at {file_path}') - yaml.dump(entry.compile(), file) + logger.debug( + f'Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}' + f' format at {file_path}') + file.write(content) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index cd42e08..6eea2d0 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -7,7 +7,7 @@ import re from pathlib import Path from typing import Optional -import praw.models +from praw.models import Comment, Submission from bulkredditdownloader.exceptions import BulkDownloaderException from bulkredditdownloader.resource import Resource @@ -25,20 +25,17 @@ class FileNameFormatter: self.directory_format_string = directory_format_string @staticmethod - def _format_name(submission: praw.models.Submission, format_string: str) -> str: - submission_attributes = { - 'title': submission.title, - 'subreddit': submission.subreddit.display_name, - 'redditor': submission.author.name if submission.author else 'DELETED', - 'postid': submission.id, - 'upvotes': submission.score, - 'flair': submission.link_flair_text, - 'date': submission.created_utc - } + def _format_name(submission: (Comment, Submission), format_string: str) -> str: + if isinstance(submission, Submission): + attributes = FileNameFormatter._generate_name_dict_from_submission(submission) + elif isinstance(submission, Comment): + attributes = FileNameFormatter._generate_name_dict_from_comment(submission) + else: + raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}') result = format_string - for key in submission_attributes.keys(): + for key in attributes.keys(): if re.search(r'(?i).*{{{}}}.*'.format(key), result): - result = re.sub(r'(?i){{{}}}'.format(key), str(submission_attributes.get(key, 'unknown')), result) + result = re.sub(r'(?i){{{}}}'.format(key), str(attributes.get(key, 'unknown')), result) logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') @@ -48,7 +45,37 @@ class FileNameFormatter: return result - def format_path(self, resource: Resource, destination_directory: Path, index: Optional[int] = None) -> Path: + @staticmethod + def _generate_name_dict_from_submission(submission: Submission) -> dict: + submission_attributes = { + 'title': submission.title, + 'subreddit': submission.subreddit.display_name, + 'redditor': submission.author.name if submission.author else 'DELETED', + 'postid': submission.id, + 'upvotes': submission.score, + 'flair': submission.link_flair_text, + 'date': submission.created_utc + } + return submission_attributes + + @staticmethod + def _generate_name_dict_from_comment(comment: Comment) -> dict: + comment_attributes = { + 'title': comment.submission.title, + 'subreddit': comment.subreddit.display_name, + 'redditor': comment.author.name if comment.author else 'DELETED', + 'postid': comment.id, + 'upvotes': comment.score, + 'flair': '', + 'date': comment.created_utc + } + return comment_attributes + + def format_path( + self, + resource: Resource, + 
destination_directory: Path, + index: Optional[int] = None) -> Path: subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) index = f'_{str(index)}' if index else '' if not resource.extension: diff --git a/bulkredditdownloader/tests/archive_entry/__init__.py b/bulkredditdownloader/tests/archive_entry/__init__.py new file mode 100644 index 0000000..d4c1799 --- /dev/null +++ b/bulkredditdownloader/tests/archive_entry/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# coding=utf-8 diff --git a/bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py b/bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py new file mode 100644 index 0000000..5e9ec69 --- /dev/null +++ b/bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import praw +import pytest + +from bulkredditdownloader.archive_entry.comment_archive_entry import CommentArchiveEntry + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_comment_id', 'expected_dict'), ( + ('gstd4hk', { + 'author': 'james_pic', + 'subreddit': 'Python', + 'submission': 'mgi4op', + 'submission_title': '76% Faster CPython', + }), +)) +def test_get_comment_details(test_comment_id: str, expected_dict: dict, reddit_instance: praw.Reddit): + comment = reddit_instance.comment(id=test_comment_id) + test_entry = CommentArchiveEntry(comment) + result = test_entry.compile() + assert all([result.get(key) == expected_dict[key] for key in expected_dict.keys()]) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_comment_id', 'expected_min_comments'), ( + ('gstd4hk', 4), + ('gsvyste', 3), + ('gsxnvvb', 5), +)) +def test_get_comment_replies(test_comment_id: str, expected_min_comments: int, reddit_instance: praw.Reddit): + comment = reddit_instance.comment(id=test_comment_id) + test_entry = CommentArchiveEntry(comment) + result = test_entry.compile() + assert len(result.get('replies')) >= expected_min_comments diff --git a/bulkredditdownloader/tests/test_archive_entry.py b/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py similarity index 65% rename from bulkredditdownloader/tests/test_archive_entry.py rename to bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py index dba5732..2d99e81 100644 --- a/bulkredditdownloader/tests/test_archive_entry.py +++ b/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py @@ -4,7 +4,7 @@ import praw import pytest -from bulkredditdownloader.archive_entry import ArchiveEntry +from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry @pytest.mark.online @@ -14,9 +14,9 @@ from bulkredditdownloader.archive_entry import ArchiveEntry )) def test_get_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) - test_archive_entry = ArchiveEntry(test_submission) - test_archive_entry._get_comments() - assert len(test_archive_entry.comments) >= min_comments + test_archive_entry = SubmissionArchiveEntry(test_submission) + results = test_archive_entry._get_comments() + assert len(results) >= min_comments @pytest.mark.online @@ -27,6 +27,6 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc )) def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit): test_submission = 
reddit_instance.submission(id=test_submission_id) - test_archive_entry = ArchiveEntry(test_submission) + test_archive_entry = SubmissionArchiveEntry(test_submission) test_archive_entry._get_post_details() - assert all([test_archive_entry.post_details[key] == expected_dict[key] for key in expected_dict.keys()]) + assert all([test_archive_entry.post_details.get(key) == expected_dict[key] for key in expected_dict.keys()]) diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/downloaders/test_erome.py index 9500cf8..2d936c8 100644 --- a/bulkredditdownloader/tests/downloaders/test_erome.py +++ b/bulkredditdownloader/tests/downloaders/test_erome.py @@ -10,7 +10,7 @@ from bulkredditdownloader.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( - ('https://www.erome.com/a/vqtPuLXh', ('https://s6.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',)), + ('https://www.erome.com/a/vqtPuLXh', ('https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',)), ('https://www.erome.com/a/ORhX0FZz', ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', diff --git a/bulkredditdownloader/tests/test_archiver.py b/bulkredditdownloader/tests/test_archiver.py index a2da7c5..b0a84c6 100644 --- a/bulkredditdownloader/tests/test_archiver.py +++ b/bulkredditdownloader/tests/test_archiver.py @@ -7,7 +7,7 @@ from unittest.mock import MagicMock import praw import pytest -from bulkredditdownloader.archive_entry import ArchiveEntry +from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry from bulkredditdownloader.archiver import Archiver @@ -21,9 +21,9 @@ def test_write_submission_json(test_submission_id: str, tmp_path: Path, reddit_i test_path = Path(tmp_path, 'test.json') test_submission = reddit_instance.submission(id=test_submission_id) archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = ArchiveEntry(test_submission) - Archiver._write_submission_json(archiver_mock, test_entry) - assert test_path.exists() + test_entry = SubmissionArchiveEntry(test_submission) + Archiver._write_entry_json(archiver_mock, test_entry) + archiver_mock._write_content_to_disk.assert_called_once() @pytest.mark.online @@ -36,9 +36,9 @@ def test_write_submission_xml(test_submission_id: str, tmp_path: Path, reddit_in test_path = Path(tmp_path, 'test.xml') test_submission = reddit_instance.submission(id=test_submission_id) archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = ArchiveEntry(test_submission) - Archiver._write_submission_xml(archiver_mock, test_entry) - assert test_path.exists() + test_entry = SubmissionArchiveEntry(test_submission) + Archiver._write_entry_xml(archiver_mock, test_entry) + archiver_mock._write_content_to_disk.assert_called_once() @pytest.mark.online @@ -48,9 +48,10 @@ def test_write_submission_xml(test_submission_id: str, tmp_path: Path, reddit_in )) def test_write_submission_yaml(test_submission_id: str, tmp_path: Path, reddit_instance: praw.Reddit): archiver_mock = MagicMock() + archiver_mock.download_directory = tmp_path test_path = Path(tmp_path, 'test.yaml') test_submission = reddit_instance.submission(id=test_submission_id) archiver_mock.file_name_formatter.format_path.return_value = test_path - test_entry = ArchiveEntry(test_submission) - Archiver._write_submission_yaml(archiver_mock, test_entry) - assert test_path.exists() + test_entry = 
SubmissionArchiveEntry(test_submission) + Archiver._write_entry_yaml(archiver_mock, test_entry) + archiver_mock._write_content_to_disk.assert_called_once() diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index db8e61b..35becab 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -22,11 +22,12 @@ def submission() -> MagicMock: test.score = 1000 test.link_flair_text = 'test_flair' test.created_utc = 123456789 + test.__class__ = praw.models.Submission return test @pytest.fixture() -def reddit_submission(reddit_instance) -> praw.models.Submission: +def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: return reddit_instance.submission(id='lgilgt') @@ -137,6 +138,7 @@ def test_format_multiple_resources(): new_mock.url = 'https://example.com/test.png' new_mock.extension = '.png' new_mock.source_submission.title = 'test' + new_mock.source_submission.__class__ = praw.models.Submission mocks.append(new_mock) test_formatter = FileNameFormatter('{TITLE}', '') results = test_formatter.format_resource_paths(mocks, Path('.')) @@ -176,13 +178,12 @@ def test_preserve_id_append_when_shortening(test_filename: str, test_ending: str assert result.endswith(expected_end) -def test_shorten_filenames(tmp_path: Path): - test_submission = MagicMock() - test_submission.title = 'A' * 300 - test_submission.author.name = 'test' - test_submission.subreddit.display_name = 'test' - test_submission.id = 'BBBBBB' - test_resource = Resource(test_submission, 'www.example.com/empty', '.jpeg') +def test_shorten_filenames(submission: MagicMock, tmp_path: Path): + submission.title = 'A' * 300 + submission.author.name = 'test' + submission.subreddit.display_name = 'test' + submission.id = 'BBBBBB' + test_resource = Resource(submission, 'www.example.com/empty', '.jpeg') test_formatter = FileNameFormatter('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}') result = test_formatter.format_path(test_resource, tmp_path) result.parent.mkdir(parents=True) @@ -212,3 +213,50 @@ def test_format_file_name_for_windows(test_string: str, expected: str): def test_strip_emojies(test_string: str, expected: str): result = FileNameFormatter._strip_emojis(test_string) assert result == expected + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'expected'), ( + ('mfuteh', {'title': 'Why Do Interviewers Ask Linked List Questions?', 'redditor': 'mjgardner'}), +)) +def test_generate_dict_for_submission(test_submission_id: str, expected: dict, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + result = FileNameFormatter._generate_name_dict_from_submission(test_submission) + assert all([result.get(key) == expected[key] for key in expected.keys()]) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_comment_id', 'expected'), ( + ('gsq0yuw', { + 'title': 'Why Do Interviewers Ask Linked List Questions?', + 'redditor': 'Doctor-Dapper', + 'postid': 'gsq0yuw', + 'flair': '', + }), +)) +def test_generate_dict_for_comment(test_comment_id: str, expected: dict, reddit_instance: praw.Reddit): + test_comment = reddit_instance.comment(id=test_comment_id) + result = FileNameFormatter._generate_name_dict_from_comment(test_comment) + assert all([result.get(key) == expected[key] for key in expected.keys()]) + + +@pytest.mark.online +@pytest.mark.reddit 
+@pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme', 'test_comment_id', 'expected_name'), ( + ('{POSTID}', '', 'gsoubde', 'gsoubde.json'), + ('{REDDITOR}_{POSTID}', '', 'gsoubde', 'DELETED_gsoubde.json'), +)) +def test_format_archive_entry_comment( + test_file_scheme: str, + test_folder_scheme: str, + test_comment_id: str, + expected_name: str, + tmp_path: Path, + reddit_instance: praw.Reddit): + test_comment = reddit_instance.comment(id=test_comment_id) + test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme) + test_entry = Resource(test_comment, '', '.json') + result = test_formatter.format_path(test_entry, tmp_path) + assert result.name == expected_name diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index a69a155..e41b1c1 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -168,6 +168,21 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): assert result.exit_code == 0 +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['-l', 'gstd4hk'], + ['-l', 'm2601g'], +)) +def test_cli_archive_single(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r'Writing entry .*? to file in .*? format', result.output) + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') @@ -184,7 +199,7 @@ def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert re.search(r'Writing submission .*? to file in .*? format', result.output) + assert re.search(r'Writing entry .*? to file in .*? format', result.output) @pytest.mark.online @@ -200,7 +215,7 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path): test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert re.search(r'Writing submission .*? to file in .*? format', result.output) + assert re.search(r'Writing entry .*? to file in .*? format', result.output) @pytest.mark.online From a49d87e154e8d4f15682295630628ef9d06c974b Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Fri, 2 Apr 2021 01:56:31 -0400 Subject: [PATCH 205/276] Add logic to handle mobile youtube (m.youtube) links. 
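
Mobile YouTube links use the `m.` subdomain (e.g. m.youtube.com), which the
plain `youtu\.?be` branch cannot match, so such links would fall through to the
NotADownloadableLinkError case. A rough sketch of the matching involved;
`url_beginning` below is a stand-in for the real prefix defined in
download_factory.py, and `(m\.)?` is the combined form that this change and the
follow-up commit converge on:

```python
import re

# Stand-in prefix for illustration; not the actual url_beginning value.
url_beginning = r'(?:https?://)?(?:www\.)?'

urls = (
    'https://youtube.com/watch?v=Gv8Wz74FjVA',    # desktop link: matches
    'https://youtu.be/DevfjHOhuFc',               # short link: matches
    'https://m.youtube.com/watch?v=kr-FeojxzUM',  # mobile link: matches via (m\.)?
)
for url in urls:
    print(url, bool(re.match(url_beginning + r'(m\.)?youtu\.?be', url)))
```
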
--- bulkredditdownloader/site_downloaders/download_factory.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 0431d30..81c020e 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -42,6 +42,8 @@ class DownloadFactory: return VReddit elif re.match(url_beginning + r'youtu\.?be', url): return Youtube + elif re.match(url_beginning + r'm.youtu\.?be', url): + return Youtube elif re.match(url_beginning + r'i\.redd\.it.*', url): return Direct elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): From c9c864b71b2c2fffe02ad6d441bbd5f504942d62 Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Fri, 2 Apr 2021 11:51:00 -0400 Subject: [PATCH 206/276] Update regex to be more concise, add test for m.youtube. --- bulkredditdownloader/site_downloaders/download_factory.py | 4 +--- .../tests/downloaders/test_download_factory.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 81c020e..84a6382 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -40,9 +40,7 @@ class DownloadFactory: return SelfPost elif re.match(url_beginning + r'v\.redd\.it', url): return VReddit - elif re.match(url_beginning + r'youtu\.?be', url): - return Youtube - elif re.match(url_beginning + r'm.youtu\.?be', url): + elif re.match(url_beginning + r'(m\.)?youtu\.?be', url): return Youtube elif re.match(url_beginning + r'i\.redd\.it.*', url): return Direct diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/downloaders/test_download_factory.py index 3f648a1..65de2ea 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/downloaders/test_download_factory.py @@ -34,6 +34,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('https://redgifs.com/watch/courageousimpeccablecanvasback', Redgifs), ('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', GifDeliveryNetwork), ('https://youtu.be/DevfjHOhuFc', Youtube), + ('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) From a05fa1a9655fc1c9ac4a1f5a0513c54d1a92da11 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 3 Apr 2021 17:25:22 +1000 Subject: [PATCH 207/276] Add some to CONTRIBUTING --- docs/CONTRIBUTING.md | 53 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index dbbe8e2..8cec165 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -14,7 +14,7 @@ Once you have done both of these, the below list shows the path that should be f 1. If an issue does not already exist, open one that will relate to the PR. 2. Ensure that any changes fit into the architecture specified above. - 3. Ensure that you have written tests that cover the new code + 3. Ensure that you have written tests that cover the new code. 4. Ensure that no existing tests fail, unless there is a good reason for them to do so. If there is, note why in the PR. 5. 
If needed, update any documentation with changes.
 6. Open a pull request that references the relevant issue.
@@ -27,3 +27,54 @@ Someone will review your pull request as soon as possible, but remember that all
 The BDFR must conform to PEP8 standard wherever there is Python code, with one exception. Line lengths may extend to 120 characters, but all other PEP8 standards must be followed.
 
 It's easy to format your code without any manual work via a variety of tools. Autopep8 is a good one, and can be used with `autopep8 --max-line-length 120` which will format the code according to the style in use with the BDFR.
+
+Hanging brackets are preferred when there are many items, items that otherwise go over the 120 character line limit, or when doing so would increase readability. It is also preferred when there might be many commits altering the list, such as with the parameter lists for tests. A hanging comma is also required in such cases. An example of this is below:
+
+```python
+test = [
+    'test 1',
+    'test 2',
+    'test 3',
+]
+```
+
+Note that the last bracket is on its own line, and that the first bracket has a new line before the first term. Also note that there is a comma after the last term.
+
+## Tests
+
+### Running Tests
+
+There are a lot of tests in the BDFR. When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, during development, it can be helpful (and much quicker) to run only a subset of the tests.
+
+This is accomplished with marks, a system that pytest uses to categorise tests. The following marks are currently in use in the BDFR test suite.
+
+- `slow`
+  - This marks a test that may take a long time to complete
+- `online`
+  - This marks a test that requires an internet connection and uses online resources
+- `reddit`
+  - This marks a test that accesses online Reddit specifically
+- `authenticated`
+  - This marks a test that requires a test configuration file with a valid OAuth2 token
+
+These tests can be run either all at once, or excluding certain marks. The tests that require online resources, such as those marked `reddit` or `online`, will naturally require more time to run than tests that are entirely offline. To run tests, you must be in the root directory of the project and can use the following command.
+
+```bash
+pytest
+```
+
+To exclude one or more marks, the following command can be used, substituting the unwanted mark.
+
+```bash
+pytest -m "not online"
+```
+
+Many IDEs also provide integrated methods to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
+
+### Writing Tests
+
+When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one. A short sketch of such a test follows below.
+
+If required, use of mocks is expected to simplify tests and reduce the resources or complexity required. Tests should be as small as possible and test as small a part of the code as possible. Comprehensive or integration tests are run with the `click` framework and are located in their own file.
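+
+As a sketch, a minimal parameterised test might look like the following (the function under test and the values are placeholders, not real BDFR code; add marks such as `online` or `reddit` on top of it only when the test actually needs those resources):
+
+```python
+import pytest
+
+
+@pytest.mark.parametrize(('test_string', 'expected'), (
+    ('test.png', True),
+    ('test', False),
+))
+def test_has_png_extension(test_string: str, expected: bool):
+    # Placeholder logic standing in for whatever function is under test
+    result = test_string.endswith('.png')
+    assert result is expected
+```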
+
+It is also expected that new tests be classified correctly with the marks described above, i.e. if a test accesses Reddit through a `reddit_instance` object, it must be given the `reddit` mark. If it requires an authenticated Reddit instance, then it must have the `authenticated` mark.

From 2385867afb8937bdfaa3a54d1f40d9782d1e863b Mon Sep 17 00:00:00 2001
From: Serene <33189705+Serene-Arc@users.noreply.github.com>
Date: Sun, 4 Apr 2021 03:25:58 +1000
Subject: [PATCH 208/276] Refactor and optimise some tests (#245)

* Rename to follow actual directory name
* Remove unnecessary Reddit call from test
* Refactor test to reduce duplication
* Parameterise some tests
* Standardise formatting
---
 .../test_submission_archive_entry.py          |  6 +-
 .../__init__.py                               |  0
 .../test_direct.py                            |  0
 .../test_download_factory.py                  |  3 +-
 .../test_erome.py                             | 30 +++++---
 .../test_gallery.py                           | 20 +++--
 .../test_gfycat.py                            |  0
 .../test_gif_delivery_network.py              |  0
 .../test_imgur.py                             | 38 +++++----
 .../test_redgifs.py                           |  0
 .../test_self_post.py                         |  0
 .../test_vreddit.py                           |  0
 .../test_youtube.py                           | 15 ++--
 .../tests/test_configuration.py               |  5 +-
 .../tests/test_download_filter.py             | 48 ++++++------
 bulkredditdownloader/tests/test_downloader.py | 63 ++++++++------
 .../tests/test_file_name_formatter.py         | 77 +++++++++++--------
 bulkredditdownloader/tests/test_oauth2.py     | 12 +--
 bulkredditdownloader/tests/test_resource.py   |  5 +-
 19 files changed, 189 insertions(+), 133 deletions(-)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/__init__.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_direct.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_download_factory.py (97%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_erome.py (62%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_gallery.py (83%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_gfycat.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_gif_delivery_network.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_imgur.py (82%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_redgifs.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_self_post.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_vreddit.py (100%)
 rename bulkredditdownloader/tests/{downloaders => site_downloaders}/test_youtube.py (53%)

diff --git a/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py b/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py
index 2d99e81..6c72702 100644
--- a/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py
+++ b/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py
@@ -22,7 +22,11 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc
 @pytest.mark.online
 @pytest.mark.reddit
 @pytest.mark.parametrize(('test_submission_id', 'expected_dict'), (
-    ('m3reby', {'author': 'sinjen-tos', 'id': 'm3reby', 'link_flair_text': 'image'}),
+    ('m3reby', {
+        'author': 'sinjen-tos',
+        'id': 'm3reby',
+        'link_flair_text': 'image',
+    }),
     ('m3kua3', {'author': 'DELETED'}),
 ))
 def test_get_post_details(test_submission_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
diff --git a/bulkredditdownloader/tests/downloaders/__init__.py b/bulkredditdownloader/tests/site_downloaders/__init__.py
similarity index 100%
rename from bulkredditdownloader/tests/downloaders/__init__.py
rename to bulkredditdownloader/tests/site_downloaders/__init__.py diff --git a/bulkredditdownloader/tests/downloaders/test_direct.py b/bulkredditdownloader/tests/site_downloaders/test_direct.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_direct.py rename to bulkredditdownloader/tests/site_downloaders/test_direct.py diff --git a/bulkredditdownloader/tests/downloaders/test_download_factory.py b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py similarity index 97% rename from bulkredditdownloader/tests/downloaders/test_download_factory.py rename to bulkredditdownloader/tests/site_downloaders/test_download_factory.py index 65de2ea..62a1409 100644 --- a/bulkredditdownloader/tests/downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py @@ -21,7 +21,8 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( ('https://v.redd.it/9z1dnk3xr5k61', VReddit), - ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life_in_anything_but_comfort/', SelfPost), + ('https://www.reddit.com/r/TwoXChromosomes/comments/lu29zn/i_refuse_to_live_my_life' + '_in_anything_but_comfort/', SelfPost), ('https://i.imgur.com/bZx1SJQ.jpg', Direct), ('https://i.redd.it/affyv0axd5k61.png', Direct), ('https://i.imgur.com/BuzvZwb.gifv', Imgur), diff --git a/bulkredditdownloader/tests/downloaders/test_erome.py b/bulkredditdownloader/tests/site_downloaders/test_erome.py similarity index 62% rename from bulkredditdownloader/tests/downloaders/test_erome.py rename to bulkredditdownloader/tests/site_downloaders/test_erome.py index 2d936c8..2fb7cf6 100644 --- a/bulkredditdownloader/tests/downloaders/test_erome.py +++ b/bulkredditdownloader/tests/site_downloaders/test_erome.py @@ -10,16 +10,18 @@ from bulkredditdownloader.site_downloaders.erome import Erome @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_urls'), ( - ('https://www.erome.com/a/vqtPuLXh', ('https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4',)), - ('https://www.erome.com/a/ORhX0FZz', - ('https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', - 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4') - ), + ('https://www.erome.com/a/vqtPuLXh', ( + 'https://s11.erome.com/365/vqtPuLXh/KH2qBT99_480p.mp4', + )), + ('https://www.erome.com/a/ORhX0FZz', ( + 'https://s4.erome.com/355/ORhX0FZz/9IYQocM9_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/9eEDc8xm_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/EvApC7Rp_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/LruobtMs_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/TJNmSUU5_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/X11Skh6Z_480p.mp4', + 'https://s4.erome.com/355/ORhX0FZz/bjlTkpn7_480p.mp4' + )), )) def test_get_link(test_url: str, expected_urls: tuple[str]): result = Erome. 
_get_links(test_url) @@ -29,8 +31,12 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://www.erome.com/a/vqtPuLXh', {'5da2a8d60d87bed279431fdec8e7d72f'}), - ('https://www.erome.com/i/ItASD33e', {'b0d73fedc9ce6995c2f2c4fdb6f11eff'}), + ('https://www.erome.com/a/vqtPuLXh', { + '5da2a8d60d87bed279431fdec8e7d72f' + }), + ('https://www.erome.com/i/ItASD33e', { + 'b0d73fedc9ce6995c2f2c4fdb6f11eff' + }), ('https://www.erome.com/a/lGrcFxmb', { '0e98f9f527a911dcedde4f846bb5b69f', '25696ae364750a5303fc7d7dc78b35c1', diff --git a/bulkredditdownloader/tests/downloaders/test_gallery.py b/bulkredditdownloader/tests/site_downloaders/test_gallery.py similarity index 83% rename from bulkredditdownloader/tests/downloaders/test_gallery.py rename to bulkredditdownloader/tests/site_downloaders/test_gallery.py index 93326a0..d33c632 100644 --- a/bulkredditdownloader/tests/downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/site_downloaders/test_gallery.py @@ -38,14 +38,18 @@ def test_gallery_get_links(test_url: str, expected: set[str]): @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), ( - ('m6lvrh', {'6c8a892ae8066cbe119218bcaac731e1', - '93ce177f8cb7994906795f4615114d13', - '9a293adf19354f14582608cf22124574', - 'b73e2c3daee02f99404644ea02f1ae65'}), - ('ljyy27', {'1bc38bed88f9c4770e22a37122d5c941', - '2539a92b78f3968a069df2dffe2279f9', - '37dea50281c219b905e46edeefc1a18d', - 'ec4924cf40549728dcf53dd40bc7a73c'}), + ('m6lvrh', { + '6c8a892ae8066cbe119218bcaac731e1', + '93ce177f8cb7994906795f4615114d13', + '9a293adf19354f14582608cf22124574', + 'b73e2c3daee02f99404644ea02f1ae65' + }), + ('ljyy27', { + '1bc38bed88f9c4770e22a37122d5c941', + '2539a92b78f3968a069df2dffe2279f9', + '37dea50281c219b905e46edeefc1a18d', + 'ec4924cf40549728dcf53dd40bc7a73c' + }), )) def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) diff --git a/bulkredditdownloader/tests/downloaders/test_gfycat.py b/bulkredditdownloader/tests/site_downloaders/test_gfycat.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_gfycat.py rename to bulkredditdownloader/tests/site_downloaders/test_gfycat.py diff --git a/bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py b/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_gif_delivery_network.py rename to bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py diff --git a/bulkredditdownloader/tests/downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py similarity index 82% rename from bulkredditdownloader/tests/downloaders/test_imgur.py rename to bulkredditdownloader/tests/site_downloaders/test_imgur.py index 6645737..7e96740 100644 --- a/bulkredditdownloader/tests/downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -51,17 +51,24 @@ def test_get_data_gif(test_url: str, expected_image_dict: dict): assert all([result.get(key) == expected_image_dict[key] for key in expected_image_dict.keys()]) -@pytest.mark.parametrize('test_extension', - ('.gif', '.png', '.jpg', '.mp4') - ) +@pytest.mark.parametrize('test_extension', ( + '.gif', + '.png', + '.jpg', + '.mp4' +)) def 
test_imgur_extension_validation_good(test_extension: str): result = Imgur._validate_extension(test_extension) assert result == test_extension -@pytest.mark.parametrize('test_extension', - ('.jpeg', '.avi', 'bad', '.test', '.flac') - ) +@pytest.mark.parametrize('test_extension', ( + '.jpeg', + 'bad', + '.avi', + '.test', + '.flac' +)) def test_imgur_extension_validation_bad(test_extension: str): with pytest.raises(SiteDownloaderError): Imgur._validate_extension(test_extension) @@ -69,13 +76,18 @@ def test_imgur_extension_validation_bad(test_extension: str): @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://imgur.com/a/xWZsDDP', ('f551d6e6b0fef2ce909767338612e31b',)), - ('https://imgur.com/gallery/IjJJdlC', ('7227d4312a9779b74302724a0cfa9081',)), - ('https://imgur.com/a/dcc84Gt', - ('cf1158e1de5c3c8993461383b96610cf', - '28d6b791a2daef8aa363bf5a3198535d', - '248ef8f2a6d03eeb2a80d0123dbaf9b6', - '029c475ce01b58fdf1269d8771d33913')), + ('https://imgur.com/a/xWZsDDP', ( + 'f551d6e6b0fef2ce909767338612e31b', + )), + ('https://imgur.com/gallery/IjJJdlC', ( + '7227d4312a9779b74302724a0cfa9081', + )), + ('https://imgur.com/a/dcc84Gt', ( + 'cf1158e1de5c3c8993461383b96610cf', + '28d6b791a2daef8aa363bf5a3198535d', + '248ef8f2a6d03eeb2a80d0123dbaf9b6', + '029c475ce01b58fdf1269d8771d33913' + )), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() diff --git a/bulkredditdownloader/tests/downloaders/test_redgifs.py b/bulkredditdownloader/tests/site_downloaders/test_redgifs.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_redgifs.py rename to bulkredditdownloader/tests/site_downloaders/test_redgifs.py diff --git a/bulkredditdownloader/tests/downloaders/test_self_post.py b/bulkredditdownloader/tests/site_downloaders/test_self_post.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_self_post.py rename to bulkredditdownloader/tests/site_downloaders/test_self_post.py diff --git a/bulkredditdownloader/tests/downloaders/test_vreddit.py b/bulkredditdownloader/tests/site_downloaders/test_vreddit.py similarity index 100% rename from bulkredditdownloader/tests/downloaders/test_vreddit.py rename to bulkredditdownloader/tests/site_downloaders/test_vreddit.py diff --git a/bulkredditdownloader/tests/downloaders/test_youtube.py b/bulkredditdownloader/tests/site_downloaders/test_youtube.py similarity index 53% rename from bulkredditdownloader/tests/downloaders/test_youtube.py rename to bulkredditdownloader/tests/site_downloaders/test_youtube.py index 0b4e982..9689cd5 100644 --- a/bulkredditdownloader/tests/downloaders/test_youtube.py +++ b/bulkredditdownloader/tests/site_downloaders/test_youtube.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 # coding=utf-8 -import praw +from unittest.mock import MagicMock + import pytest from bulkredditdownloader.resource import Resource @@ -9,14 +10,14 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube @pytest.mark.online -@pytest.mark.reddit @pytest.mark.slow -@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( - ('ltnoqp', '468136300a106c67f1463a7011a6db4a'), - ('m2l5oo', 'a70512f7782f13922258297bb12055d9'), +@pytest.mark.parametrize(('test_url', 'expected_hash'), ( + ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '3c79a62898028987f94161e0abccbddf'), + ('https://www.youtube.com/watch?v=m-tKnjFwleU', '61651cc6f53782af50030c0a7dd0b6f6'), )) -def test_find_resources(test_submission_id: str, expected_hash: str, 
reddit_instance: praw.Reddit): - test_submission = reddit_instance.submission(id=test_submission_id) +def test_find_resources(test_url: str, expected_hash: str): + test_submission = MagicMock() + test_submission.url = test_url downloader = Youtube(test_submission) resources = downloader.find_resources() assert len(resources) == 1 diff --git a/bulkredditdownloader/tests/test_configuration.py b/bulkredditdownloader/tests/test_configuration.py index 9905150..94697a3 100644 --- a/bulkredditdownloader/tests/test_configuration.py +++ b/bulkredditdownloader/tests/test_configuration.py @@ -10,7 +10,10 @@ from bulkredditdownloader.configuration import Configuration @pytest.mark.parametrize('arg_dict', ( {'directory': 'test_dir'}, - {'directory': 'test_dir', 'no_dupes': True}, + { + 'directory': 'test_dir', + 'no_dupes': True, + }, )) def test_process_click_context(arg_dict: dict): test_config = Configuration() diff --git a/bulkredditdownloader/tests/test_download_filter.py b/bulkredditdownloader/tests/test_download_filter.py index c8957a5..04ea169 100644 --- a/bulkredditdownloader/tests/test_download_filter.py +++ b/bulkredditdownloader/tests/test_download_filter.py @@ -11,43 +11,47 @@ def download_filter() -> DownloadFilter: return DownloadFilter(['mp4', 'mp3'], ['test.com', 'reddit.com']) -@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', False), - ('test.avi', True), - ('test.random.mp3', False) - )) +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('test.mp4', False), + ('test.avi', True), + ('test.random.mp3', False), +)) def test_filter_extension(test_url: str, expected: bool, download_filter: DownloadFilter): result = download_filter._check_extension(test_url) assert result == expected -@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', True), - ('http://reddit.com/test.mp4', False), - ('http://reddit.com/test.gif', False), - ('https://www.example.com/test.mp4', True), - ('https://www.example.com/test.png', True), - )) +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('test.mp4', True), + ('http://reddit.com/test.mp4', False), + ('http://reddit.com/test.gif', False), + ('https://www.example.com/test.mp4', True), + ('https://www.example.com/test.png', True), +)) def test_filter_domain(test_url: str, expected: bool, download_filter: DownloadFilter): result = download_filter._check_domain(test_url) assert result == expected -@pytest.mark.parametrize(('test_url', 'expected'), (('test.mp4', False), - ('test.gif', True), - ('https://www.example.com/test.mp4', False), - ('https://www.example.com/test.png', True), - ('http://reddit.com/test.mp4', False), - ('http://reddit.com/test.gif', False), - )) +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('test.mp4', False), + ('test.gif', True), + ('https://www.example.com/test.mp4', False), + ('https://www.example.com/test.png', True), + ('http://reddit.com/test.mp4', False), + ('http://reddit.com/test.gif', False), +)) def test_filter_all(test_url: str, expected: bool, download_filter: DownloadFilter): result = download_filter.check_url(test_url) assert result == expected -@pytest.mark.parametrize('test_url', ('test.mp3', - 'test.mp4', - 'http://reddit.com/test.mp4', - 't', - )) +@pytest.mark.parametrize('test_url', ( + 'test.mp3', + 'test.mp4', + 'http://reddit.com/test.mp4', + 't', +)) def test_filter_empty_filter(test_url: str): download_filter = DownloadFilter() result = download_filter.check_url(test_url) diff --git a/bulkredditdownloader/tests/test_downloader.py 
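The rewritten YouTube test above no longer needs a live `reddit_instance`: the downloader only reads `submission.url`, so a `MagicMock` can stand in for the whole PRAW submission. A sketch of the substitution (the URL is illustrative):

from unittest.mock import MagicMock

test_submission = MagicMock()
test_submission.url = 'https://www.youtube.com/watch?v=uSm2VDgRIUs'
# Attributes the code under test never reads simply resolve to child mocks,
# so no other Submission fields have to be faked.
assert test_submission.url.endswith('uSm2VDgRIUs')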
b/bulkredditdownloader/tests/test_downloader.py index db97f7d..7b0f385 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # coding=utf-8 -import argparse import re from pathlib import Path from typing import Iterator @@ -15,7 +14,7 @@ from bulkredditdownloader.__main__ import setup_logging from bulkredditdownloader.configuration import Configuration from bulkredditdownloader.download_filter import DownloadFilter from bulkredditdownloader.downloader import RedditDownloader, RedditTypes -from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError, RedditUserError +from bulkredditdownloader.exceptions import BulkDownloaderException from bulkredditdownloader.file_name_formatter import FileNameFormatter from bulkredditdownloader.site_authenticator import SiteAuthenticator @@ -53,7 +52,7 @@ def test_determine_directories(tmp_path: Path, downloader_mock: MagicMock): @pytest.mark.parametrize(('skip_extensions', 'skip_domains'), ( ([], []), - (['.test'], ['test.com']), + (['.test'], ['test.com'],), )) def test_create_download_filter(skip_extensions: list[str], skip_domains: list[str], downloader_mock: MagicMock): downloader_mock.args.skip = skip_extensions @@ -244,25 +243,18 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated -def test_get_user_upvoted(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit): +@pytest.mark.parametrize('test_flag', ( + 'upvoted', + 'saved', +)) +def test_get_user_authenticated_lists( + test_flag: str, + downloader_mock: MagicMock, + authenticated_reddit_instance: praw.Reddit, +): + downloader_mock.args.__dict__[test_flag] = True downloader_mock.reddit_instance = authenticated_reddit_instance downloader_mock.args.user = 'me' - downloader_mock.args.upvoted = True - downloader_mock.args.limit = 10 - downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot - downloader_mock.sort_filter = RedditTypes.SortType.HOT - RedditDownloader._resolve_user_name(downloader_mock) - results = RedditDownloader._get_user_data(downloader_mock) - assert_all_results_are_submissions(10, results) - - -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.authenticated -def test_get_user_saved(downloader_mock: MagicMock, authenticated_reddit_instance: praw.Reddit): - downloader_mock.reddit_instance = authenticated_reddit_instance - downloader_mock.args.user = 'me' - downloader_mock.args.saved = True downloader_mock.args.limit = 10 downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT @@ -299,7 +291,8 @@ def test_download_submission_file_exists( downloader_mock: MagicMock, reddit_instance: praw.Reddit, tmp_path: Path, - capsys: pytest.CaptureFixture): + capsys: pytest.CaptureFixture +): setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True @@ -317,11 +310,17 @@ def test_download_submission_file_exists( @pytest.mark.online @pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'test_hash'), ( + ('m1hqw6', 'a912af8905ae468e0121e9940f797ad7'), +)) def test_download_submission_hash_exists( + test_submission_id: str, + test_hash: str, downloader_mock: MagicMock, reddit_instance: praw.Reddit, tmp_path: Path, - capsys: pytest.CaptureFixture): + capsys: 
pytest.CaptureFixture +): setup_logging(3) downloader_mock.reddit_instance = reddit_instance downloader_mock.download_filter.check_url.return_value = True @@ -329,8 +328,8 @@ def test_download_submission_hash_exists( downloader_mock.args.no_dupes = True downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) downloader_mock.download_directory = tmp_path - downloader_mock.master_hash_list = {'a912af8905ae468e0121e9940f797ad7': None} - submission = downloader_mock.reddit_instance.submission(id='m1hqw6') + downloader_mock.master_hash_list = {test_hash: None} + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) RedditDownloader._download_submission(downloader_mock, submission) folder_contents = list(tmp_path.iterdir()) output = capsys.readouterr() @@ -373,15 +372,23 @@ def test_split_subreddit_entries(test_subreddit_entries: list[str], expected: se @pytest.mark.online @pytest.mark.reddit -def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_instance: praw.Reddit): +@pytest.mark.parametrize('test_submission_id', ( + 'm1hqw6', +)) +def test_mark_hard_link( + test_submission_id: str, + downloader_mock: MagicMock, + tmp_path: Path, + reddit_instance: praw.Reddit +): downloader_mock.reddit_instance = reddit_instance downloader_mock.args.make_hard_links = True downloader_mock.download_directory = tmp_path downloader_mock.args.folder_scheme = '' downloader_mock.args.file_scheme = '{POSTID}' downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) - submission = downloader_mock.reddit_instance.submission(id='m1hqw6') - original = Path(tmp_path, 'm1hqw6.png') + submission = downloader_mock.reddit_instance.submission(id=test_submission_id) + original = Path(tmp_path, f'{test_submission_id}.png') RedditDownloader._download_submission(downloader_mock, submission) assert original.exists() @@ -390,7 +397,7 @@ def test_mark_hard_link(downloader_mock: MagicMock, tmp_path: Path, reddit_insta downloader_mock.file_name_formatter = RedditDownloader._create_file_name_formatter(downloader_mock) RedditDownloader._download_submission(downloader_mock, submission) test_file_1_stats = original.stat() - test_file_2_inode = Path(tmp_path, 'test2_m1hqw6.png').stat().st_ino + test_file_2_inode = Path(tmp_path, f'test2_{test_submission_id}.png').stat().st_ino assert test_file_1_stats.st_nlink == 2 assert test_file_1_stats.st_ino == test_file_2_inode diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index 35becab..2f6e9c6 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -31,15 +31,16 @@ def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission: return reddit_instance.submission(id='lgilgt') -@pytest.mark.parametrize(('format_string', 'expected'), (('{SUBREDDIT}', 'randomreddit'), - ('{REDDITOR}', 'person'), - ('{POSTID}', '12345'), - ('{UPVOTES}', '1000'), - ('{FLAIR}', 'test_flair'), - ('{DATE}', '123456789'), - ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'), - ('{RANDOM}', '{RANDOM}'), - )) +@pytest.mark.parametrize(('format_string', 'expected'), ( + ('{SUBREDDIT}', 'randomreddit'), + ('{REDDITOR}', 'person'), + ('{POSTID}', '12345'), + ('{UPVOTES}', '1000'), + ('{FLAIR}', 'test_flair'), + ('{DATE}', '123456789'), + ('{REDDITOR}_{TITLE}_{POSTID}', 'person_name_12345'), + ('{RANDOM}', '{RANDOM}'), +)) def 
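For context on the assertions in `test_mark_hard_link` above: two hard-linked paths share a single inode, and the link count on the original rises to two. A minimal sketch, assuming a filesystem that supports hard links and using illustrative file names:

import os
from pathlib import Path

original = Path('m1hqw6.png')
original.write_bytes(b'example data')
duplicate = Path('test2_m1hqw6.png')
os.link(original, duplicate)  # the same call the downloader's hard-link option relies on

assert original.stat().st_nlink == 2
assert original.stat().st_ino == duplicate.stat().st_ino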
test_format_name_mock(format_string: str, expected: str, submission: MagicMock): result = FileNameFormatter._format_name(submission, format_string) assert result == expected @@ -61,14 +62,14 @@ def test_check_format_string_validity(test_string: str, expected: bool): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('format_string', 'expected'), - (('{SUBREDDIT}', 'Mindustry'), - ('{REDDITOR}', 'Gamer_player_boi'), - ('{POSTID}', 'lgilgt'), - ('{FLAIR}', 'Art'), - ('{SUBREDDIT}_{TITLE}', 'Mindustry_Toxopid that is NOT humane >:('), - ('{REDDITOR}_{TITLE}_{POSTID}', 'Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt') - )) +@pytest.mark.parametrize(('format_string', 'expected'), ( + ('{SUBREDDIT}', 'Mindustry'), + ('{REDDITOR}', 'Gamer_player_boi'), + ('{POSTID}', 'lgilgt'), + ('{FLAIR}', 'Art'), + ('{SUBREDDIT}_{TITLE}', 'Mindustry_Toxopid that is NOT humane >:('), + ('{REDDITOR}_{TITLE}_{POSTID}', 'Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt') +)) def test_format_name_real(format_string: str, expected: str, reddit_submission: praw.models.Submission): result = FileNameFormatter._format_name(reddit_submission, format_string) assert result == expected @@ -76,13 +77,23 @@ def test_format_name_real(format_string: str, expected: str, reddit_submission: @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), - (('{SUBREDDIT}', '{POSTID}', 'test/Mindustry/lgilgt.png'), - ('{SUBREDDIT}', '{TITLE}_{POSTID}', - 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt.png'), - ('{SUBREDDIT}', '{REDDITOR}_{TITLE}_{POSTID}', - 'test/Mindustry/Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt.png') - )) +@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'expected'), ( + ( + '{SUBREDDIT}', + '{POSTID}', + 'test/Mindustry/lgilgt.png', + ), + ( + '{SUBREDDIT}', + '{TITLE}_{POSTID}', + 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt.png', + ), + ( + '{SUBREDDIT}', + '{REDDITOR}_{TITLE}_{POSTID}', + 'test/Mindustry/Gamer_player_boi_Toxopid that is NOT humane >:(_lgilgt.png', + ), +)) def test_format_full( format_string_directory: str, format_string_file: str, @@ -112,13 +123,12 @@ def test_format_full_conform( @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'index', 'expected'), - (('{SUBREDDIT}', '{POSTID}', None, 'test/Mindustry/lgilgt.png'), - ('{SUBREDDIT}', '{POSTID}', 1, 'test/Mindustry/lgilgt_1.png'), - ('{SUBREDDIT}', '{POSTID}', 2, 'test/Mindustry/lgilgt_2.png'), - ('{SUBREDDIT}', '{TITLE}_{POSTID}', 2, - 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt_2.png'), - )) +@pytest.mark.parametrize(('format_string_directory', 'format_string_file', 'index', 'expected'), ( + ('{SUBREDDIT}', '{POSTID}', None, 'test/Mindustry/lgilgt.png'), + ('{SUBREDDIT}', '{POSTID}', 1, 'test/Mindustry/lgilgt_1.png'), + ('{SUBREDDIT}', '{POSTID}', 2, 'test/Mindustry/lgilgt_2.png'), + ('{SUBREDDIT}', '{TITLE}_{POSTID}', 2, 'test/Mindustry/Toxopid that is NOT humane >:(_lgilgt_2.png'), +)) def test_format_full_with_index_suffix( format_string_directory: str, format_string_file: str, @@ -218,7 +228,10 @@ def test_strip_emojies(test_string: str, expected: str): @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected'), ( - ('mfuteh', {'title': 'Why Do Interviewers Ask Linked List Questions?', 'redditor': 'mjgardner'}), + ('mfuteh', { + 'title': 'Why Do Interviewers Ask Linked 
List Questions?', + 'redditor': 'mjgardner', + }), )) def test_generate_dict_for_submission(test_submission_id: str, expected: dict, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) diff --git a/bulkredditdownloader/tests/test_oauth2.py b/bulkredditdownloader/tests/test_oauth2.py index ced7605..e8d71a0 100644 --- a/bulkredditdownloader/tests/test_oauth2.py +++ b/bulkredditdownloader/tests/test_oauth2.py @@ -21,12 +21,12 @@ def example_config() -> configparser.ConfigParser: @pytest.mark.online @pytest.mark.parametrize('test_scopes', ( - ('history',), - ('history', 'creddits'), - ('account', 'flair'), - ('*',), + {'history', }, + {'history', 'creddits'}, + {'account', 'flair'}, + {'*', }, )) -def test_check_scopes(test_scopes: list[str]): +def test_check_scopes(test_scopes: set[str]): OAuth2Authenticator._check_scopes(test_scopes) @@ -54,7 +54,7 @@ def test_check_scopes_bad(test_scopes: set[str]): def test_token_manager_read(example_config: configparser.ConfigParser): mock_authoriser = MagicMock() mock_authoriser.refresh_token = None - test_manager = OAuth2TokenManager(example_config, None) + test_manager = OAuth2TokenManager(example_config, MagicMock()) test_manager.pre_refresh_callback(mock_authoriser) assert mock_authoriser.refresh_token == example_config.get('DEFAULT', 'user_token') diff --git a/bulkredditdownloader/tests/test_resource.py b/bulkredditdownloader/tests/test_resource.py index d7b3898..c2647b3 100644 --- a/bulkredditdownloader/tests/test_resource.py +++ b/bulkredditdownloader/tests/test_resource.py @@ -2,6 +2,7 @@ # coding=utf-8 import pytest +from unittest.mock import MagicMock from bulkredditdownloader.resource import Resource @@ -16,7 +17,7 @@ from bulkredditdownloader.resource import Resource ('https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png&auto=webp&s=19de214e634cbcad99', '.png'), )) def test_resource_get_extension(test_url: str, expected: str): - test_resource = Resource(None, test_url) + test_resource = Resource(MagicMock(), test_url) result = test_resource._determine_extension() assert result == expected @@ -26,6 +27,6 @@ def test_resource_get_extension(test_url: str, expected: str): ('https://www.iana.org/_img/2013.1/iana-logo-header.svg', '426b3ac01d3584c820f3b7f5985d6623'), )) def test_download_online_resource(test_url: str, expected_hash: str): - test_resource = Resource(None, test_url) + test_resource = Resource(MagicMock(), test_url) test_resource.download() assert test_resource.hash.hexdigest() == expected_hash From 1e1dae8eaada6a8bb8ff17aef4a90c6ac493d44d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Apr 2021 08:38:48 +1000 Subject: [PATCH 209/276] Alter some logging messages --- bulkredditdownloader/downloader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index d1b75e8..f5de851 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -147,7 +147,7 @@ class RedditDownloader: self.config_location = cfg_path return else: - logger.error(f'Could not find config file at {self.args.config}, attempting to find elsewhere') + logger.warning(f'Could not find config file at {self.args.config}, attempting to find elsewhere') possible_paths = [Path('./config.cfg'), Path('./default_config.cfg'), Path(self.config_directory, 'config.cfg'), @@ -224,7 +224,7 @@ class RedditDownloader: logger.log(9, f'Resolved user to {self.args.user}') else: self.args.user 
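On the `MagicMock()` substitutions in the tests above: `Resource` and `OAuth2TokenManager` previously received `None` placeholders, which crash on any attribute access, while a `MagicMock` absorbs those accesses instead. A small illustration (the attribute names are arbitrary):

from unittest.mock import MagicMock

source = MagicMock()
print(source.title)    # returns a child mock rather than raising AttributeError
source.id = 'abc123'   # values that matter can still be pinned explicitly
assert source.id == 'abc123'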
= None - logger.error('To use "me" as a user, an authenticated Reddit instance must be used') + logger.warning('To use "me" as a user, an authenticated Reddit instance must be used') def _get_submissions_from_link(self) -> list[list[praw.models.Submission]]: supplied_submissions = [] @@ -277,7 +277,7 @@ class RedditDownloader: sort_function( self.reddit_instance.redditor(self.args.user).submissions, limit=self.args.limit)) if not self.authenticated and any((self.args.upvoted, self.args.saved)): - logger.error('Accessing user lists requires authentication') + logger.warning('Accessing user lists requires authentication') else: if self.args.upvoted: logger.debug(f'Retrieving upvoted posts of user {self.args.user}') @@ -287,7 +287,7 @@ class RedditDownloader: generators.append(self.reddit_instance.redditor(self.args.user).saved(limit=self.args.limit)) return generators else: - logger.error('A user must be supplied to download user data') + logger.warning('A user must be supplied to download user data') return [] else: return [] @@ -349,8 +349,8 @@ class RedditDownloader: try: content = downloader.find_resources(self.authenticator) - except errors.SiteDownloaderError: - logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}') + except errors.SiteDownloaderError as e: + logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}') return for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory): if destination.exists(): @@ -358,9 +358,9 @@ class RedditDownloader: else: try: res.download() - except errors.BulkDownloaderException: + except errors.BulkDownloaderException as e: logger.error( - f'Failed to download resource {res.url} with downloader {downloader_class.__name__}') + f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}') return resource_hash = res.hash.hexdigest() destination.parent.mkdir(parents=True, exist_ok=True) @@ -401,7 +401,7 @@ class RedditDownloader: for id_file in self.args.exclude_id_file: id_file = Path(id_file).resolve().expanduser() if not id_file.exists(): - logger.error(f'ID exclusion file at {id_file} does not exist') + logger.warning(f'ID exclusion file at {id_file} does not exist') continue with open(id_file, 'r') as file: for line in file: From f78856315dea62a6ed6bcf5300e1b7beafcdd12e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Apr 2021 09:20:22 +1000 Subject: [PATCH 210/276] Switch to f-string --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 84a6382..46e5c9b 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -47,4 +47,4 @@ class DownloadFactory: elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): return Direct else: - raise NotADownloadableLinkError('No downloader module exists for url {}'.format(url)) + raise NotADownloadableLinkError(f'No downloader module exists for url {url}') From dc9d02a28cbdb07496111bc36dc808fe4a1cf19f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 4 Apr 2021 09:38:13 +1000 Subject: [PATCH 211/276] Name log file with run time --- bulkredditdownloader/downloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py 
b/bulkredditdownloader/downloader.py index f5de851..baee8f0 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -169,7 +169,10 @@ class RedditDownloader: def _create_file_logger(self): main_logger = logging.getLogger() - file_handler = logging.FileHandler(Path(self.config_directory, 'log_output.txt'), mode='w') + file_handler = logging.FileHandler( + Path(self.config_directory, f'log_output_{datetime.now().isoformat()}.txt'), + mode='w', + ) formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') file_handler.setFormatter(formatter) file_handler.setLevel(0) From 702295f1eaa975cc509b417607713f8a9dbde8a3 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sun, 4 Apr 2021 16:16:06 +1000 Subject: [PATCH 212/276] Allow multilevel folder schemes (#251) * Improve some formatting * Allow multilevel folder names --- bulkredditdownloader/file_name_formatter.py | 36 +++++++++++++------ .../tests/test_file_name_formatter.py | 25 +++++++++++-- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index 6eea2d0..aac4b48 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -16,13 +16,21 @@ logger = logging.getLogger(__name__) class FileNameFormatter: - key_terms = ('title', 'subreddit', 'redditor', 'postid', 'upvotes', 'flair', 'date') + key_terms = ( + 'date', + 'flair', + 'postid', + 'redditor', + 'subreddit', + 'title', + 'upvotes', + ) def __init__(self, file_format_string: str, directory_format_string: str): if not self.validate_string(file_format_string): raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string') self.file_format_string = file_format_string - self.directory_format_string = directory_format_string + self.directory_format_string: list[str] = directory_format_string.split('/') @staticmethod def _format_name(submission: (Comment, Submission), format_string: str) -> str: @@ -34,8 +42,8 @@ class FileNameFormatter: raise BulkDownloaderException(f'Cannot name object {type(submission).__name__}') result = format_string for key in attributes.keys(): - if re.search(r'(?i).*{{{}}}.*'.format(key), result): - result = re.sub(r'(?i){{{}}}'.format(key), str(attributes.get(key, 'unknown')), result) + if re.search(fr'(?i).*{{{key}}}.*', result): + result = re.sub(fr'(?i){{{key}}}', str(attributes.get(key, 'unknown')), result) logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') @@ -67,7 +75,7 @@ class FileNameFormatter: 'postid': comment.id, 'upvotes': comment.score, 'flair': '', - 'date': comment.created_utc + 'date': comment.created_utc, } return comment_attributes @@ -75,8 +83,12 @@ class FileNameFormatter: self, resource: Resource, destination_directory: Path, - index: Optional[int] = None) -> Path: - subfolder = destination_directory / self._format_name(resource.source_submission, self.directory_format_string) + index: Optional[int] = None, + ) -> Path: + subfolder = Path( + destination_directory, + *[self._format_name(resource.source_submission, part) for part in self.directory_format_string] + ) index = f'_{str(index)}' if index else '' if not resource.extension: raise BulkDownloaderException(f'Resource from {resource.url} has no extension') @@ -102,8 +114,11 @@ class FileNameFormatter: filename = filename[:-1] return filename + ending - def format_resource_paths(self, 
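The heart of PATCH 212 above: the directory scheme is split on '/', each component is formatted on its own, and `Path` recombines them, which is what makes multilevel schemes like `{REDDITOR}/{SUBREDDIT}` work. A reduced sketch of that flow, with a plain substitution table standing in for the real `_format_name` call:

from pathlib import Path

directory_format_string = '{REDDITOR}/{SUBREDDIT}'.split('/')
substitutions = {'{REDDITOR}': 'person', '{SUBREDDIT}': 'randomreddit'}
parts = [substitutions[part] for part in directory_format_string]
subfolder = Path('downloads', *parts)
assert str(subfolder) == 'downloads/person/randomreddit'  # POSIX-style separators assumed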
resources: list[Resource],
-                              destination_directory: Path) -> list[tuple[Path, Resource]]:
+    def format_resource_paths(
+            self,
+            resources: list[Resource],
+            destination_directory: Path,
+    ) -> list[tuple[Path, Resource]]:
         out = []
         if len(resources) == 1:
             out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
@@ -121,7 +136,8 @@
         if result:
             if 'POSTID' not in test_string:
                 logger.warning(
-                    f'Post ID not included in this file scheme, so file names are not guaranteed to be unique')
+                    'Some files might not be downloaded due to name conflicts as filenames are'
+                    ' not guaranteed to be unique without {POSTID}')
             return True
         else:
             return False
diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py
index 2f6e9c6..bf8cee1 100644
--- a/bulkredditdownloader/tests/test_file_name_formatter.py
+++ b/bulkredditdownloader/tests/test_file_name_formatter.py
@@ -26,7 +26,7 @@ def submission() -> MagicMock:
     return test
 
 
-@pytest.fixture()
+@pytest.fixture(scope='session')
 def reddit_submission(reddit_instance: praw.Reddit) -> praw.models.Submission:
     return reddit_instance.submission(id='lgilgt')
 
@@ -267,9 +267,30 @@ def test_format_archive_entry_comment(
     test_comment_id: str,
     expected_name: str,
     tmp_path: Path,
-    reddit_instance: praw.Reddit):
+    reddit_instance: praw.Reddit,
+):
     test_comment = reddit_instance.comment(id=test_comment_id)
     test_formatter = FileNameFormatter(test_file_scheme, test_folder_scheme)
     test_entry = Resource(test_comment, '', '.json')
     result = test_formatter.format_path(test_entry, tmp_path)
     assert result.name == expected_name
+
+
+@pytest.mark.parametrize(('test_folder_scheme', 'expected'), (
+    ('{REDDITOR}/{SUBREDDIT}', 'person/randomreddit'),
+    ('{POSTID}/{SUBREDDIT}/{REDDITOR}', '12345/randomreddit/person'),
+))
+def test_multilevel_folder_scheme(
+    test_folder_scheme: str,
+    expected: str,
+    tmp_path: Path,
+    submission: MagicMock,
+):
+    test_formatter = FileNameFormatter('{POSTID}', test_folder_scheme)
+    test_resource = MagicMock()
+    test_resource.source_submission = submission
+    test_resource.extension = '.png'
+    result = test_formatter.format_path(test_resource, tmp_path)
+    result = result.relative_to(tmp_path)
+    assert str(result.parent) == expected
+    assert len(result.parents) == (len(expected.split('/')) + 1)
From bf078cbaf897fb18efd9543faa2eec514ca46c10 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 5 Apr 2021 14:47:39 +1000
Subject: [PATCH 213/276] Switch to rotating log files

---
 bulkredditdownloader/downloader.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py
index baee8f0..365f230 100644
--- a/bulkredditdownloader/downloader.py
+++ b/bulkredditdownloader/downloader.py
@@ -5,6 +5,7 @@ import configparser
 import hashlib
 import importlib.resources
 import logging
+import logging.handlers
 import os
 import re
 import shutil
@@ -169,10 +170,14 @@
     def _create_file_logger(self):
         main_logger = logging.getLogger()
-        file_handler = logging.FileHandler(
-            Path(self.config_directory, f'log_output_{datetime.now().isoformat()}.txt'),
-            mode='w',
+        log_path = Path(self.config_directory, 'log_output.txt')
+        file_handler = logging.handlers.RotatingFileHandler(
+            log_path,
+            mode='a',
+            backupCount=10,
+        )
+        if log_path.exists():
+            file_handler.doRollover()
         formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] -
%(message)s') file_handler.setFormatter(formatter) file_handler.setLevel(0) From 6c98829fd8ad5a8f1b7e28ff7d6b81daa958adc5 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 3 Apr 2021 19:41:00 +0300 Subject: [PATCH 214/276] test_imgur.py: Remove duplicate tests --- bulkredditdownloader/tests/site_downloaders/test_imgur.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py index 7e96740..451fa62 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -42,9 +42,6 @@ def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_d ('https://i.imgur.com/dLk3FGY.gifv', {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} ), - ('https://i.imgur.com/dLk3FGY.gifv', - {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4'} - ), )) def test_get_data_gif(test_url: str, expected_image_dict: dict): result = Imgur._get_data(test_url) From 4e35f0db2bbe429326000d14902dd31a46f0ee85 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 3 Apr 2021 19:41:56 +0300 Subject: [PATCH 215/276] test_download_factory: add a new case --- .../tests/site_downloaders/test_download_factory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py index 62a1409..be95656 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py @@ -25,6 +25,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube '_in_anything_but_comfort/', SelfPost), ('https://i.imgur.com/bZx1SJQ.jpg', Direct), ('https://i.redd.it/affyv0axd5k61.png', Direct), + ('https://imgur.com/3ls94yv.jpeg', Direct), ('https://i.imgur.com/BuzvZwb.gifv', Imgur), ('https://i.imgur.com/6fNdLst.gif', Direct), ('https://imgur.com/a/MkxAzeg', Imgur), From ab29e17511402f42e290866b726de98fca07926e Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 3 Apr 2021 19:44:53 +0300 Subject: [PATCH 216/276] download_factory.py: check if url has ext first --- .../site_downloaders/download_factory.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 46e5c9b..bd2a08a 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -22,7 +22,11 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: url_beginning = r'\s*(https?://(www\.)?)' - if re.match(url_beginning + r'erome\.com.*', url): + if re.match(url_beginning + r'i\.imgur.*\.gifv$', url): + return Imgur + elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): + return Direct + elif re.match(url_beginning + r'erome\.com.*', url): return Erome elif re.match(url_beginning + r'reddit\.com/gallery/.*', url): return Gallery @@ -32,8 +36,6 @@ class DownloadFactory: return GifDeliveryNetwork elif re.match(url_beginning + r'imgur.*', url): return Imgur - elif re.match(url_beginning + r'i\.imgur.*\.gifv$', url): - return Imgur elif re.match(url_beginning + r'redgifs.com', url): return Redgifs elif re.match(url_beginning + r'reddit\.com/r/', url): @@ -44,7 +46,5 @@ class DownloadFactory: return Youtube elif 
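Why the reordering in PATCH 216 above matters: the generic ends-with-an-extension pattern also matches `.gifv` pages, so the more specific Imgur check has to run first or those links would be routed to `Direct`. Both patterns match one of the test URLs:

import re

url_beginning = r'\s*(https?://(www\.)?)'
url = 'https://i.imgur.com/BuzvZwb.gifv'

assert re.match(url_beginning + r'i\.imgur.*\.gifv$', url)  # specific check, wins by running first
assert re.match(url_beginning + r'.*/.*\.\w{3,4}$', url)    # generic check would otherwise also match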
re.match(url_beginning + r'i\.redd\.it.*', url): return Direct - elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): - return Direct else: raise NotADownloadableLinkError(f'No downloader module exists for url {url}') From 4b49991cd63a6741ae34c816337f6b4640cdd9b1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 14:54:26 +1000 Subject: [PATCH 217/276] Correct logger message --- bulkredditdownloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 365f230..0a2d65c 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -352,7 +352,7 @@ class RedditDownloader: downloader = downloader_class(submission) logger.debug(f'Using {downloader_class.__name__} with url {submission.url}') except errors.NotADownloadableLinkError as e: - logger.error(f'Could not download submission {submission.name}: {e}') + logger.error(f'Could not download submission {submission.id}: {e}') return try: From c2c3d032056a18678bc554441b05d2a5eb93ee4e Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 15:03:09 +1000 Subject: [PATCH 218/276] Add support for direct links with parameters --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- .../tests/site_downloaders/test_download_factory.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index bd2a08a..0bbd231 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -24,7 +24,7 @@ class DownloadFactory: url_beginning = r'\s*(https?://(www\.)?)' if re.match(url_beginning + r'i\.imgur.*\.gifv$', url): return Imgur - elif re.match(url_beginning + r'.*/.*\.\w{3,4}$', url): + elif re.match(url_beginning + r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', url): return Direct elif re.match(url_beginning + r'erome\.com.*', url): return Erome diff --git a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py index be95656..e6bb7f4 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py @@ -37,6 +37,8 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('https://www.gifdeliverynetwork.com/repulsivefinishedandalusianhorse', GifDeliveryNetwork), ('https://youtu.be/DevfjHOhuFc', Youtube), ('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube), + ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), + ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) From b5b163084f81722fc5379b867e32784d0f70f4c4 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 15:11:17 +1000 Subject: [PATCH 219/276] Add more informative errors when downloading Resource --- bulkredditdownloader/resource.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index af39554..dd0592f 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -33,27 +33,30 @@ class 
Resource:
         if response.status_code == 200:
             return response.content
         elif response.status_code in (301, 401, 403, 404):
-            logger.error(f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
-            return None
+            raise BulkDownloaderException(f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
         else:
-            raise requests.exceptions.ConnectionError
-        except requests.exceptions.ConnectionError:
-            logger.log(9, f'Error occured downloading resource, waiting {wait_time} seconds')
+            raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
+        except requests.exceptions.ConnectionError as e:
+            logger.log(9, f'Error occurred downloading resource, waiting {wait_time} seconds: {e}')
             time.sleep(wait_time)
             if wait_time < 300:
                 return Resource.retry_download(url, wait_time + 60)
             else:
                 logger.error(f'Max wait time exceeded for resource at url {url}')
-                return None
+                raise
 
     def download(self):
         if not self.content:
-            content = self.retry_download(self.url, 0)
+            try:
+                content = self.retry_download(self.url, 0)
+            except requests.exceptions.ConnectionError as e:
+                raise BulkDownloaderException(f'Could not download resource: {e}')
+            except BulkDownloaderException:
+                raise
             if content:
                 self.content = content
-            self.create_hash()
-        else:
-            raise BulkDownloaderException('Could not download resource')
+        if not self.hash and self.content:
+            self.create_hash()
 
     def create_hash(self):
         self.hash = hashlib.md5(self.content)
From 9880d46853f7cd6b9f1bb5f5a72f2ae699788bee Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 5 Apr 2021 15:57:21 +1000
Subject: [PATCH 220/276] Fix quotes

---
 bulkredditdownloader/site_downloaders/youtube.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bulkredditdownloader/site_downloaders/youtube.py
index c1db496..e8cd696 100644
--- a/bulkredditdownloader/site_downloaders/youtube.py
+++ b/bulkredditdownloader/site_downloaders/youtube.py
@@ -22,9 +22,9 @@ class Youtube(BaseDownloader):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         ytdl_options = {
-            "format": "best",
-            "playlistend": 1,
-            "nooverwrites": True,
+            'format': 'best',
+            'playlistend': 1,
+            'nooverwrites': True,
         }
         out = self._download_video(ytdl_options)
         return [out]
From 936cbd4747c8ff2da0e8b4cd00944d8ffd1fdde7 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 5 Apr 2021 16:12:05 +1000
Subject: [PATCH 221/276] Update test hash

---
 bulkredditdownloader/tests/site_downloaders/test_youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bulkredditdownloader/tests/site_downloaders/test_youtube.py b/bulkredditdownloader/tests/site_downloaders/test_youtube.py
index 9689cd5..8af08d0 100644
--- a/bulkredditdownloader/tests/site_downloaders/test_youtube.py
+++ b/bulkredditdownloader/tests/site_downloaders/test_youtube.py
@@ -13,7 +13,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube
 @pytest.mark.slow
 @pytest.mark.parametrize(('test_url', 'expected_hash'), (
     ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '3c79a62898028987f94161e0abccbddf'),
-    ('https://www.youtube.com/watch?v=m-tKnjFwleU', '61651cc6f53782af50030c0a7dd0b6f6'),
+    ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'),
 ))
 def test_find_resources(test_url: str, expected_hash: str):
     test_submission = MagicMock()
     test_submission.url = test_url
From 500cee4bae47cdd7802f1d3e7b0b63dc18e9ccce Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Mon, 5 Apr 2021
17:20:49 +1000 Subject: [PATCH 222/276] Remove erroneous test case --- .../tests/site_downloaders/test_gif_delivery_network.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py b/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py index e4bec87..a4399d8 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py +++ b/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py @@ -15,8 +15,6 @@ from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDelive 'https://thumbs2.redgifs.com/RegalShoddyHorsechestnutleafminer.mp4'), ('https://www.gifdeliverynetwork.com/maturenexthippopotamus', 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), - ('https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4', - 'https://thumbs2.redgifs.com/MatureNextHippopotamus.mp4'), )) def test_get_link(test_url: str, expected: str): result = GifDeliveryNetwork._get_link(test_url) From 2384c03170c6eb8b6da7221e936c21b93093fe25 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 17:21:04 +1000 Subject: [PATCH 223/276] Refactor method to base class --- .../site_downloaders/base_downloader.py | 11 ++++++++++- bulkredditdownloader/site_downloaders/erome.py | 3 +-- bulkredditdownloader/site_downloaders/gallery.py | 11 +++++------ bulkredditdownloader/site_downloaders/gfycat.py | 10 ++-------- .../site_downloaders/gif_delivery_network.py | 9 ++------- bulkredditdownloader/site_downloaders/imgur.py | 8 ++------ bulkredditdownloader/site_downloaders/redgifs.py | 16 ++++++---------- 7 files changed, 28 insertions(+), 40 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 458f3bc..9c44de6 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -5,10 +5,12 @@ import logging from abc import ABC, abstractmethod from typing import Optional +import requests from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator +from bulkredditdownloader.exceptions import ResourceNotFound from bulkredditdownloader.resource import Resource +from bulkredditdownloader.site_authenticator import SiteAuthenticator logger = logging.getLogger(__name__) @@ -22,3 +24,10 @@ class BaseDownloader(ABC): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: """Return list of all un-downloaded Resources from submission""" raise NotImplementedError + + @staticmethod + def get_link(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + res = requests.get(url, cookies=cookies, headers=headers) + if res.status_code != 200: + raise ResourceNotFound(f'Server responded with {res.status_code} to {url}') + return res diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index d9b48a3..c452175 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -5,7 +5,6 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission from bulkredditdownloader.exceptions import NotADownloadableLinkError @@ -34,7 +33,7 @@ class Erome(BaseDownloader): @staticmethod def _get_links(url: str) -> set[str]: - page = requests.get(url) + page = Erome.get_link(url) soup = 
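The helper PATCH 223 adds to `BaseDownloader` above gives every site module one place where the HTTP status is checked before any parsing starts. A sketch of the resulting call pattern in a subclass; `fetch_page` is a stand-in name here, and the real helper raises the project's `ResourceNotFound` rather than `RuntimeError`:

import bs4
import requests


def fetch_page(url: str) -> bs4.BeautifulSoup:
    res = requests.get(url)
    if res.status_code != 200:
        raise RuntimeError(f'Server responded with {res.status_code} to {url}')
    return bs4.BeautifulSoup(res.text, 'html.parser')


soup = fetch_page('https://www.erome.com/a/vqtPuLXh')       # illustrative URL
images = soup.find_all('img', attrs={'class': 'lasyload'})  # class name taken from the Erome markup above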
bs4.BeautifulSoup(page.text, 'html.parser') front_images = soup.find_all('img', attrs={'class': 'lasyload'}) out = [im.get('data-src') for im in front_images] diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 22afc76..bc9390f 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -5,7 +5,6 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission from bulkredditdownloader.exceptions import ResourceNotFound @@ -28,12 +27,12 @@ class Gallery(BaseDownloader): @staticmethod def _get_links(url: str) -> list[str]: - page = requests.get(url, headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" - " Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + resource_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } - ) + page = Gallery.get_link(url, headers=resource_headers) soup = bs4.BeautifulSoup(page.text, 'html.parser') links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index a5051ca..d54fcf6 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -4,7 +4,6 @@ import json import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -22,19 +21,14 @@ class Gfycat(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - if re.match(r'\.(webm|mp4|gif)$', url): - return url - gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id - response = requests.get(url) - page_source = response.text - + response = Gfycat.get_link(url) if 'gifdeliverynetwork' in response.url: return GifDeliveryNetwork._get_link(url) - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(response.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) out = json.loads(content.contents[0]).get('video').get('contentUrl') diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 15ee76f..878dcb6 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 -import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -23,12 +21,9 @@ class GifDeliveryNetwork(BaseDownloader): @staticmethod def _get_link(url: str) -> str: - if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): - return url + page = GifDeliveryNetwork.get_link(url) - page_source = requests.get(url).text - - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) if content is None or content.get('src') is None: diff --git 
a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 9e311d6..4314db3 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -5,10 +5,9 @@ import re from typing import Optional import bs4 -import requests from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError, ResourceNotFound, SiteDownloaderError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -42,10 +41,7 @@ class Imgur(BaseDownloader): link = link.replace('i.imgur', 'imgur') link = link.rstrip('.gifv') - res = requests.get(link, cookies={'over18': '1', 'postpagebeta': '0'}) - - if res.status_code != 200: - raise ResourceNotFound(f'Server responded with {res.status_code} to {link}') + res = Imgur.get_link(link, cookies={'over18': '1', 'postpagebeta': '0'}) soup = bs4.BeautifulSoup(res.text, 'html.parser') scripts = soup.find_all('script', attrs={'type': 'text/javascript'}) diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index e4ee567..46adb8d 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -4,7 +4,6 @@ import json import re from typing import Optional -import requests from bs4 import BeautifulSoup from praw.models import Submission @@ -23,20 +22,17 @@ class Redgifs(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - if re.match(r'https://.*\.(mp4|webm|gif)(\?.*)?$', url): - return url - redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://redgifs.com/watch/' + redgif_id - headers = {'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64' - } + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', + } - page_source = requests.get(url, headers=headers).text + page = Redgifs.get_link(url, headers=headers) - soup = BeautifulSoup(page_source, 'html.parser') + soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) if content is None: From b2552710167555a8269697847f92c1eca543c490 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 21:32:39 +1000 Subject: [PATCH 224/276] Fix duplicate test name --- bulkredditdownloader/tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index e41b1c1..b38fbff 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -262,7 +262,7 @@ def test_cli_download_use_default_config(tmp_path: Path): @pytest.mark.parametrize('test_args', ( ['-l', 'm2601g', '--exclude-id', 'm2601g'], )) -def test_cli_download_links(test_args: list[str], tmp_path: Path): +def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): runner = CliRunner() test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = 
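Redgifs above (and Gfycat just before it) share one scraping pattern: find the JSON-LD script element and read `video.contentUrl` out of it. A self-contained sketch of the pattern against contrived markup:

import json

from bs4 import BeautifulSoup

html = ('<script data-react-helmet="true" type="application/ld+json">'
        '{"video": {"contentUrl": "https://thumbs2.redgifs.com/Example.mp4"}}'
        '</script>')
soup = BeautifulSoup(html, 'html.parser')
content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
out = json.loads(content.contents[0]).get('video').get('contentUrl')
assert out == 'https://thumbs2.redgifs.com/Example.mp4'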
runner.invoke(cli, test_args) From 1768096b8505c09b02519831cc4166b6df415d1c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 5 Apr 2021 21:34:03 +1000 Subject: [PATCH 225/276] Add filter for stream logger re exceptions --- bulkredditdownloader/__main__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index a3574e1..bc4e4e7 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -87,10 +87,18 @@ def cli_archive(context: click.Context, **_): def setup_logging(verbosity: int): + class StreamExceptionFilter(logging.Filter): + def filter(self, record: logging.LogRecord) -> bool: + result = not (record.levelno == logging.ERROR and record.exc_info) + return result + logger.setLevel(1) stream = logging.StreamHandler(sys.stdout) + stream.addFilter(StreamExceptionFilter()) + formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s') stream.setFormatter(formatter) + logger.addHandler(stream) if verbosity <= 0: stream.setLevel(logging.INFO) From 9cb4dd4cf3a32f78a0821eed6f7f012b8868a031 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 10:48:21 +1000 Subject: [PATCH 226/276] Rename function --- bulkredditdownloader/site_downloaders/base_downloader.py | 2 +- bulkredditdownloader/site_downloaders/erome.py | 2 +- bulkredditdownloader/site_downloaders/gallery.py | 2 +- bulkredditdownloader/site_downloaders/gfycat.py | 2 +- bulkredditdownloader/site_downloaders/gif_delivery_network.py | 2 +- bulkredditdownloader/site_downloaders/imgur.py | 2 +- bulkredditdownloader/site_downloaders/redgifs.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bulkredditdownloader/site_downloaders/base_downloader.py index 9c44de6..85eee0b 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bulkredditdownloader/site_downloaders/base_downloader.py @@ -26,7 +26,7 @@ class BaseDownloader(ABC): raise NotImplementedError @staticmethod - def get_link(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: res = requests.get(url, cookies=cookies, headers=headers) if res.status_code != 200: raise ResourceNotFound(f'Server responded with {res.status_code} to {url}') diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index c452175..c223cd1 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -33,7 +33,7 @@ class Erome(BaseDownloader): @staticmethod def _get_links(url: str) -> set[str]: - page = Erome.get_link(url) + page = Erome.retrieve_url(url) soup = bs4.BeautifulSoup(page.text, 'html.parser') front_images = soup.find_all('img', attrs={'class': 'lasyload'}) out = [im.get('data-src') for im in front_images] diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index bc9390f..8d7c074 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -32,7 +32,7 @@ class Gallery(BaseDownloader): ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', } - page = Gallery.get_link(url, headers=resource_headers) + page = Gallery.retrieve_url(url, 
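How the filter added in PATCH 225 above behaves: any ERROR record carrying exception info is dropped from the console stream, while a file handler without the filter would still record the full traceback. A runnable sketch:

import logging
import sys


class StreamExceptionFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        return not (record.levelno == logging.ERROR and record.exc_info)


logger = logging.getLogger(__name__)
stream = logging.StreamHandler(sys.stdout)
stream.addFilter(StreamExceptionFilter())
logger.addHandler(stream)

logger.error('printed to the console')
try:
    raise ValueError('demo')
except ValueError:
    logger.exception('dropped from the console; file handlers would still see it')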
headers=resource_headers) soup = bs4.BeautifulSoup(page.text, 'html.parser') links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')}) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index d54fcf6..6d1c3c7 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -24,7 +24,7 @@ class Gfycat(GifDeliveryNetwork): gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1) url = 'https://gfycat.com/' + gfycat_id - response = Gfycat.get_link(url) + response = Gfycat.retrieve_url(url) if 'gifdeliverynetwork' in response.url: return GifDeliveryNetwork._get_link(url) diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 878dcb6..2d433d5 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -21,7 +21,7 @@ class GifDeliveryNetwork(BaseDownloader): @staticmethod def _get_link(url: str) -> str: - page = GifDeliveryNetwork.get_link(url) + page = GifDeliveryNetwork.retrieve_url(url) soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 4314db3..d72f66a 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -41,7 +41,7 @@ class Imgur(BaseDownloader): link = link.replace('i.imgur', 'imgur') link = link.rstrip('.gifv') - res = Imgur.get_link(link, cookies={'over18': '1', 'postpagebeta': '0'}) + res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'}) soup = bs4.BeautifulSoup(res.text, 'html.parser') scripts = soup.find_all('script', attrs={'type': 'text/javascript'}) diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 46adb8d..8f16447 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -30,7 +30,7 @@ class Redgifs(GifDeliveryNetwork): ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64', } - page = Redgifs.get_link(url, headers=headers) + page = Redgifs.retrieve_url(url, headers=headers) soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) From a291104144b716793a96faca119a530caba90b24 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 11:04:08 +1000 Subject: [PATCH 227/276] Add defensive programming to site downloaders --- .../site_downloaders/erome.py | 5 +++-- .../site_downloaders/gallery.py | 4 ++-- .../site_downloaders/gfycat.py | 8 +++++++- .../site_downloaders/gif_delivery_network.py | 12 +++++++---- .../site_downloaders/imgur.py | 20 ++++++++++++------- .../site_downloaders/redgifs.py | 18 +++++++++++++---- 6 files changed, 47 insertions(+), 20 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bulkredditdownloader/site_downloaders/erome.py index c223cd1..ae896e2 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bulkredditdownloader/site_downloaders/erome.py @@ -7,7 +7,7 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from 
bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -21,8 +21,9 @@ class Erome(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: links = self._get_links(self.post.url) + if not links: - raise NotADownloadableLinkError('Erome parser could not find any links') + raise SiteDownloaderError('Erome parser could not find any links') out = [] for link in links: diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bulkredditdownloader/site_downloaders/gallery.py index 8d7c074..829951c 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bulkredditdownloader/site_downloaders/gallery.py @@ -7,7 +7,7 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import ResourceNotFound +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -22,7 +22,7 @@ class Gallery(BaseDownloader): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: image_urls = self._get_links(self.post.url) if not image_urls: - raise ResourceNotFound('No images found in Reddit gallery') + raise SiteDownloaderError('No images found in Reddit gallery') return [Resource(self.post, url) for url in image_urls] @staticmethod diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bulkredditdownloader/site_downloaders/gfycat.py index 6d1c3c7..62cee25 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bulkredditdownloader/site_downloaders/gfycat.py @@ -7,6 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission +from bulkredditdownloader.exceptions import SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -31,5 +32,10 @@ class Gfycat(GifDeliveryNetwork): soup = BeautifulSoup(response.text, 'html.parser') content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) - out = json.loads(content.contents[0]).get('video').get('contentUrl') + try: + out = json.loads(content.contents[0])['video']['contentUrl'] + except (IndexError, KeyError) as e: + raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}') + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Did not receive valid JSON data: {e}') return out diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bulkredditdownloader/site_downloaders/gif_delivery_network.py index 2d433d5..31d5660 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bulkredditdownloader/site_downloaders/gif_delivery_network.py @@ -5,7 +5,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from 
bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader @@ -26,7 +26,11 @@ class GifDeliveryNetwork(BaseDownloader): soup = BeautifulSoup(page.text, 'html.parser') content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'}) - if content is None or content.get('src') is None: - raise NotADownloadableLinkError('Could not read the page source') + try: + out = content['src'] + if not out: + raise KeyError + except KeyError: + raise SiteDownloaderError('Could not find source link') - return content.get('src') + return out diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index d72f66a..832729a 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -50,17 +50,23 @@ class Imgur(BaseDownloader): script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'') chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts)) if len(chosen_script) != 1: - raise NotADownloadableLinkError(f'Could not read page source from {link}') - else: - chosen_script = chosen_script[0] + raise SiteDownloaderError(f'Could not read page source from {link}') + + chosen_script = chosen_script[0] outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);') - image_dict = re.search(outer_regex, chosen_script).group(1) - inner_regex = re.compile(r'image\s*:(.*),\s*group') - image_dict = re.search(inner_regex, image_dict).group(1) + try: + image_dict = re.search(outer_regex, chosen_script).group(1) + image_dict = re.search(inner_regex, image_dict).group(1) + except AttributeError: + raise SiteDownloaderError(f'Could not find image dictionary in page source') + + try: + image_dict = json.loads(image_dict) + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}') - image_dict = json.loads(image_dict) return image_dict @staticmethod diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bulkredditdownloader/site_downloaders/redgifs.py index 8f16447..536532e 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bulkredditdownloader/site_downloaders/redgifs.py @@ -7,7 +7,7 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError +from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError from bulkredditdownloader.resource import Resource from bulkredditdownloader.site_authenticator import SiteAuthenticator from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork @@ -22,7 +22,11 @@ class Redgifs(GifDeliveryNetwork): @staticmethod def _get_link(url: str) -> str: - redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) + try: + redgif_id = re.match(r'.*/(.*?)/?$', url).group(1) + except AttributeError: + raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}') + url = 'https://redgifs.com/watch/' + redgif_id headers = { @@ -36,7 +40,13 @@ class Redgifs(GifDeliveryNetwork): content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'}) if content is None: - raise NotADownloadableLinkError('Could not read the page source') + raise SiteDownloaderError('Could not read the page source') + + try: + out = 
json.loads(content.contents[0])['video']['contentUrl'] + except (IndexError, KeyError): + raise SiteDownloaderError('Failed to find JSON data in page') + except json.JSONDecodeError as e: + raise SiteDownloaderError(f'Received data was not valid JSON: {e}') - out = json.loads(content.contents[0])['video']['contentUrl'] return out From 5fea34ffcee380b11a3a37b5f5f73e899490828f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 16:16:09 +1000 Subject: [PATCH 228/276] Fix test --- bulkredditdownloader/tests/test_downloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/test_downloader.py b/bulkredditdownloader/tests/test_downloader.py index 7b0f385..11cc759 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bulkredditdownloader/tests/test_downloader.py @@ -99,6 +99,7 @@ def test_create_sort_filter(test_sort: str, expected: str, downloader_mock: Magi ('{REDDITOR}_{TITLE}_{POSTID}', '{SUBREDDIT}'), ('{POSTID}', 'test'), ('{POSTID}', ''), + ('{POSTID}', '{SUBREDDIT}/{REDDITOR}'), )) def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: str, downloader_mock: MagicMock): downloader_mock.args.file_scheme = test_file_scheme @@ -107,7 +108,7 @@ def test_create_file_name_formatter(test_file_scheme: str, test_folder_scheme: s assert isinstance(result, FileNameFormatter) assert result.file_format_string == test_file_scheme - assert result.directory_format_string == test_folder_scheme + assert result.directory_format_string == test_folder_scheme.split('/') @pytest.mark.parametrize(('test_file_scheme', 'test_folder_scheme'), ( From 6704cd1dc0e4eaa3c37875bd51be36b010a74a19 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 16:16:26 +1000 Subject: [PATCH 229/276] Set log backup count from config --- README.md | 3 +++ bulkredditdownloader/default_config.cfg | 3 ++- bulkredditdownloader/downloader.py | 9 +++------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 605f1a2..daff766 100644 --- a/README.md +++ b/README.md @@ -198,12 +198,15 @@ The logging output for each run of the BDFR will be saved to this directory in t The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied. + - `backup_log_count` - `client_id` - `client_secret` - `scopes` All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default. +Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count`, however, has to do with log rollover. The logs in the configuration directory can be verbose and, for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs beyond this limit will overwrite the oldest log file, a process called "rolling over". If you want more records of past runs, increase this number. + ## Contributing If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.
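The rollover mechanism documented here can be condensed into a small standalone sketch. This is an editorial illustration only, assuming the `DEFAULT` section, the `config.cfg`/`log_output.txt` file names used in the hunks of this patch, and the documented default of 3; it is not the project's exact `_create_file_logger`:

```python
# Minimal sketch of the documented rollover: read `backup_log_count` from the
# config (falling back to the documented default of 3) and pass it to a
# RotatingFileHandler. With maxBytes left at 0 the handler never rolls over on
# its own, so an explicit doRollover() at startup gives each run a fresh log
# file while keeping at most `backup_count` previous ones.
import configparser
import logging
import logging.handlers
from pathlib import Path

cfg_parser = configparser.ConfigParser()
cfg_parser.read('config.cfg')

log_path = Path('log_output.txt')
backup_count = cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3)

file_handler = logging.handlers.RotatingFileHandler(log_path, mode='a', backupCount=backup_count)
if log_path.exists():
    file_handler.doRollover()  # rotate at startup so this run writes a fresh log

logging.getLogger().addHandler(file_handler)
```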
diff --git a/bulkredditdownloader/default_config.cfg b/bulkredditdownloader/default_config.cfg index f9a3f84..97d6bb9 100644 --- a/bulkredditdownloader/default_config.cfg +++ b/bulkredditdownloader/default_config.cfg @@ -1,4 +1,5 @@ [DEFAULT] client_id = U-6gk4ZCh3IeNQ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg -scopes = identity, history, read, save \ No newline at end of file +scopes = identity, history, read, save +backup_log_count = 3 \ No newline at end of file diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 0a2d65c..81a06c1 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -67,6 +67,7 @@ class RedditDownloader: def _setup_internal_objects(self): self._determine_directories() + self._load_config() self._create_file_logger() self.download_filter = self._create_download_filter() @@ -78,8 +79,6 @@ class RedditDownloader: self.file_name_formatter = self._create_file_name_formatter() logger.log(9, 'Create file name formatter') - self._load_config() - logger.debug(f'Configuration loaded from {self.config_location}') self._create_reddit_instance() self._resolve_user_name() @@ -147,8 +146,6 @@ class RedditDownloader: self.cfg_parser.read(cfg_path) self.config_location = cfg_path return - else: - logger.warning(f'Could not find config file at {self.args.config}, attempting to find elsewhere') possible_paths = [Path('./config.cfg'), Path('./default_config.cfg'), Path(self.config_directory, 'config.cfg'), @@ -163,7 +160,6 @@ class RedditDownloader: if not self.config_location: self.config_location = list(importlib.resources.path('bulkredditdownloader', 'default_config.cfg').gen)[0] shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) - logger.debug('Copied default config file from module to config folder') if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') self.cfg_parser.read(self.config_location) @@ -171,10 +167,11 @@ class RedditDownloader: def _create_file_logger(self): main_logger = logging.getLogger() log_path = Path(self.config_directory, 'log_output.txt') + backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3) file_handler = logging.handlers.RotatingFileHandler( log_path, mode='a', - backupCount=10, + backupCount=backup_count, ) if log_path.exists(): file_handler.doRollover() From 942ca2afea99d953e4611c12a91a2cf90a715618 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 6 Apr 2021 17:01:49 +1000 Subject: [PATCH 230/276] Update test --- bulkredditdownloader/tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index b38fbff..2991b68 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -283,4 +283,4 @@ def test_cli_file_scheme_warning(test_args: list[str], tmp_path: Path): test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert 'Post ID not included in this file scheme' in result.output + assert 'Some files might not be downloaded due to name conflicts' in result.output From 9bceafc3e9f004c9ac0f58e0557408736ece7b6c Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Tue, 6 Apr 2021 23:43:03 +1000 Subject: [PATCH 231/276] Parse unicode escapes 
in file name fields (#254) --- bulkredditdownloader/file_name_formatter.py | 4 +++- .../tests/test_file_name_formatter.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/file_name_formatter.py b/bulkredditdownloader/file_name_formatter.py index aac4b48..fcd5851 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bulkredditdownloader/file_name_formatter.py @@ -43,7 +43,9 @@ class FileNameFormatter: result = format_string for key in attributes.keys(): if re.search(fr'(?i).*{{{key}}}.*', result): - result = re.sub(fr'(?i){{{key}}}', str(attributes.get(key, 'unknown')), result) + key_value = attributes.get(key, 'unknown') + key_value = bytes(key_value, 'utf-8').decode('unicode-escape') + result = re.sub(fr'(?i){{{key}}}', key_value, result,) logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bulkredditdownloader/tests/test_file_name_formatter.py index bf8cee1..a3506e2 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bulkredditdownloader/tests/test_file_name_formatter.py @@ -294,3 +294,14 @@ def test_multilevel_folder_scheme( result = result.relative_to(tmp_path) assert str(result.parent) == expected assert len(result.parents) == (len(expected.split('/')) + 1) + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(('test_submission_id', 'test_file_scheme', 'expected'), ( + ('mecwk7', '{TITLE}', 'My cat’s paws are so cute'), # Unicode escape in title +)) +def test_edge_case_names(test_submission_id: str, test_file_scheme: str, expected: str, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + result = FileNameFormatter._format_name(test_submission, test_file_scheme) + assert result == expected From 7228bc572cb970950e6df2d696d366910f726705 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 6 Apr 2021 16:38:52 +0300 Subject: [PATCH 232/276] Add //imgur.com/*.gifv --- bulkredditdownloader/site_downloaders/download_factory.py | 2 +- .../tests/site_downloaders/test_download_factory.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index 0bbd231..a1aad75 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -22,7 +22,7 @@ class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: url_beginning = r'\s*(https?://(www\.)?)' - if re.match(url_beginning + r'i\.imgur.*\.gifv$', url): + if re.match(url_beginning + r'(i\.)?imgur.*\.gifv$', url): return Imgur elif re.match(url_beginning + r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', url): return Direct diff --git a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py index e6bb7f4..935bac3 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py @@ -27,6 +27,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('https://i.redd.it/affyv0axd5k61.png', Direct), ('https://imgur.com/3ls94yv.jpeg', Direct), ('https://i.imgur.com/BuzvZwb.gifv', Imgur), + ('https://imgur.com/BuzvZwb.gifv', Imgur), ('https://i.imgur.com/6fNdLst.gif', Direct), ('https://imgur.com/a/MkxAzeg', 
Imgur), ('https://www.reddit.com/gallery/lu93m7', Gallery), From c90d98ac72e74ff6370c44f5bc38f0e1fe1bbfbe Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 7 Apr 2021 09:19:19 +1000 Subject: [PATCH 233/276] Add fix for Imgur gifv links --- bulkredditdownloader/site_downloaders/imgur.py | 2 +- bulkredditdownloader/tests/site_downloaders/test_imgur.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bulkredditdownloader/site_downloaders/imgur.py index 832729a..943d27b 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bulkredditdownloader/site_downloaders/imgur.py @@ -37,7 +37,7 @@ class Imgur(BaseDownloader): @staticmethod def _get_data(link: str) -> dict: - if re.match(r'.*i\.imgur\.com.*\.gifv$', link): + if re.match(r'.*\.gifv$', link): link = link.replace('i.imgur', 'imgur') link = link.rstrip('.gifv') diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py index 451fa62..2b877f6 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -42,6 +42,9 @@ def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_d ('https://i.imgur.com/dLk3FGY.gifv', {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} ), + ('https://imgur.com/BuzvZwb.gifv', + {'hash': 'BuzvZwb', 'title': '', 'description': 'Akron Glass Works', 'animated': True, 'mimetype': 'video/mp4'}, + ) )) def test_get_data_gif(test_url: str, expected_image_dict: dict): result = Imgur._get_data(test_url) From 97b10ee4e330dfbeab3bf8127b5d5cae4b3f9774 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 8 Apr 2021 11:08:12 +1000 Subject: [PATCH 234/276] Fix line length --- bulkredditdownloader/resource.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index dd0592f..de23ef3 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -33,7 +33,8 @@ class Resource: if response.status_code == 200: return response.content elif response.status_code in (301, 401, 403, 404): - raise BulkDownloaderException(f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') + raise BulkDownloaderException( + f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') else: raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') except requests.exceptions.ConnectionError as e: From 9aa1383b4316de3d61cf4012c57dcc470eee3651 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 8 Apr 2021 11:24:55 +1000 Subject: [PATCH 235/276] Add support for mobile Imgur links --- .../site_downloaders/download_factory.py | 2 +- .../site_downloaders/test_download_factory.py | 1 + .../tests/site_downloaders/test_imgur.py | 51 +++++++++++++------ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bulkredditdownloader/site_downloaders/download_factory.py index a1aad75..ba5c7e3 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bulkredditdownloader/site_downloaders/download_factory.py @@ -34,7 +34,7 @@ class DownloadFactory: return Gfycat elif re.match(url_beginning + r'gifdeliverynetwork', url): return GifDeliveryNetwork - elif re.match(url_beginning + r'imgur.*', url): + elif re.match(url_beginning + r'(m\.)?imgur.*', url): return 
Imgur elif re.match(url_beginning + r'redgifs.com', url): return Redgifs diff --git a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py index 935bac3..830eeeb 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py +++ b/bulkredditdownloader/tests/site_downloaders/test_download_factory.py @@ -40,6 +40,7 @@ from bulkredditdownloader.site_downloaders.youtube import Youtube ('https://m.youtube.com/watch?v=kr-FeojxzUM', Youtube), ('https://i.imgur.com/3SKrQfK.jpg?1', Direct), ('https://dynasty-scans.com/system/images_images/000/017/819/original/80215103_p0.png?1612232781', Direct), + ('https://m.imgur.com/a/py3RW0j', Imgur), )) def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit): result = DownloadFactory.pull_lever(test_submission_url) diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py index 2b877f6..542db0b 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -12,21 +12,42 @@ from bulkredditdownloader.site_downloaders.imgur import Imgur @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_gen_dict', 'expected_image_dict'), ( - ('https://imgur.com/a/xWZsDDP', - {'num_images': '1', 'id': 'xWZsDDP', 'hash': 'xWZsDDP'}, - [{'hash': 'ypa8YfS', 'title': '', 'ext': '.png', 'animated': False}]), - ('https://imgur.com/gallery/IjJJdlC', - {'num_images': 1, 'id': 384898055, 'hash': 'IjJJdlC'}, - [{'hash': 'CbbScDt', 'description': 'watch when he gets it', 'ext': '.gif', 'animated': True, 'has_sound': False}], - ), - ('https://imgur.com/a/dcc84Gt', - {'num_images': '4', 'id': 'dcc84Gt', 'hash': 'dcc84Gt'}, - [ - {'hash': 'ylx0Kle', 'ext': '.jpg', 'title': ''}, - {'hash': 'TdYfKbK', 'ext': '.jpg', 'title': ''}, - {'hash': 'pCxGbe8', 'ext': '.jpg', 'title': ''}, - {'hash': 'TSAkikk', 'ext': '.jpg', 'title': ''}, - ]), + ( + 'https://imgur.com/a/xWZsDDP', + {'num_images': '1', 'id': 'xWZsDDP', 'hash': 'xWZsDDP'}, + [ + {'hash': 'ypa8YfS', 'title': '', 'ext': '.png', 'animated': False} + ] + ), + ( + 'https://imgur.com/gallery/IjJJdlC', + {'num_images': 1, 'id': 384898055, 'hash': 'IjJJdlC'}, + [ + {'hash': 'CbbScDt', + 'description': 'watch when he gets it', + 'ext': '.gif', + 'animated': True, + 'has_sound': False + } + ], + ), + ( + 'https://imgur.com/a/dcc84Gt', + {'num_images': '4', 'id': 'dcc84Gt', 'hash': 'dcc84Gt'}, + [ + {'hash': 'ylx0Kle', 'ext': '.jpg', 'title': ''}, + {'hash': 'TdYfKbK', 'ext': '.jpg', 'title': ''}, + {'hash': 'pCxGbe8', 'ext': '.jpg', 'title': ''}, + {'hash': 'TSAkikk', 'ext': '.jpg', 'title': ''}, + ] + ), + ( + 'https://m.imgur.com/a/py3RW0j', + {'num_images': '1', 'id': 'py3RW0j', 'hash': 'py3RW0j', }, + [ + {'hash': 'K24eQmK', 'has_sound': False, 'ext': '.jpg'} + ], + ), )) def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_dict: list[dict]): result = Imgur._get_data(test_url) From 49c82dc12a05989e000eb65265ee546e5aee7e3b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 8 Apr 2021 11:27:49 +1000 Subject: [PATCH 236/276] Fix some formatting --- .../tests/site_downloaders/test_imgur.py | 51 ++++++++++++------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py 
index 542db0b..bf842ff 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -60,12 +60,20 @@ def test_get_data_album(test_url: str, expected_gen_dict: dict, expected_image_d @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_image_dict'), ( - ('https://i.imgur.com/dLk3FGY.gifv', - {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} - ), - ('https://imgur.com/BuzvZwb.gifv', - {'hash': 'BuzvZwb', 'title': '', 'description': 'Akron Glass Works', 'animated': True, 'mimetype': 'video/mp4'}, - ) + ( + 'https://i.imgur.com/dLk3FGY.gifv', + {'hash': 'dLk3FGY', 'title': '', 'ext': '.mp4', 'animated': True} + ), + ( + 'https://imgur.com/BuzvZwb.gifv', + { + 'hash': 'BuzvZwb', + 'title': '', + 'description': 'Akron Glass Works', + 'animated': True, + 'mimetype': 'video/mp4' + }, + ), )) def test_get_data_gif(test_url: str, expected_image_dict: dict): result = Imgur._get_data(test_url) @@ -88,7 +96,7 @@ def test_imgur_extension_validation_good(test_extension: str): 'bad', '.avi', '.test', - '.flac' + '.flac', )) def test_imgur_extension_validation_bad(test_extension: str): with pytest.raises(SiteDownloaderError): @@ -97,18 +105,23 @@ def test_imgur_extension_validation_bad(test_extension: str): @pytest.mark.online @pytest.mark.parametrize(('test_url', 'expected_hashes'), ( - ('https://imgur.com/a/xWZsDDP', ( - 'f551d6e6b0fef2ce909767338612e31b', - )), - ('https://imgur.com/gallery/IjJJdlC', ( - '7227d4312a9779b74302724a0cfa9081', - )), - ('https://imgur.com/a/dcc84Gt', ( - 'cf1158e1de5c3c8993461383b96610cf', - '28d6b791a2daef8aa363bf5a3198535d', - '248ef8f2a6d03eeb2a80d0123dbaf9b6', - '029c475ce01b58fdf1269d8771d33913' - )), + ( + 'https://imgur.com/a/xWZsDDP', + ('f551d6e6b0fef2ce909767338612e31b',) + ), + ( + 'https://imgur.com/gallery/IjJJdlC', + ('7227d4312a9779b74302724a0cfa9081',), + ), + ( + 'https://imgur.com/a/dcc84Gt', + ( + 'cf1158e1de5c3c8993461383b96610cf', + '28d6b791a2daef8aa363bf5a3198535d', + '248ef8f2a6d03eeb2a80d0123dbaf9b6', + '029c475ce01b58fdf1269d8771d33913', + ), + ), )) def test_find_resources(test_url: str, expected_hashes: list[str]): mock_download = Mock() From 37ca4134245ab070b46366912e030c93b11bec98 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 8 Apr 2021 13:00:58 +1000 Subject: [PATCH 237/276] Change logger message to warning --- bulkredditdownloader/resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index de23ef3..92d1633 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -38,7 +38,7 @@ class Resource: else: raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') except requests.exceptions.ConnectionError as e: - logger.log(9, f'Error occured downloading resource, waiting {wait_time} seconds: {e}') + logger.warning(f'Error occurred downloading from {url}, waiting {wait_time} seconds: {e}') time.sleep(wait_time) if wait_time < 300: return Resource.retry_download(url, wait_time + 60) From 34c8a9a5d07d285036fdc77011348688ce262ec9 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Fri, 9 Apr 2021 23:15:45 +1000 Subject: [PATCH 238/276] Add option to download user comments (#258) * Add option to download user comments * Update README --- README.md | 2 ++ bulkredditdownloader/__main__.py | 1 + bulkredditdownloader/archiver.py | 9 +++++++++
bulkredditdownloader/configuration.py | 1 + bulkredditdownloader/tests/test_integration.py | 13 +++++++++++++ 5 files changed, 26 insertions(+) diff --git a/README.md b/README.md index daff766..082da6e 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,8 @@ The following options apply only to the `download` command. This command downloa The following options are for the `archive` command specifically. +- `--all-comments` + - When combined with the `--user` option, this will download all the user's comments - `-f, --format` - This specifies the format of the data file saved to disk - The following formats are available: diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index bc4e4e7..4c1158e 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -70,6 +70,7 @@ def cli_download(context: click.Context, **_): @cli.command('archive') @_add_common_options +@click.option('--all-comments', is_flag=True, default=None) @click.option('-f,', '--format', type=click.Choice(('xml', 'json', 'yaml')), default=None) @click.pass_context def cli_archive(context: click.Context, **_): diff --git a/bulkredditdownloader/archiver.py b/bulkredditdownloader/archiver.py index db4ee92..b44d4de 100644 --- a/bulkredditdownloader/archiver.py +++ b/bulkredditdownloader/archiver.py @@ -4,6 +4,7 @@ import json import logging import re +from typing import Iterator import dict2xml import praw.models @@ -41,6 +42,14 @@ class Archiver(RedditDownloader): supplied_submissions.append(self.reddit_instance.submission(url=sub_id)) return [supplied_submissions] + def _get_user_data(self) -> list[Iterator]: + results = super(Archiver, self)._get_user_data() + if self.args.user and self.args.all_comments: + sort = self._determine_sort_function() + logger.debug(f'Retrieving comments of user {self.args.user}') + results.append(sort(self.reddit_instance.redditor(self.args.user).comments, limit=self.args.limit)) + return results + @staticmethod def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Comment)) -> BaseArchiveEntry: if isinstance(praw_item, praw.models.Submission): diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index 5cb23b3..aa634b6 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -37,6 +37,7 @@ class Configuration(Namespace): # Archiver-specific options self.format = 'json' + self.all_comments = False def process_click_arguments(self, context: click.Context): for arg_key in context.params.keys(): diff --git a/bulkredditdownloader/tests/test_integration.py b/bulkredditdownloader/tests/test_integration.py index 2991b68..23cdbd4 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bulkredditdownloader/tests/test_integration.py @@ -202,6 +202,19 @@ def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): assert re.search(r'Writing entry .*? to file in .*? 
format', result.output) +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.parametrize('test_args', ( + ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], +)) +def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['archive', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow From 0c960a4d0c2b0d0fb857ca4cc41b22ed812c0b53 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 10 Apr 2021 14:41:42 +1000 Subject: [PATCH 239/276] Add minimum python version --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index eb3ac3d..4c907fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,6 +2,7 @@ name = Bulk Downloader for Reddit author = Ali Parlakci author-email = parlakciali@gmail.com +python_requires = >=3.9 [files] packages = bulkredditdownloader From aa8032e95f95b7020f66a2e33a4e391c9eb68bb0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 16:47:04 +1000 Subject: [PATCH 240/276] Use config value to specify max wait time --- bulkredditdownloader/default_config.cfg | 3 ++- bulkredditdownloader/downloader.py | 2 +- bulkredditdownloader/resource.py | 11 ++++++----- .../tests/site_downloaders/test_direct.py | 2 +- .../tests/site_downloaders/test_erome.py | 2 +- .../tests/site_downloaders/test_gallery.py | 2 +- .../tests/site_downloaders/test_gfycat.py | 2 +- .../site_downloaders/test_gif_delivery_network.py | 2 +- .../tests/site_downloaders/test_imgur.py | 2 +- .../tests/site_downloaders/test_redgifs.py | 2 +- .../tests/site_downloaders/test_vreddit.py | 2 +- .../tests/site_downloaders/test_youtube.py | 2 +- bulkredditdownloader/tests/test_resource.py | 2 +- 13 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bulkredditdownloader/default_config.cfg b/bulkredditdownloader/default_config.cfg index 97d6bb9..1bcb02b 100644 --- a/bulkredditdownloader/default_config.cfg +++ b/bulkredditdownloader/default_config.cfg @@ -2,4 +2,5 @@ client_id = U-6gk4ZCh3IeNQ client_secret = 7CZHY6AmKweZME5s50SfDGylaPg scopes = identity, history, read, save -backup_log_count = 3 \ No newline at end of file +backup_log_count = 3 +max_wait_time = 120 \ No newline at end of file diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 81a06c1..02c3511 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -362,7 +362,7 @@ class RedditDownloader: logger.debug(f'File {destination} already exists, continuing') else: try: - res.download() + res.download(self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120)) except errors.BulkDownloaderException as e: logger.error( f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}') diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index 92d1633..b925bd3 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -27,7 +27,8 @@ class Resource: self.extension = self._determine_extension() @staticmethod - def retry_download(url: str, wait_time: int) -> Optional[bytes]: + def retry_download(url: str, max_wait_time: int) -> Optional[bytes]: + wait_time = 60 try: response = requests.get(url) if response.status_code == 200: @@ -40,16 
+41,16 @@ class Resource: except requests.exceptions.ConnectionError as e: logger.warning(f'Error occurred downloading from {url}, waiting {wait_time} seconds: {e}') time.sleep(wait_time) - if wait_time < 300: - return Resource.retry_download(url, wait_time + 60) + if wait_time < max_wait_time: + return Resource.retry_download(url, max_wait_time) else: logger.error(f'Max wait time exceeded for resource at url {url}') raise - def download(self): + def download(self, max_wait_time: int): if not self.content: try: - content = self.retry_download(self.url, 0) + content = self.retry_download(self.url, max_wait_time) except requests.exceptions.ConnectionError as e: raise BulkDownloaderException(f'Could not download resource: {e}') except BulkDownloaderException: diff --git a/bulkredditdownloader/tests/site_downloaders/test_direct.py b/bulkredditdownloader/tests/site_downloaders/test_direct.py index 9f8163a..3e891b9 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_direct.py +++ b/bulkredditdownloader/tests/site_downloaders/test_direct.py @@ -21,5 +21,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/site_downloaders/test_erome.py b/bulkredditdownloader/tests/site_downloaders/test_erome.py index 2fb7cf6..8a052a0 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_erome.py +++ b/bulkredditdownloader/tests/site_downloaders/test_erome.py @@ -52,6 +52,6 @@ def test_download_resource(test_url: str, expected_hashes: tuple[str]): mock_submission.url = test_url test_site = Erome(mock_submission) resources = test_site.find_resources() - [res.download() for res in resources] + [res.download(120) for res in resources] resource_hashes = [res.hash.hexdigest() for res in resources] assert len(resource_hashes) == len(expected_hashes) diff --git a/bulkredditdownloader/tests/site_downloaders/test_gallery.py b/bulkredditdownloader/tests/site_downloaders/test_gallery.py index d33c632..f61e04f 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gallery.py +++ b/bulkredditdownloader/tests/site_downloaders/test_gallery.py @@ -55,6 +55,6 @@ def test_gallery_download(test_submission_id: str, expected_hashes: set[str], re test_submission = reddit_instance.submission(id=test_submission_id) gallery = Gallery(test_submission) results = gallery.find_resources() - [res.download() for res in results] + [res.download(120) for res in results] hashes = [res.hash.hexdigest() for res in results] assert set(hashes) == expected_hashes diff --git a/bulkredditdownloader/tests/site_downloaders/test_gfycat.py b/bulkredditdownloader/tests/site_downloaders/test_gfycat.py index 5babe8c..8290914 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gfycat.py +++ b/bulkredditdownloader/tests/site_downloaders/test_gfycat.py @@ -32,5 +32,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py b/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py index a4399d8..9856536 100644 --- 
a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py +++ b/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py @@ -33,5 +33,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bulkredditdownloader/tests/site_downloaders/test_imgur.py index bf842ff..2f22c2b 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bulkredditdownloader/tests/site_downloaders/test_imgur.py @@ -129,7 +129,7 @@ def test_find_resources(test_url: str, expected_hashes: list[str]): downloader = Imgur(mock_download) results = downloader.find_resources() assert all([isinstance(res, Resource) for res in results]) - [res.download() for res in results] + [res.download(120) for res in results] hashes = set([res.hash.hexdigest() for res in results]) assert len(results) == len(expected_hashes) assert hashes == set(expected_hashes) diff --git a/bulkredditdownloader/tests/site_downloaders/test_redgifs.py b/bulkredditdownloader/tests/site_downloaders/test_redgifs.py index 2aa4227..0d8cb79 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_redgifs.py +++ b/bulkredditdownloader/tests/site_downloaders/test_redgifs.py @@ -33,5 +33,5 @@ def test_download_resource(test_url: str, expected_hash: str): resources = test_site.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/site_downloaders/test_vreddit.py b/bulkredditdownloader/tests/site_downloaders/test_vreddit.py index bf96d67..d7377b6 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_vreddit.py +++ b/bulkredditdownloader/tests/site_downloaders/test_vreddit.py @@ -19,5 +19,5 @@ def test_find_resources(test_submission_id: str, expected_hash: str, reddit_inst resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/site_downloaders/test_youtube.py b/bulkredditdownloader/tests/site_downloaders/test_youtube.py index 8af08d0..8c7bfca 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_youtube.py +++ b/bulkredditdownloader/tests/site_downloaders/test_youtube.py @@ -22,5 +22,5 @@ def test_find_resources(test_url: str, expected_hash: str): resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) - resources[0].download() + resources[0].download(120) assert resources[0].hash.hexdigest() == expected_hash diff --git a/bulkredditdownloader/tests/test_resource.py b/bulkredditdownloader/tests/test_resource.py index c2647b3..0292f06 100644 --- a/bulkredditdownloader/tests/test_resource.py +++ b/bulkredditdownloader/tests/test_resource.py @@ -28,5 +28,5 @@ def test_resource_get_extension(test_url: str, expected: str): )) def test_download_online_resource(test_url: str, expected_hash: str): test_resource = Resource(MagicMock(), test_url) - test_resource.download() + test_resource.download(120) assert test_resource.hash.hexdigest() == expected_hash From 
af95f3ff66c8f9d9385e5585394dac2cb97876e8 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 16:53:01 +1000 Subject: [PATCH 241/276] Fix some formatting --- bulkredditdownloader/downloader.py | 39 ++++++++++++++++++------------ 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 02c3511..3ad1918 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -101,7 +101,8 @@ class RedditDownloader: oauth2_authenticator = OAuth2Authenticator( scopes, self.cfg_parser.get('DEFAULT', 'client_id'), - self.cfg_parser.get('DEFAULT', 'client_secret')) + self.cfg_parser.get('DEFAULT', 'client_secret'), + ) token = oauth2_authenticator.retrieve_new_token() self.cfg_parser['DEFAULT']['user_token'] = token with open(self.config_location, 'w') as file: @@ -109,16 +110,20 @@ class RedditDownloader: token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location) self.authenticated = True - self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'), - client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), - user_agent=socket.gethostname(), - token_manager=token_manager) + self.reddit_instance = praw.Reddit( + client_id=self.cfg_parser.get('DEFAULT', 'client_id'), + client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), + user_agent=socket.gethostname(), + token_manager=token_manager, + ) else: logger.debug('Using unauthenticated Reddit instance') self.authenticated = False - self.reddit_instance = praw.Reddit(client_id=self.cfg_parser.get('DEFAULT', 'client_id'), - client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), - user_agent=socket.gethostname()) + self.reddit_instance = praw.Reddit( + client_id=self.cfg_parser.get('DEFAULT', 'client_id'), + client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'), + user_agent=socket.gethostname(), + ) def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: master_list = [] @@ -146,11 +151,12 @@ class RedditDownloader: self.cfg_parser.read(cfg_path) self.config_location = cfg_path return - possible_paths = [Path('./config.cfg'), - Path('./default_config.cfg'), - Path(self.config_directory, 'config.cfg'), - Path(self.config_directory, 'default_config.cfg'), - ] + possible_paths = [ + Path('./config.cfg'), + Path('./default_config.cfg'), + Path(self.config_directory, 'config.cfg'), + Path(self.config_directory, 'default_config.cfg'), + ] self.config_location = None for path in possible_paths: if path.resolve().expanduser().exists(): @@ -210,7 +216,8 @@ class RedditDownloader: reddit.search( self.args.search, sort=self.sort_filter.name.lower(), - limit=self.args.limit)) + limit=self.args.limit, + )) logger.debug( f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"') else: @@ -280,7 +287,9 @@ class RedditDownloader: logger.debug(f'Retrieving submitted posts of user {self.args.user}') generators.append( sort_function( - self.reddit_instance.redditor(self.args.user).submissions, limit=self.args.limit)) + self.reddit_instance.redditor(self.args.user).submissions, + limit=self.args.limit, + )) if not self.authenticated and any((self.args.upvoted, self.args.saved)): logger.warning('Accessing user lists requires authentication') else: From 36291d5ea6645105d5846ff92bfe9f05066a1473 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 17:12:41 +1000 Subject: [PATCH 242/276] Add option to specify wait time as option --- 
bulkredditdownloader/__main__.py | 1 + bulkredditdownloader/configuration.py | 1 + bulkredditdownloader/downloader.py | 16 +++++++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/__main__.py b/bulkredditdownloader/__main__.py index 4c1158e..fb0081a 100644 --- a/bulkredditdownloader/__main__.py +++ b/bulkredditdownloader/__main__.py @@ -48,6 +48,7 @@ def cli(): @click.option('--file-scheme', default=None, type=str) @click.option('--folder-scheme', default=None, type=str) @click.option('--make-hard-links', is_flag=True, default=None) +@click.option('--max-wait-time', type=int, default=None) @click.option('--no-dupes', is_flag=True, default=None) @click.option('--search-existing', is_flag=True, default=None) @click.option('--skip', default=None, multiple=True) diff --git a/bulkredditdownloader/configuration.py b/bulkredditdownloader/configuration.py index aa634b6..e6d0af7 100644 --- a/bulkredditdownloader/configuration.py +++ b/bulkredditdownloader/configuration.py @@ -17,6 +17,7 @@ class Configuration(Namespace): self.exclude_id_file = [] self.limit: Optional[int] = None self.link: list[str] = [] + self.max_wait_time = None self.multireddit: list[str] = [] self.no_dupes: bool = False self.saved: bool = False diff --git a/bulkredditdownloader/downloader.py b/bulkredditdownloader/downloader.py index 3ad1918..434eb74 100644 --- a/bulkredditdownloader/downloader.py +++ b/bulkredditdownloader/downloader.py @@ -70,6 +70,8 @@ class RedditDownloader: self._load_config() self._create_file_logger() + self._read_config() + self.download_filter = self._create_download_filter() logger.log(9, 'Created download filter') self.time_filter = self._create_time_filter() @@ -91,6 +93,18 @@ class RedditDownloader: self.authenticator = self._create_authenticator() logger.log(9, 'Created site authenticator') + def _read_config(self): + """Read any cfg values that need to be processed""" + if self.args.max_wait_time is None: + if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'): + self.cfg_parser.set('DEFAULT', 'max_wait_time', '120') + logger.log(9, 'Wrote default download wait time to config file') + self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time') + logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds') + # Update config on disk + with open(self.config_location, 'w') as file: + self.cfg_parser.write(file) + def _create_reddit_instance(self): if self.args.authenticate: logger.debug('Using authenticated Reddit instance') @@ -371,7 +385,7 @@ class RedditDownloader: logger.debug(f'File {destination} already exists, continuing') else: try: - res.download(self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120)) + res.download(self.args.max_wait_time) except errors.BulkDownloaderException as e: logger.error( f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}') From ef0b2d437f70bc593f740fb7b10e90fe147b3ddb Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 17:30:06 +1000 Subject: [PATCH 243/276] Update list of unrecoverable HTTP codes --- bulkredditdownloader/resource.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bulkredditdownloader/resource.py b/bulkredditdownloader/resource.py index b925bd3..6577593 100644 --- a/bulkredditdownloader/resource.py +++ b/bulkredditdownloader/resource.py @@ -33,7 +33,18 @@ class Resource: response = requests.get(url) if response.status_code == 200: return 
response.content - elif response.status_code in (301, 401, 403, 404): + elif response.status_code in ( + 301, + 400, + 401, + 403, + 404, + 407, + 410, + 500, + 501, + 502, + ): raise BulkDownloaderException( f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') else: From 3da58dbd5d3a2497d8103cb5fba6f5ea01ec5f90 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 17:44:11 +1000 Subject: [PATCH 244/276] Update README --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 082da6e..a5569a4 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,10 @@ The following options apply only to the `download` command. This command downloa - `--make-hard-links` - This flag will create hard links to an existing file when a duplicate is downloaded - This will make the file appear in multiple directories while only taking the space of a single instance +- `--max-wait-time` - This option specifies the maximum wait time for downloading a resource - The default is 120 seconds - See [Rate Limiting](#rate-limiting) for details - `--no-dupes` - This flag will not redownload files if they already exist somewhere in the root folder tree - This is calculated by MD5 hash @@ -201,6 +205,7 @@ The logging output for each run of the BDFR will be saved to this directory in t The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied. - `backup_log_count` + - `max_wait_time` - `client_id` - `client_secret` - `scopes` @@ -209,6 +214,14 @@ All of these should not be modified unless you know what you're doing, as the de Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count` however has to do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, increase this number. +#### Rate Limiting + +The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of retrying will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases. + +To this end, the BDFR will sleep for a time before retrying the download, giving the remote server time to "rest". This is done in 60 second increments. For example, if a rate-limiting-related error is given, the BDFR will sleep for 60 seconds before retrying. Then, if the same type of error occurs, it will sleep for another 120 seconds, then 180 seconds, and so on. + +The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move on. 
**Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded i.e. specifying a max value of 300 (5 minutes), can make the BDFR pause for 15 minutes on one submission, not 5, in the worst case. + ## Contributing If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information. From bd9f276acc2b2c74fa56f05bc0c999d00826834b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 17:58:32 +1000 Subject: [PATCH 245/276] Rename module --- README.md | 16 ++++++------ {bulkredditdownloader => bdfr}/__init__.py | 0 {bulkredditdownloader => bdfr}/__main__.py | 6 ++--- .../archive_entry/__init__.py | 0 .../archive_entry/base_archive_entry.py | 0 .../archive_entry/comment_archive_entry.py | 2 +- .../archive_entry/submission_archive_entry.py | 2 +- {bulkredditdownloader => bdfr}/archiver.py | 14 +++++----- .../configuration.py | 0 .../default_config.cfg | 0 .../download_filter.py | 0 {bulkredditdownloader => bdfr}/downloader.py | 18 ++++++------- {bulkredditdownloader => bdfr}/exceptions.py | 0 .../file_name_formatter.py | 4 +-- {bulkredditdownloader => bdfr}/oauth2.py | 2 +- {bulkredditdownloader => bdfr}/resource.py | 2 +- .../site_authenticator.py | 0 .../site_downloaders/__init__.py | 0 .../site_downloaders/base_downloader.py | 6 ++--- .../site_downloaders/direct.py | 6 ++--- .../site_downloaders/download_factory.py | 24 ++++++++--------- .../site_downloaders/erome.py | 8 +++--- .../site_downloaders/gallery.py | 8 +++--- .../site_downloaders/gfycat.py | 8 +++--- .../site_downloaders/gif_delivery_network.py | 8 +++--- .../site_downloaders/imgur.py | 8 +++--- .../site_downloaders/redgifs.py | 8 +++--- .../site_downloaders/self_post.py | 6 ++--- .../site_downloaders/vreddit.py | 6 ++--- .../site_downloaders/youtube.py | 8 +++--- .../tests/__init__.py | 0 .../tests/archive_entry/__init__.py | 0 .../test_comment_archive_entry.py | 2 +- .../test_submission_archive_entry.py | 2 +- .../tests/conftest.py | 2 +- .../tests/site_downloaders/__init__.py | 0 .../tests/site_downloaders/test_direct.py | 4 +-- .../site_downloaders/test_download_factory.py | 26 +++++++++---------- .../tests/site_downloaders/test_erome.py | 2 +- .../tests/site_downloaders/test_gallery.py | 2 +- .../tests/site_downloaders/test_gfycat.py | 4 +-- .../test_gif_delivery_network.py | 4 +-- .../tests/site_downloaders/test_imgur.py | 6 ++--- .../tests/site_downloaders/test_redgifs.py | 4 +-- .../tests/site_downloaders/test_self_post.py | 4 +-- .../tests/site_downloaders/test_vreddit.py | 4 +-- .../tests/site_downloaders/test_youtube.py | 4 +-- .../tests/test_archiver.py | 4 +-- .../tests/test_configuration.py | 2 +- .../tests/test_download_filter.py | 2 +- .../tests/test_downloader.py | 14 +++++----- .../tests/test_file_name_formatter.py | 4 +-- .../tests/test_integration.py | 2 +- .../tests/test_oauth2.py | 4 +-- .../tests/test_resource.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 57 files changed, 139 insertions(+), 139 deletions(-) rename {bulkredditdownloader => bdfr}/__init__.py (100%) rename {bulkredditdownloader => bdfr}/__main__.py (95%) rename {bulkredditdownloader => bdfr}/archive_entry/__init__.py (100%) rename {bulkredditdownloader => bdfr}/archive_entry/base_archive_entry.py (100%) rename {bulkredditdownloader => bdfr}/archive_entry/comment_archive_entry.py (86%) 
rename {bulkredditdownloader => bdfr}/archive_entry/submission_archive_entry.py (94%) rename {bulkredditdownloader => bdfr}/archiver.py (88%) rename {bulkredditdownloader => bdfr}/configuration.py (100%) rename {bulkredditdownloader => bdfr}/default_config.cfg (100%) rename {bulkredditdownloader => bdfr}/download_filter.py (100%) rename {bulkredditdownloader => bdfr}/downloader.py (96%) rename {bulkredditdownloader => bdfr}/exceptions.py (100%) rename {bulkredditdownloader => bdfr}/file_name_formatter.py (98%) rename {bulkredditdownloader => bdfr}/oauth2.py (97%) rename {bulkredditdownloader => bdfr}/resource.py (97%) rename {bulkredditdownloader => bdfr}/site_authenticator.py (100%) rename {bulkredditdownloader => bdfr}/site_downloaders/__init__.py (100%) rename {bulkredditdownloader => bdfr}/site_downloaders/base_downloader.py (83%) rename {bulkredditdownloader => bdfr}/site_downloaders/direct.py (63%) rename {bulkredditdownloader => bdfr}/site_downloaders/download_factory.py (63%) rename {bulkredditdownloader => bdfr}/site_downloaders/erome.py (81%) rename {bulkredditdownloader => bdfr}/site_downloaders/gallery.py (83%) rename {bulkredditdownloader => bdfr}/site_downloaders/gfycat.py (81%) rename {bulkredditdownloader => bdfr}/site_downloaders/gif_delivery_network.py (75%) rename {bulkredditdownloader => bdfr}/site_downloaders/imgur.py (90%) rename {bulkredditdownloader => bdfr}/site_downloaders/redgifs.py (84%) rename {bulkredditdownloader => bdfr}/site_downloaders/self_post.py (86%) rename {bulkredditdownloader => bdfr}/site_downloaders/vreddit.py (68%) rename {bulkredditdownloader => bdfr}/site_downloaders/youtube.py (84%) rename {bulkredditdownloader => bdfr}/tests/__init__.py (100%) rename {bulkredditdownloader => bdfr}/tests/archive_entry/__init__.py (100%) rename {bulkredditdownloader => bdfr}/tests/archive_entry/test_comment_archive_entry.py (92%) rename {bulkredditdownloader => bdfr}/tests/archive_entry/test_submission_archive_entry.py (92%) rename {bulkredditdownloader => bdfr}/tests/conftest.py (95%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/__init__.py (100%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_direct.py (86%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_download_factory.py (68%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_erome.py (97%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_gallery.py (97%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_gfycat.py (91%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_gif_delivery_network.py (89%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_imgur.py (95%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_redgifs.py (91%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_self_post.py (85%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_vreddit.py (84%) rename {bulkredditdownloader => bdfr}/tests/site_downloaders/test_youtube.py (86%) rename {bulkredditdownloader => bdfr}/tests/test_archiver.py (93%) rename {bulkredditdownloader => bdfr}/tests/test_configuration.py (90%) rename {bulkredditdownloader => bdfr}/tests/test_download_filter.py (96%) rename {bulkredditdownloader => bdfr}/tests/test_downloader.py (97%) rename {bulkredditdownloader => bdfr}/tests/test_file_name_formatter.py (98%) rename {bulkredditdownloader => bdfr}/tests/test_integration.py (99%) rename {bulkredditdownloader => bdfr}/tests/test_oauth2.py 
(93%) rename {bulkredditdownloader => bdfr}/tests/test_resource.py (95%) diff --git a/README.md b/README.md index a5569a4..519bad1 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,10 @@ This is a tool to download submissions or submission data from Reddit. It can be Some quick reference commands are: - - `python3 -m bulkredditdownloader download --subreddit Python -L 10` - - `python3 -m bulkredditdownloader download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'` - - `python3 -m bulkredditdownloader download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links` - - `python3 -m bulkredditdownloader archive --subreddit all --format yaml -L 500 --folder-scheme ''` + - `python3 -m bdfr download --subreddit Python -L 10` + - `python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'` + - `python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links` + - `python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme ''` ## Usage @@ -189,14 +189,14 @@ It is highly recommended that the file name scheme contain the parameter `{POSTI ## Configuration The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be: - - `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bulkredditdownloader` or - - `C:\Documents and Settings\\Application Data\BDFR\bulkredditdownloader` + - `C:\Documents and Settings\\Application Data\Local Settings\BDFR\bdfr` or + - `C:\Documents and Settings\\Application Data\BDFR\bdfr` On Mac OSX, this will be: - - `~/Library/Application Support/bulkredditdownloader`. + - `~/Library/Application Support/bdfr`. Lastly, on a Linux system, this will be: - - `~/.local/share/bulkredditdownloader` + - `~/.local/share/bdfr` The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report. 
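As an aside on where these per-OS paths come from: the BDFR resolves them with the third-party `appdirs` package, via the `AppDirs('bdfr', 'BDFR')` call visible in the renamed `downloader.py` below. A minimal sketch of that lookup (the printed locations vary by OS, as listed above):

```python
import appdirs

# The same application name and author that the BDFR passes in downloader.py.
dirs = appdirs.AppDirs('bdfr', 'BDFR')

# appdirs resolves OS-appropriate directories; the per-OS paths listed
# above are examples of what these attributes evaluate to.
print(dirs.user_config_dir)
print(dirs.user_data_dir)
```

Because the path is derived from the application name, the module rename in this patch is also what moves the configuration directory from `.../bulkredditdownloader` to `.../bdfr`.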
diff --git a/bulkredditdownloader/__init__.py b/bdfr/__init__.py similarity index 100% rename from bulkredditdownloader/__init__.py rename to bdfr/__init__.py diff --git a/bulkredditdownloader/__main__.py b/bdfr/__main__.py similarity index 95% rename from bulkredditdownloader/__main__.py rename to bdfr/__main__.py index fb0081a..c58f7ca 100644 --- a/bulkredditdownloader/__main__.py +++ b/bdfr/__main__.py @@ -5,9 +5,9 @@ import sys import click -from bulkredditdownloader.archiver import Archiver -from bulkredditdownloader.configuration import Configuration -from bulkredditdownloader.downloader import RedditDownloader +from bdfr.archiver import Archiver +from bdfr.configuration import Configuration +from bdfr.downloader import RedditDownloader logger = logging.getLogger() diff --git a/bulkredditdownloader/archive_entry/__init__.py b/bdfr/archive_entry/__init__.py similarity index 100% rename from bulkredditdownloader/archive_entry/__init__.py rename to bdfr/archive_entry/__init__.py diff --git a/bulkredditdownloader/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py similarity index 100% rename from bulkredditdownloader/archive_entry/base_archive_entry.py rename to bdfr/archive_entry/base_archive_entry.py diff --git a/bulkredditdownloader/archive_entry/comment_archive_entry.py b/bdfr/archive_entry/comment_archive_entry.py similarity index 86% rename from bulkredditdownloader/archive_entry/comment_archive_entry.py rename to bdfr/archive_entry/comment_archive_entry.py index 51a0947..1bb5c18 100644 --- a/bulkredditdownloader/archive_entry/comment_archive_entry.py +++ b/bdfr/archive_entry/comment_archive_entry.py @@ -5,7 +5,7 @@ import logging import praw.models -from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry +from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py similarity index 94% rename from bulkredditdownloader/archive_entry/submission_archive_entry.py rename to bdfr/archive_entry/submission_archive_entry.py index 90da7bc..aaa423b 100644 --- a/bulkredditdownloader/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -5,7 +5,7 @@ import logging import praw.models -from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry +from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/archiver.py b/bdfr/archiver.py similarity index 88% rename from bulkredditdownloader/archiver.py rename to bdfr/archiver.py index b44d4de..c6e4299 100644 --- a/bulkredditdownloader/archiver.py +++ b/bdfr/archiver.py @@ -10,13 +10,13 @@ import dict2xml import praw.models import yaml -from bulkredditdownloader.archive_entry.base_archive_entry import BaseArchiveEntry -from bulkredditdownloader.archive_entry.comment_archive_entry import CommentArchiveEntry -from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry -from bulkredditdownloader.configuration import Configuration -from bulkredditdownloader.downloader import RedditDownloader -from bulkredditdownloader.exceptions import ArchiverError -from bulkredditdownloader.resource import Resource +from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry +from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry +from 
bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry +from bdfr.configuration import Configuration +from bdfr.downloader import RedditDownloader +from bdfr.exceptions import ArchiverError +from bdfr.resource import Resource logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/configuration.py b/bdfr/configuration.py similarity index 100% rename from bulkredditdownloader/configuration.py rename to bdfr/configuration.py diff --git a/bulkredditdownloader/default_config.cfg b/bdfr/default_config.cfg similarity index 100% rename from bulkredditdownloader/default_config.cfg rename to bdfr/default_config.cfg diff --git a/bulkredditdownloader/download_filter.py b/bdfr/download_filter.py similarity index 100% rename from bulkredditdownloader/download_filter.py rename to bdfr/download_filter.py diff --git a/bulkredditdownloader/downloader.py b/bdfr/downloader.py similarity index 96% rename from bulkredditdownloader/downloader.py rename to bdfr/downloader.py index 434eb74..4197b04 100644 --- a/bulkredditdownloader/downloader.py +++ b/bdfr/downloader.py @@ -22,13 +22,13 @@ import praw.exceptions import praw.models import prawcore -import bulkredditdownloader.exceptions as errors -from bulkredditdownloader.configuration import Configuration -from bulkredditdownloader.download_filter import DownloadFilter -from bulkredditdownloader.file_name_formatter import FileNameFormatter -from bulkredditdownloader.oauth2 import OAuth2Authenticator, OAuth2TokenManager -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory +import bdfr.exceptions as errors +from bdfr.configuration import Configuration +from bdfr.download_filter import DownloadFilter +from bdfr.file_name_formatter import FileNameFormatter +from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.download_factory import DownloadFactory logger = logging.getLogger(__name__) @@ -59,7 +59,7 @@ class RedditTypes: class RedditDownloader: def __init__(self, args: Configuration): self.args = args - self.config_directories = appdirs.AppDirs('bulkredditdownloader', 'BDFR') + self.config_directories = appdirs.AppDirs('bdfr', 'BDFR') self.run_time = datetime.now().isoformat() self._setup_internal_objects() @@ -178,7 +178,7 @@ class RedditDownloader: logger.debug(f'Loading configuration from {path}') break if not self.config_location: - self.config_location = list(importlib.resources.path('bulkredditdownloader', 'default_config.cfg').gen)[0] + self.config_location = list(importlib.resources.path('bdfr', 'default_config.cfg').gen)[0] shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg')) if not self.config_location: raise errors.BulkDownloaderException('Could not find a configuration file to load') diff --git a/bulkredditdownloader/exceptions.py b/bdfr/exceptions.py similarity index 100% rename from bulkredditdownloader/exceptions.py rename to bdfr/exceptions.py diff --git a/bulkredditdownloader/file_name_formatter.py b/bdfr/file_name_formatter.py similarity index 98% rename from bulkredditdownloader/file_name_formatter.py rename to bdfr/file_name_formatter.py index fcd5851..2c083c1 100644 --- a/bulkredditdownloader/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -9,8 +9,8 @@ from typing import Optional from praw.models import Comment, Submission -from bulkredditdownloader.exceptions import 
BulkDownloaderException -from bulkredditdownloader.resource import Resource +from bdfr.exceptions import BulkDownloaderException +from bdfr.resource import Resource logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/oauth2.py b/bdfr/oauth2.py similarity index 97% rename from bulkredditdownloader/oauth2.py rename to bdfr/oauth2.py index b8ec2c4..505d5bd 100644 --- a/bulkredditdownloader/oauth2.py +++ b/bdfr/oauth2.py @@ -11,7 +11,7 @@ from pathlib import Path import praw import requests -from bulkredditdownloader.exceptions import BulkDownloaderException, RedditAuthenticationError +from bdfr.exceptions import BulkDownloaderException, RedditAuthenticationError logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/resource.py b/bdfr/resource.py similarity index 97% rename from bulkredditdownloader/resource.py rename to bdfr/resource.py index 6577593..6a724d1 100644 --- a/bulkredditdownloader/resource.py +++ b/bdfr/resource.py @@ -11,7 +11,7 @@ import _hashlib import requests from praw.models import Submission -from bulkredditdownloader.exceptions import BulkDownloaderException +from bdfr.exceptions import BulkDownloaderException logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_authenticator.py b/bdfr/site_authenticator.py similarity index 100% rename from bulkredditdownloader/site_authenticator.py rename to bdfr/site_authenticator.py diff --git a/bulkredditdownloader/site_downloaders/__init__.py b/bdfr/site_downloaders/__init__.py similarity index 100% rename from bulkredditdownloader/site_downloaders/__init__.py rename to bdfr/site_downloaders/__init__.py diff --git a/bulkredditdownloader/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py similarity index 83% rename from bulkredditdownloader/site_downloaders/base_downloader.py rename to bdfr/site_downloaders/base_downloader.py index 85eee0b..ac45dc3 100644 --- a/bulkredditdownloader/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -8,9 +8,9 @@ from typing import Optional import requests from praw.models import Submission -from bulkredditdownloader.exceptions import ResourceNotFound -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator +from bdfr.exceptions import ResourceNotFound +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py similarity index 63% rename from bulkredditdownloader/site_downloaders/direct.py rename to bdfr/site_downloaders/direct.py index 6ab3d22..106f251 100644 --- a/bulkredditdownloader/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -4,9 +4,9 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.resource import Resource +from bdfr.site_downloaders.base_downloader import BaseDownloader class Direct(BaseDownloader): diff --git a/bulkredditdownloader/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py similarity index 63% rename from bulkredditdownloader/site_downloaders/download_factory.py rename to 
bdfr/site_downloaders/download_factory.py index ba5c7e3..8a39413 100644 --- a/bulkredditdownloader/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -4,18 +4,18 @@ import re from typing import Type -from bulkredditdownloader.exceptions import NotADownloadableLinkError -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.site_downloaders.direct import Direct -from bulkredditdownloader.site_downloaders.erome import Erome -from bulkredditdownloader.site_downloaders.gallery import Gallery -from bulkredditdownloader.site_downloaders.gfycat import Gfycat -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork -from bulkredditdownloader.site_downloaders.imgur import Imgur -from bulkredditdownloader.site_downloaders.redgifs import Redgifs -from bulkredditdownloader.site_downloaders.self_post import SelfPost -from bulkredditdownloader.site_downloaders.vreddit import VReddit -from bulkredditdownloader.site_downloaders.youtube import Youtube +from bdfr.exceptions import NotADownloadableLinkError +from bdfr.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_downloaders.direct import Direct +from bdfr.site_downloaders.erome import Erome +from bdfr.site_downloaders.gallery import Gallery +from bdfr.site_downloaders.gfycat import Gfycat +from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.site_downloaders.imgur import Imgur +from bdfr.site_downloaders.redgifs import Redgifs +from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.vreddit import VReddit +from bdfr.site_downloaders.youtube import Youtube class DownloadFactory: diff --git a/bulkredditdownloader/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py similarity index 81% rename from bulkredditdownloader/site_downloaders/erome.py rename to bdfr/site_downloaders/erome.py index ae896e2..bd29ea4 100644 --- a/bulkredditdownloader/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -7,10 +7,10 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py similarity index 83% rename from bulkredditdownloader/site_downloaders/gallery.py rename to bdfr/site_downloaders/gallery.py index 829951c..2c59c05 100644 --- a/bulkredditdownloader/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -7,10 +7,10 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from 
bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py similarity index 81% rename from bulkredditdownloader/site_downloaders/gfycat.py rename to bdfr/site_downloaders/gfycat.py index 62cee25..f140660 100644 --- a/bulkredditdownloader/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -7,10 +7,10 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Gfycat(GifDeliveryNetwork): diff --git a/bulkredditdownloader/site_downloaders/gif_delivery_network.py b/bdfr/site_downloaders/gif_delivery_network.py similarity index 75% rename from bulkredditdownloader/site_downloaders/gif_delivery_network.py rename to bdfr/site_downloaders/gif_delivery_network.py index 31d5660..2a7f726 100644 --- a/bulkredditdownloader/site_downloaders/gif_delivery_network.py +++ b/bdfr/site_downloaders/gif_delivery_network.py @@ -5,10 +5,10 @@ from typing import Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader class GifDeliveryNetwork(BaseDownloader): diff --git a/bulkredditdownloader/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py similarity index 90% rename from bulkredditdownloader/site_downloaders/imgur.py rename to bdfr/site_downloaders/imgur.py index 943d27b..3458a45 100644 --- a/bulkredditdownloader/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -7,10 +7,10 @@ from typing import Optional import bs4 from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader class Imgur(BaseDownloader): diff --git a/bulkredditdownloader/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py similarity index 84% rename from bulkredditdownloader/site_downloaders/redgifs.py rename to bdfr/site_downloaders/redgifs.py index 536532e..2436d33 100644 --- a/bulkredditdownloader/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -7,10 +7,10 @@ from typing import 
Optional from bs4 import BeautifulSoup from praw.models import Submission -from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork class Redgifs(GifDeliveryNetwork): diff --git a/bulkredditdownloader/site_downloaders/self_post.py b/bdfr/site_downloaders/self_post.py similarity index 86% rename from bulkredditdownloader/site_downloaders/self_post.py rename to bdfr/site_downloaders/self_post.py index a141fbb..cb922ee 100644 --- a/bulkredditdownloader/site_downloaders/self_post.py +++ b/bdfr/site_downloaders/self_post.py @@ -5,9 +5,9 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py similarity index 68% rename from bulkredditdownloader/site_downloaders/vreddit.py rename to bdfr/site_downloaders/vreddit.py index 4ace7ac..bff96be 100644 --- a/bulkredditdownloader/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -5,9 +5,9 @@ from typing import Optional from praw.models import Submission -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.youtube import Youtube +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.youtube import Youtube logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py similarity index 84% rename from bulkredditdownloader/site_downloaders/youtube.py rename to bdfr/site_downloaders/youtube.py index e8cd696..7b62dc1 100644 --- a/bulkredditdownloader/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -8,10 +8,10 @@ from typing import Optional import youtube_dl from praw.models import Submission -from bulkredditdownloader.exceptions import SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_authenticator import SiteAuthenticator -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader logger = logging.getLogger(__name__) diff --git a/bulkredditdownloader/tests/__init__.py b/bdfr/tests/__init__.py similarity index 100% rename from bulkredditdownloader/tests/__init__.py rename to bdfr/tests/__init__.py diff --git a/bulkredditdownloader/tests/archive_entry/__init__.py b/bdfr/tests/archive_entry/__init__.py similarity index 100% 
rename from bulkredditdownloader/tests/archive_entry/__init__.py rename to bdfr/tests/archive_entry/__init__.py diff --git a/bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py b/bdfr/tests/archive_entry/test_comment_archive_entry.py similarity index 92% rename from bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py rename to bdfr/tests/archive_entry/test_comment_archive_entry.py index 5e9ec69..27dfcb3 100644 --- a/bulkredditdownloader/tests/archive_entry/test_comment_archive_entry.py +++ b/bdfr/tests/archive_entry/test_comment_archive_entry.py @@ -4,7 +4,7 @@ import praw import pytest -from bulkredditdownloader.archive_entry.comment_archive_entry import CommentArchiveEntry +from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry @pytest.mark.online diff --git a/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py b/bdfr/tests/archive_entry/test_submission_archive_entry.py similarity index 92% rename from bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py rename to bdfr/tests/archive_entry/test_submission_archive_entry.py index 6c72702..2b1bb72 100644 --- a/bulkredditdownloader/tests/archive_entry/test_submission_archive_entry.py +++ b/bdfr/tests/archive_entry/test_submission_archive_entry.py @@ -4,7 +4,7 @@ import praw import pytest -from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry +from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry @pytest.mark.online diff --git a/bulkredditdownloader/tests/conftest.py b/bdfr/tests/conftest.py similarity index 95% rename from bulkredditdownloader/tests/conftest.py rename to bdfr/tests/conftest.py index 4197989..ce4b681 100644 --- a/bulkredditdownloader/tests/conftest.py +++ b/bdfr/tests/conftest.py @@ -8,7 +8,7 @@ from pathlib import Path import praw import pytest -from bulkredditdownloader.oauth2 import OAuth2TokenManager +from bdfr.oauth2 import OAuth2TokenManager @pytest.fixture(scope='session') diff --git a/bulkredditdownloader/tests/site_downloaders/__init__.py b/bdfr/tests/site_downloaders/__init__.py similarity index 100% rename from bulkredditdownloader/tests/site_downloaders/__init__.py rename to bdfr/tests/site_downloaders/__init__.py diff --git a/bulkredditdownloader/tests/site_downloaders/test_direct.py b/bdfr/tests/site_downloaders/test_direct.py similarity index 86% rename from bulkredditdownloader/tests/site_downloaders/test_direct.py rename to bdfr/tests/site_downloaders/test_direct.py index 3e891b9..790f4c3 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_direct.py +++ b/bdfr/tests/site_downloaders/test_direct.py @@ -5,8 +5,8 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.direct import Direct +from bdfr.resource import Resource +from bdfr.site_downloaders.direct import Direct @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py b/bdfr/tests/site_downloaders/test_download_factory.py similarity index 68% rename from bulkredditdownloader/tests/site_downloaders/test_download_factory.py rename to bdfr/tests/site_downloaders/test_download_factory.py index 830eeeb..5d6260e 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_download_factory.py +++ b/bdfr/tests/site_downloaders/test_download_factory.py @@ -4,19 +4,19 @@ import praw import pytest -from bulkredditdownloader.exceptions import 
NotADownloadableLinkError -from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader -from bulkredditdownloader.site_downloaders.direct import Direct -from bulkredditdownloader.site_downloaders.download_factory import DownloadFactory -from bulkredditdownloader.site_downloaders.erome import Erome -from bulkredditdownloader.site_downloaders.gallery import Gallery -from bulkredditdownloader.site_downloaders.gfycat import Gfycat -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork -from bulkredditdownloader.site_downloaders.imgur import Imgur -from bulkredditdownloader.site_downloaders.redgifs import Redgifs -from bulkredditdownloader.site_downloaders.self_post import SelfPost -from bulkredditdownloader.site_downloaders.vreddit import VReddit -from bulkredditdownloader.site_downloaders.youtube import Youtube +from bdfr.exceptions import NotADownloadableLinkError +from bdfr.site_downloaders.base_downloader import BaseDownloader +from bdfr.site_downloaders.direct import Direct +from bdfr.site_downloaders.download_factory import DownloadFactory +from bdfr.site_downloaders.erome import Erome +from bdfr.site_downloaders.gallery import Gallery +from bdfr.site_downloaders.gfycat import Gfycat +from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.site_downloaders.imgur import Imgur +from bdfr.site_downloaders.redgifs import Redgifs +from bdfr.site_downloaders.self_post import SelfPost +from bdfr.site_downloaders.vreddit import VReddit +from bdfr.site_downloaders.youtube import Youtube @pytest.mark.parametrize(('test_submission_url', 'expected_class'), ( diff --git a/bulkredditdownloader/tests/site_downloaders/test_erome.py b/bdfr/tests/site_downloaders/test_erome.py similarity index 97% rename from bulkredditdownloader/tests/site_downloaders/test_erome.py rename to bdfr/tests/site_downloaders/test_erome.py index 8a052a0..1de9afd 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_erome.py +++ b/bdfr/tests/site_downloaders/test_erome.py @@ -5,7 +5,7 @@ from unittest.mock import MagicMock import pytest -from bulkredditdownloader.site_downloaders.erome import Erome +from bdfr.site_downloaders.erome import Erome @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_gallery.py b/bdfr/tests/site_downloaders/test_gallery.py similarity index 97% rename from bulkredditdownloader/tests/site_downloaders/test_gallery.py rename to bdfr/tests/site_downloaders/test_gallery.py index f61e04f..e903e04 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gallery.py +++ b/bdfr/tests/site_downloaders/test_gallery.py @@ -4,7 +4,7 @@ import praw import pytest -from bulkredditdownloader.site_downloaders.gallery import Gallery +from bdfr.site_downloaders.gallery import Gallery @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_gfycat.py b/bdfr/tests/site_downloaders/test_gfycat.py similarity index 91% rename from bulkredditdownloader/tests/site_downloaders/test_gfycat.py rename to bdfr/tests/site_downloaders/test_gfycat.py index 8290914..78c37a3 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gfycat.py +++ b/bdfr/tests/site_downloaders/test_gfycat.py @@ -5,8 +5,8 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.gfycat import Gfycat +from bdfr.resource import Resource +from bdfr.site_downloaders.gfycat import Gfycat @pytest.mark.online diff --git 
a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py b/bdfr/tests/site_downloaders/test_gif_delivery_network.py similarity index 89% rename from bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py rename to bdfr/tests/site_downloaders/test_gif_delivery_network.py index 9856536..38819c1 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_gif_delivery_network.py +++ b/bdfr/tests/site_downloaders/test_gif_delivery_network.py @@ -5,8 +5,8 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork +from bdfr.resource import Resource +from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_imgur.py b/bdfr/tests/site_downloaders/test_imgur.py similarity index 95% rename from bulkredditdownloader/tests/site_downloaders/test_imgur.py rename to bdfr/tests/site_downloaders/test_imgur.py index 2f22c2b..ee98c42 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_imgur.py +++ b/bdfr/tests/site_downloaders/test_imgur.py @@ -5,9 +5,9 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.exceptions import SiteDownloaderError -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.imgur import Imgur +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_downloaders.imgur import Imgur @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_redgifs.py b/bdfr/tests/site_downloaders/test_redgifs.py similarity index 91% rename from bulkredditdownloader/tests/site_downloaders/test_redgifs.py rename to bdfr/tests/site_downloaders/test_redgifs.py index 0d8cb79..a325025 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_redgifs.py +++ b/bdfr/tests/site_downloaders/test_redgifs.py @@ -5,8 +5,8 @@ from unittest.mock import Mock import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.redgifs import Redgifs +from bdfr.resource import Resource +from bdfr.site_downloaders.redgifs import Redgifs @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_self_post.py b/bdfr/tests/site_downloaders/test_self_post.py similarity index 85% rename from bulkredditdownloader/tests/site_downloaders/test_self_post.py rename to bdfr/tests/site_downloaders/test_self_post.py index 315388f..e3363bb 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_self_post.py +++ b/bdfr/tests/site_downloaders/test_self_post.py @@ -4,8 +4,8 @@ import praw import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.self_post import SelfPost +from bdfr.resource import Resource +from bdfr.site_downloaders.self_post import SelfPost @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_vreddit.py b/bdfr/tests/site_downloaders/test_vreddit.py similarity index 84% rename from bulkredditdownloader/tests/site_downloaders/test_vreddit.py rename to bdfr/tests/site_downloaders/test_vreddit.py index d7377b6..88b4a02 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_vreddit.py +++ b/bdfr/tests/site_downloaders/test_vreddit.py @@ -4,8 +4,8 @@ import praw import pytest -from bulkredditdownloader.resource import Resource -from 
bulkredditdownloader.site_downloaders.vreddit import VReddit +from bdfr.resource import Resource +from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online diff --git a/bulkredditdownloader/tests/site_downloaders/test_youtube.py b/bdfr/tests/site_downloaders/test_youtube.py similarity index 86% rename from bulkredditdownloader/tests/site_downloaders/test_youtube.py rename to bdfr/tests/site_downloaders/test_youtube.py index 8c7bfca..0b46f49 100644 --- a/bulkredditdownloader/tests/site_downloaders/test_youtube.py +++ b/bdfr/tests/site_downloaders/test_youtube.py @@ -5,8 +5,8 @@ from unittest.mock import MagicMock import pytest -from bulkredditdownloader.resource import Resource -from bulkredditdownloader.site_downloaders.youtube import Youtube +from bdfr.resource import Resource +from bdfr.site_downloaders.youtube import Youtube @pytest.mark.online diff --git a/bulkredditdownloader/tests/test_archiver.py b/bdfr/tests/test_archiver.py similarity index 93% rename from bulkredditdownloader/tests/test_archiver.py rename to bdfr/tests/test_archiver.py index b0a84c6..622c555 100644 --- a/bulkredditdownloader/tests/test_archiver.py +++ b/bdfr/tests/test_archiver.py @@ -7,8 +7,8 @@ from unittest.mock import MagicMock import praw import pytest -from bulkredditdownloader.archive_entry.submission_archive_entry import SubmissionArchiveEntry -from bulkredditdownloader.archiver import Archiver +from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry +from bdfr.archiver import Archiver @pytest.mark.online diff --git a/bulkredditdownloader/tests/test_configuration.py b/bdfr/tests/test_configuration.py similarity index 90% rename from bulkredditdownloader/tests/test_configuration.py rename to bdfr/tests/test_configuration.py index 94697a3..8ad1663 100644 --- a/bulkredditdownloader/tests/test_configuration.py +++ b/bdfr/tests/test_configuration.py @@ -5,7 +5,7 @@ from unittest.mock import MagicMock import pytest -from bulkredditdownloader.configuration import Configuration +from bdfr.configuration import Configuration @pytest.mark.parametrize('arg_dict', ( diff --git a/bulkredditdownloader/tests/test_download_filter.py b/bdfr/tests/test_download_filter.py similarity index 96% rename from bulkredditdownloader/tests/test_download_filter.py rename to bdfr/tests/test_download_filter.py index 04ea169..3c2adba 100644 --- a/bulkredditdownloader/tests/test_download_filter.py +++ b/bdfr/tests/test_download_filter.py @@ -3,7 +3,7 @@ import pytest -from bulkredditdownloader.download_filter import DownloadFilter +from bdfr.download_filter import DownloadFilter @pytest.fixture() diff --git a/bulkredditdownloader/tests/test_downloader.py b/bdfr/tests/test_downloader.py similarity index 97% rename from bulkredditdownloader/tests/test_downloader.py rename to bdfr/tests/test_downloader.py index 11cc759..9a4f051 100644 --- a/bulkredditdownloader/tests/test_downloader.py +++ b/bdfr/tests/test_downloader.py @@ -10,13 +10,13 @@ import praw import praw.models import pytest -from bulkredditdownloader.__main__ import setup_logging -from bulkredditdownloader.configuration import Configuration -from bulkredditdownloader.download_filter import DownloadFilter -from bulkredditdownloader.downloader import RedditDownloader, RedditTypes -from bulkredditdownloader.exceptions import BulkDownloaderException -from bulkredditdownloader.file_name_formatter import FileNameFormatter -from bulkredditdownloader.site_authenticator import SiteAuthenticator +from bdfr.__main__ import setup_logging +from 
bdfr.configuration import Configuration +from bdfr.download_filter import DownloadFilter +from bdfr.downloader import RedditDownloader, RedditTypes +from bdfr.exceptions import BulkDownloaderException +from bdfr.file_name_formatter import FileNameFormatter +from bdfr.site_authenticator import SiteAuthenticator @pytest.fixture() diff --git a/bulkredditdownloader/tests/test_file_name_formatter.py b/bdfr/tests/test_file_name_formatter.py similarity index 98% rename from bulkredditdownloader/tests/test_file_name_formatter.py rename to bdfr/tests/test_file_name_formatter.py index a3506e2..ab16abe 100644 --- a/bulkredditdownloader/tests/test_file_name_formatter.py +++ b/bdfr/tests/test_file_name_formatter.py @@ -8,8 +8,8 @@ from unittest.mock import MagicMock import praw.models import pytest -from bulkredditdownloader.file_name_formatter import FileNameFormatter -from bulkredditdownloader.resource import Resource +from bdfr.file_name_formatter import FileNameFormatter +from bdfr.resource import Resource @pytest.fixture() diff --git a/bulkredditdownloader/tests/test_integration.py b/bdfr/tests/test_integration.py similarity index 99% rename from bulkredditdownloader/tests/test_integration.py rename to bdfr/tests/test_integration.py index 23cdbd4..9623b24 100644 --- a/bulkredditdownloader/tests/test_integration.py +++ b/bdfr/tests/test_integration.py @@ -7,7 +7,7 @@ from pathlib import Path import pytest from click.testing import CliRunner -from bulkredditdownloader.__main__ import cli +from bdfr.__main__ import cli @pytest.mark.online diff --git a/bulkredditdownloader/tests/test_oauth2.py b/bdfr/tests/test_oauth2.py similarity index 93% rename from bulkredditdownloader/tests/test_oauth2.py rename to bdfr/tests/test_oauth2.py index e8d71a0..6c25d35 100644 --- a/bulkredditdownloader/tests/test_oauth2.py +++ b/bdfr/tests/test_oauth2.py @@ -7,8 +7,8 @@ from unittest.mock import MagicMock import pytest -from bulkredditdownloader.exceptions import BulkDownloaderException -from bulkredditdownloader.oauth2 import OAuth2Authenticator, OAuth2TokenManager +from bdfr.exceptions import BulkDownloaderException +from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager @pytest.fixture() diff --git a/bulkredditdownloader/tests/test_resource.py b/bdfr/tests/test_resource.py similarity index 95% rename from bulkredditdownloader/tests/test_resource.py rename to bdfr/tests/test_resource.py index 0292f06..0a8e145 100644 --- a/bulkredditdownloader/tests/test_resource.py +++ b/bdfr/tests/test_resource.py @@ -4,7 +4,7 @@ import pytest from unittest.mock import MagicMock -from bulkredditdownloader.resource import Resource +from bdfr.resource import Resource @pytest.mark.parametrize(('test_url', 'expected'), ( diff --git a/setup.cfg b/setup.cfg index 4c907fd..76bcca2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,4 +5,4 @@ author-email = parlakciali@gmail.com python_requires = >=3.9 [files] -packages = bulkredditdownloader +packages = bdfr diff --git a/setup.py b/setup.py index b571f29..40c6185 100644 --- a/setup.py +++ b/setup.py @@ -3,4 +3,4 @@ from setuptools import setup -setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bulkredditdownloader/default_config.cfg'])]) +setup(setup_requires=['pbr', 'appdirs'], pbr=True, data_files=[('config', ['bdfr/default_config.cfg'])]) From e35dd9e5d0de009a96af70331b6c749c9b0034bf Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 18:07:40 +1000 Subject: [PATCH 246/276] Fix bug in file name formatter --- bdfr/file_name_formatter.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 2c083c1..d6bdace 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -43,7 +43,7 @@ class FileNameFormatter: result = format_string for key in attributes.keys(): if re.search(fr'(?i).*{{{key}}}.*', result): - key_value = attributes.get(key, 'unknown') + key_value = str(attributes.get(key, 'unknown')) key_value = bytes(key_value, 'utf-8').decode('unicode-escape') result = re.sub(fr'(?i){{{key}}}', key_value, result,) logger.log(9, f'Found key string {key} in name') From 308853d531d19f6bc80afbf66eb496e2b5679d6a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 18:21:04 +1000 Subject: [PATCH 247/276] Use standards in HTTP errors --- bdfr/resource.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 6a724d1..aaa6944 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -33,22 +33,11 @@ class Resource: response = requests.get(url) if response.status_code == 200: return response.content - elif response.status_code in ( - 301, - 400, - 401, - 403, - 404, - 407, - 410, - 500, - 501, - 502, - ): + elif response.status_code in (408, 429): + raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') + else: raise BulkDownloaderException( f'Unrecoverable error requesting resource: HTTP Code {response.status_code}') - else: - raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') except requests.exceptions.ConnectionError as e: logger.warning(f'Error occured downloading from {url}, waiting {wait_time} seconds: {e}') time.sleep(wait_time) From 7d71f8ffab7d9e1e3b860aefe8239919b97861b9 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 18:59:43 +1000 Subject: [PATCH 248/276] Add regex for all 2xx HTTP codes --- bdfr/resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index aaa6944..ddad278 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -31,7 +31,7 @@ class Resource: wait_time = 60 try: response = requests.get(url) - if response.status_code == 200: + if re.match(r'^2\d{2}', str(response.status_code)) and response.content: return response.content elif response.status_code in (408, 429): raise requests.exceptions.ConnectionError(f'Response code {response.status_code}') From 77bdbbac634cb6126de9f4bac839f9457aa7d81d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 Apr 2021 19:00:12 +1000 Subject: [PATCH 249/276] Update test hash --- bdfr/tests/site_downloaders/test_youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/tests/site_downloaders/test_youtube.py b/bdfr/tests/site_downloaders/test_youtube.py index 0b46f49..95bf1ea 100644 --- a/bdfr/tests/site_downloaders/test_youtube.py +++ b/bdfr/tests/site_downloaders/test_youtube.py @@ -12,7 +12,7 @@ from bdfr.site_downloaders.youtube import Youtube @pytest.mark.online @pytest.mark.slow @pytest.mark.parametrize(('test_url', 'expected_hash'), ( - ('https://www.youtube.com/watch?v=uSm2VDgRIUs', '3c79a62898028987f94161e0abccbddf'), + ('https://www.youtube.com/watch?v=uSm2VDgRIUs', 'f70b704b4b78b9bb5cd032bfc26e4971'), ('https://www.youtube.com/watch?v=m-tKnjFwleU', '30314930d853afff8ebc7d8c36a5b833'), )) def test_find_resources(test_url: str, expected_hash: str): From 5758aad48b838c4f0798ae2ba629a5c7133062bf Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 12 
Apr 2021 20:41:40 +1000 Subject: [PATCH 250/276] Fix formatting --- bdfr/tests/test_file_name_formatter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bdfr/tests/test_file_name_formatter.py b/bdfr/tests/test_file_name_formatter.py index ab16abe..96252e4 100644 --- a/bdfr/tests/test_file_name_formatter.py +++ b/bdfr/tests/test_file_name_formatter.py @@ -134,7 +134,8 @@ def test_format_full_with_index_suffix( format_string_file: str, index: Optional[int], expected: str, - reddit_submission: praw.models.Submission): + reddit_submission: praw.models.Submission, +): test_resource = Resource(reddit_submission, 'i.reddit.com/blabla.png') test_formatter = FileNameFormatter(format_string_file, format_string_directory) result = test_formatter.format_path(test_resource, Path('test'), index) From 62dedb6c95d2cae82ba3a2b55dd751729f0dc191 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Mon, 12 Apr 2021 21:11:55 +1000 Subject: [PATCH 251/276] Fix bug with emojis in the filename (#263) --- bdfr/file_name_formatter.py | 12 +++++++++++- bdfr/tests/test_file_name_formatter.py | 25 ++++++++++++++++++------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index d6bdace..9e7b335 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -44,7 +44,7 @@ class FileNameFormatter: for key in attributes.keys(): if re.search(fr'(?i).*{{{key}}}.*', result): key_value = str(attributes.get(key, 'unknown')) - key_value = bytes(key_value, 'utf-8').decode('unicode-escape') + key_value = FileNameFormatter._convert_unicode_escapes(key_value) result = re.sub(fr'(?i){{{key}}}', key_value, result,) logger.log(9, f'Found key string {key} in name') @@ -55,6 +55,16 @@ class FileNameFormatter: return result + @staticmethod + def _convert_unicode_escapes(in_string: str) -> str: + pattern = re.compile(r'(\\u\d{4})') + matches = re.search(pattern, in_string) + if matches: + for match in matches.groups(): + converted_match = bytes(match, 'utf-8').decode('unicode-escape') + in_string = in_string.replace(match, converted_match) + return in_string + @staticmethod def _generate_name_dict_from_submission(submission: Submission) -> dict: submission_attributes = { diff --git a/bdfr/tests/test_file_name_formatter.py b/bdfr/tests/test_file_name_formatter.py index 96252e4..fe8372a 100644 --- a/bdfr/tests/test_file_name_formatter.py +++ b/bdfr/tests/test_file_name_formatter.py @@ -297,12 +297,23 @@ def test_multilevel_folder_scheme( assert len(result.parents) == (len(expected.split('/')) + 1) -@pytest.mark.online -@pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'test_file_scheme', 'expected'), ( - ('mecwk7', '{TITLE}', 'My cat’s paws are so cute'), # Unicode escape in title +@pytest.mark.parametrize(('test_name_string', 'expected'), ( + ('test', 'test'), + ('😍', '😍'), + ('test😍', 'test😍'), + ('test😍 ’', 'test😍 ’'), + ('test😍 \\u2019', 'test😍 ’'), )) -def test_edge_case_names(test_submission_id: str, test_file_scheme: str, expected: str, reddit_instance: praw.Reddit): - test_submission = reddit_instance.submission(id=test_submission_id) - result = FileNameFormatter._format_name(test_submission, test_file_scheme) +def test_preserve_emojis(test_name_string: str, expected: str, submission: MagicMock): + submission.title = test_name_string + result = FileNameFormatter._format_name(submission, '{TITLE}') + assert result == expected + + +@pytest.mark.parametrize(('test_string', 
'expected'), ( + ('test \\u2019', 'test ’'), + ('My cat\\u2019s paws are so cute', 'My cat’s paws are so cute'), +)) +def test_convert_unicode_escapes(test_string: str, expected: str): + result = FileNameFormatter._convert_unicode_escapes(test_string) assert result == expected From 4b195f2b53c6f73f79b10915ae25046d4cebaf1c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 13 Apr 2021 11:51:17 +1000 Subject: [PATCH 252/276] Remove unneeded logger entry --- bdfr/file_name_formatter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 9e7b335..9573f2c 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -46,7 +46,6 @@ class FileNameFormatter: key_value = str(attributes.get(key, 'unknown')) key_value = FileNameFormatter._convert_unicode_escapes(key_value) result = re.sub(fr'(?i){{{key}}}', key_value, result,) - logger.log(9, f'Found key string {key} in name') result = result.replace('/', '') From e672e28a12e3d7549b62efa6d44e25e641912411 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 13 Apr 2021 13:17:40 +1000 Subject: [PATCH 253/276] Fix typing on function --- bdfr/resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index ddad278..12c9cc4 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -63,7 +63,7 @@ class Resource: def create_hash(self): self.hash = hashlib.md5(self.content) - def _determine_extension(self) -> str: + def _determine_extension(self) -> Optional[str]: extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?$') match = re.search(extension_pattern, self.url) if match: From ab7a0f6a51f06530bf77dd8f592273136be76653 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 13 Apr 2021 13:22:13 +1000 Subject: [PATCH 254/276] Catch errors when resources have no extension This is related to #266 and will prevent the BDFR from completely crashing when a file extension is unknown --- bdfr/file_name_formatter.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 9573f2c..17713ed 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -132,11 +132,19 @@ class FileNameFormatter: ) -> list[tuple[Path, Resource]]: out = [] if len(resources) == 1: - out.append((self.format_path(resources[0], destination_directory, None), resources[0])) + try: + out.append((self.format_path(resources[0], destination_directory, None), resources[0])) + except BulkDownloaderException as e: + logger.error(f'Could not generate file path for resource {resources[0].url}: {e}') + logger.exception('Could not generate file path') else: for i, res in enumerate(resources, start=1): logger.log(9, f'Formatting filename with index {i}') - out.append((self.format_path(res, destination_directory, i), res)) + try: + out.append((self.format_path(res, destination_directory, i), res)) + except BulkDownloaderException as e: + logger.error(f'Could not generate file path for resource {res.url}: {e}') + logger.exception('Could not generate file path') return out @staticmethod From 59ab5d87778988eec060d478229ab7a3d9b358f6 Mon Sep 17 00:00:00 2001 From: Nathan Spicer-Davis Date: Mon, 12 Apr 2021 23:51:40 -0400 Subject: [PATCH 255/276] Update extension regex to match URI fragments (#264) --- bdfr/resource.py | 2 +- bdfr/tests/test_resource.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bdfr/resource.py b/bdfr/resource.py index 12c9cc4..be6aaaf 
100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -64,7 +64,7 @@ class Resource: self.hash = hashlib.md5(self.content) def _determine_extension(self) -> Optional[str]: - extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?$') + extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?(?:#.*)?$') match = re.search(extension_pattern, self.url) if match: return match.group(1) diff --git a/bdfr/tests/test_resource.py b/bdfr/tests/test_resource.py index 0a8e145..de6030b 100644 --- a/bdfr/tests/test_resource.py +++ b/bdfr/tests/test_resource.py @@ -15,6 +15,8 @@ from bdfr.resource import Resource ('https://www.resource.com/test/example.jpg', '.jpg'), ('hard.png.mp4', '.mp4'), ('https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png&auto=webp&s=19de214e634cbcad99', '.png'), + ('test.jpg#test','.jpg'), + ('test.jpg?width=247#test','.jpg'), )) def test_resource_get_extension(test_url: str, expected: str): test_resource = Resource(MagicMock(), test_url) From 48dca9e5eedc92269f4bf0cddf8401909b2b7680 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 13 Apr 2021 16:40:22 +1000 Subject: [PATCH 256/276] Fix mistaken backreference in some titles This should resolve #267 --- bdfr/file_name_formatter.py | 3 ++- bdfr/tests/test_file_name_formatter.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 17713ed..c4bf4b5 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -45,7 +45,8 @@ class FileNameFormatter: if re.search(fr'(?i).*{{{key}}}.*', result): key_value = str(attributes.get(key, 'unknown')) key_value = FileNameFormatter._convert_unicode_escapes(key_value) - result = re.sub(fr'(?i){{{key}}}', key_value, result,) + key_value = key_value.replace('\\', '\\\\') + result = re.sub(fr'(?i){{{key}}}', key_value, result) result = result.replace('/', '') diff --git a/bdfr/tests/test_file_name_formatter.py b/bdfr/tests/test_file_name_formatter.py index fe8372a..7a91d8c 100644 --- a/bdfr/tests/test_file_name_formatter.py +++ b/bdfr/tests/test_file_name_formatter.py @@ -303,6 +303,7 @@ def test_multilevel_folder_scheme( ('test😍', 'test😍'), ('test😍 ’', 'test😍 ’'), ('test😍 \\u2019', 'test😍 ’'), + ('Using that real good [1\\4]', 'Using that real good [1\\4]'), )) def test_preserve_emojis(test_name_string: str, expected: str, submission: MagicMock): submission.title = test_name_string From 9e3e9fa3ef1dbba730164a25cbd5301a251c0335 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 14 Apr 2021 10:06:43 +1000 Subject: [PATCH 257/276] Update CONTRIBUTING --- docs/CONTRIBUTING.md | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 8cec165..54fc234 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -2,20 +2,28 @@ When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised. -If you have a question, **please don't open an issue on GitHub**. There is a subreddit specifically for the BDFR where questions can be asked. If you believe that something is a bug, or that a feature should be added, then by all means open an issue. +If you have a question, **please don't open an issue on GitHub**. There is a discussion tab on the repository's GitHub where you can interact with the developers and ask questions. 
If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
 
 All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.
 
+## Opening an Issue
+
+When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
+
+If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be requested in the thread of the bug.
+
+In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
+
 ## Pull Requests
 
-Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) below before actually writing any code.
+Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually writing any code.
 
 Once you have done both of these, the below list shows the path that should be followed when writing a PR.
 
   1. If an issue does not already exist, open one that will relate to the PR.
   2. Ensure that any changes fit into the architecture specified above.
   3. Ensure that you have written tests that cover the new code.
-  4. Ensure that no existing tests fail, unless there is a good reason for them to do so. If there is, note why in the PR.
+  4. Ensure that no existing tests fail, unless there is a good reason for them to do so. If there is, note which tests fail and why this is expected and okay in the PR.
   5. If needed, update any documentation with changes.
   6. Open a pull request that references the relevant issue.
   7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
@@ -44,12 +52,15 @@ Note that the last bracket is on its own line, and that the first bracket has a
 
 ### Running Tests
 
-There are a lot of tests in the BDFR. When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, while writing the development, it can be helpful (and much quicker) to run only a subset of the tests.
+There are a lot of tests in the BDFR. In fact, there are more tests than lines of functional code. This is one of the strengths of the BDFR in that it is fully tested. The codebase uses the package pytest to create the tests, which is a third-party package that provides many functions and objects useful for testing Python code.
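A couple of handy invocations follow from the testing setup described above (an illustrative sketch, not text from the patch itself; the module path is taken from the test files touched elsewhere in this series):

```bash
# Quick inner-loop run: skip anything that needs the network or lots of time.
pytest -m 'not online and not slow' --verbose

# Focus on a single test module while working on one area of the code.
pytest bdfr/tests/test_resource.py --verbose
```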
+
+When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, while writing the request, it can be helpful (and much quicker) to run only a subset of the tests.
 
 This is accomplished with marks, a system that pytest uses to categorise tests. These are the marks currently in use in the BDFR test suite:
 
 - `slow`
   - This marks a test that may take a long time to complete
+  - Usually marks a test that downloads many submissions or downloads a particularly large resource
 - `online`
   - This marks a test that requires an internet connection and uses online resources
 - `reddit`
@@ -67,9 +78,12 @@ To exclude one or more marks, the following command can be used, substituting th
 
 ```bash
 pytest -m "not online"
+pytest -m "not reddit and not authenticated"
 ```
 
-Many IDEs also provide integrated methods to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
+For more details, review the pytest documentation that is freely available online.
+
+Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
 
 ### Writing Tests

From 52e5120110b76d69f5f54a3ae23a9a106f5036de Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Wed, 14 Apr 2021 16:27:30 +1000
Subject: [PATCH 258/276] Update README

---
 README.md | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 519bad1..6844d61 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,17 @@
-# Bulk Downloader for Reddit v2-beta
+# Bulk Downloader for Reddit v2 [BETA]
 
 This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface.
 
 Some quick reference commands are:
 
-  - `python3 -m bdfr download --subreddit Python -L 10`
-  - `python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'`
-  - `python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links`
-  - `python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme ''`
+```bash
+python3 -m bdfr download --subreddit Python -L 10
+python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'
+python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links
+python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme ''
+```
+
+If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate.
## Usage @@ -26,7 +30,7 @@ Many websites and links are supported for the downloader: - Reddit Text Posts - Reddit Videos - Redgifs - - Youtube + - YouTube ### Options @@ -36,7 +40,7 @@ The following options are common between both the `archive` and `download` comma - This is the directory to which the BDFR will download and place all files - `--authenticate` - This flag will make the BDFR attempt to use an authenticated Reddit session - - See [Authentication](#authentication) for more details + - See [Authentication](#authentication-and-security) for more details - `--config` - If the path to a configuration file is supplied with this option, the BDFR will use the specified config - See [Configuration Files](#configuration) for more details @@ -149,7 +153,7 @@ The following options are for the `archive` command specifically. - `xml` - `yaml` -## Authentication and Secuirity +## Authentication and Security The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits. From c85ae3fc69724e7ae1211cab7c9d1a313eb8ad5d Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 14 Apr 2021 16:32:15 +1000 Subject: [PATCH 259/276] Escape characters --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6844d61..ec6ad46 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Bulk Downloader for Reddit v2 [BETA] +# Bulk Downloader for Reddit v2 \[BETA\] This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. From d8752b15fab02249229a0135cb0e8cf5475f1d77 Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sat, 17 Apr 2021 19:56:43 +1000 Subject: [PATCH 260/276] Add option to skip specified subreddits (#268) * Rename variables * Add option to skip specific subreddits * Update README --- README.md | 4 ++++ bdfr/__main__.py | 1 + bdfr/configuration.py | 1 + bdfr/downloader.py | 17 +++++++++++------ bdfr/tests/test_integration.py | 16 ++++++++++++++++ 5 files changed, 33 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ec6ad46..d83da1a 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,10 @@ The following options apply only to the `download` command. This command downloa - `--skip` - This adds file types to the download filter i.e. 
submissions with one of the supplied file extensions will not be downloaded - Can be specified multiple times +- `--skip-subreddit` + - This skips all submissions from the specified subreddit + - Can be specified multiple times + - Also accepts CSV subreddit names #### Archiver Options diff --git a/bdfr/__main__.py b/bdfr/__main__.py index c58f7ca..26759a1 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -53,6 +53,7 @@ def cli(): @click.option('--search-existing', is_flag=True, default=None) @click.option('--skip', default=None, multiple=True) @click.option('--skip-domain', default=None, multiple=True) +@click.option('--skip-subreddit', default=None, multiple=True) @_add_common_options @click.pass_context def cli_download(context: click.Context, **_): diff --git a/bdfr/configuration.py b/bdfr/configuration.py index e6d0af7..1d9610c 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -27,6 +27,7 @@ class Configuration(Namespace): self.folder_scheme: str = '{SUBREDDIT}' self.skip: list[str] = [] self.skip_domain: list[str] = [] + self.skip_subreddit: list[str] = [] self.sort: str = 'hot' self.submitted: bool = False self.subreddit: list[str] = [] diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 4197b04..4897831 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -93,6 +93,9 @@ class RedditDownloader: self.authenticator = self._create_authenticator() logger.log(9, 'Created site authenticator') + self.args.skip_subreddit = self._split_args_input(self.args.skip_subreddit) + self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit]) + def _read_config(self): """Read any cfg values that need to be processed""" if self.args.max_wait_time is None: @@ -210,13 +213,13 @@ class RedditDownloader: return match.group(1) @staticmethod - def _split_args_input(subreddit_entries: list[str]) -> set[str]: - all_subreddits = [] + def _split_args_input(entries: list[str]) -> set[str]: + all_entries = [] split_pattern = re.compile(r'[,;]\s?') - for entry in subreddit_entries: + for entry in entries: results = re.split(split_pattern, entry) - all_subreddits.extend([RedditDownloader._sanitise_subreddit_name(name) for name in results]) - return set(all_subreddits) + all_entries.extend([RedditDownloader._sanitise_subreddit_name(name) for name in results]) + return set(all_entries) def _get_subreddits(self) -> list[praw.models.ListingGenerator]: if self.args.subreddit: @@ -354,8 +357,10 @@ class RedditDownloader: for generator in self.reddit_lists: for submission in generator: if submission.id in self.excluded_submission_ids: - logger.debug(f'Submission {submission.id} in exclusion list, skipping') + logger.debug(f'Object {submission.id} in exclusion list, skipping') continue + elif submission.subreddit.display_name.lower() in self.args.skip_subreddit: + logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list') else: logger.debug(f'Attempting to download submission {submission.id}') self._download_submission(submission) diff --git a/bdfr/tests/test_integration.py b/bdfr/tests/test_integration.py index 9623b24..5ab0871 100644 --- a/bdfr/tests/test_integration.py +++ b/bdfr/tests/test_integration.py @@ -284,6 +284,22 @@ def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): assert 'Downloaded submission ' not in result.output +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') 
+@pytest.mark.parametrize('test_args', ( + ['-l', 'm2601g', '--skip-subreddit', 'trollxchromosomes'], + ['-s', 'trollxchromosomes', '--skip-subreddit', 'trollxchromosomes', '-L', '3'], +)) +def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = ['download', str(tmp_path), '-v', '--config', 'test_config.cfg'] + test_args + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert 'in skip list' in result.output + assert 'Downloaded submission ' not in result.output + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') From 8eb374eec6e1fb902e4369494587ad63a6bc11af Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 17 Apr 2021 13:31:27 +0300 Subject: [PATCH 261/276] test_vreddit: fix incorrect file hash --- bdfr/tests/site_downloaders/test_vreddit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/tests/site_downloaders/test_vreddit.py b/bdfr/tests/site_downloaders/test_vreddit.py index 88b4a02..bae34a3 100644 --- a/bdfr/tests/site_downloaders/test_vreddit.py +++ b/bdfr/tests/site_downloaders/test_vreddit.py @@ -11,7 +11,7 @@ from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( - ('lu8l8g', '93a15642d2f364ae39f00c6d1be354ff'), + ('lu8l8g', 'c5f8c0ba2ff6e37a14e267a787696cc6'), )) def test_find_resources(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) From 5e81160e5f02a4d791cbd2dd71dd83aecd0a0eec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sat, 17 Apr 2021 14:07:20 +0300 Subject: [PATCH 262/276] test_vreddit: remove flaky test (#272) --- bdfr/tests/site_downloaders/test_vreddit.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bdfr/tests/site_downloaders/test_vreddit.py b/bdfr/tests/site_downloaders/test_vreddit.py index bae34a3..3b663c2 100644 --- a/bdfr/tests/site_downloaders/test_vreddit.py +++ b/bdfr/tests/site_downloaders/test_vreddit.py @@ -10,14 +10,14 @@ from bdfr.site_downloaders.vreddit import VReddit @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(('test_submission_id', 'expected_hash'), ( - ('lu8l8g', 'c5f8c0ba2ff6e37a14e267a787696cc6'), +@pytest.mark.parametrize(('test_submission_id'), ( + ('lu8l8g'), )) -def test_find_resources(test_submission_id: str, expected_hash: str, reddit_instance: praw.Reddit): +def test_find_resources(test_submission_id: str, reddit_instance: praw.Reddit): test_submission = reddit_instance.submission(id=test_submission_id) downloader = VReddit(test_submission) resources = downloader.find_resources() assert len(resources) == 1 assert isinstance(resources[0], Resource) resources[0].download(120) - assert resources[0].hash.hexdigest() == expected_hash + assert resources[0].content is not None From f483f24e15caa2be9766b0a8ba696c8aa587f006 Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Sat, 17 Apr 2021 14:42:24 +0300 Subject: [PATCH 263/276] test_integration.py: fix skipif test_config --- bdfr/tests/test_integration.py | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/bdfr/tests/test_integration.py b/bdfr/tests/test_integration.py index 5ab0871..47b5229 100644 --- a/bdfr/tests/test_integration.py +++ 
b/bdfr/tests/test_integration.py @@ -9,10 +9,11 @@ from click.testing import CliRunner from bdfr.__main__ import cli +does_test_config_exist = Path('test_config.cfg').exists() @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-s', 'Mindustry', '-L', 1], ['-s', 'r/Mindustry', '-L', 1], @@ -42,7 +43,7 @@ def test_cli_download_subreddits(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-l', 'm2601g'], ['-l', 'https://www.reddit.com/r/TrollXChromosomes/comments/m2601g/its_a_step_in_the_right_direction/'], @@ -59,7 +60,7 @@ def test_cli_download_links(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10], ['--user', 'helen_darten', '-m', 'cuteanimalpics', '-L', 10, '--sort', 'rising'], @@ -76,7 +77,7 @@ def test_cli_download_multireddit(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'helen_darten', '-m', 'xxyyzzqwerty', '-L', 10], )) @@ -92,7 +93,7 @@ def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Pa @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'me', '--upvoted', '--authenticate', '-L', 10], ['--user', 'me', '--saved', '--authenticate', '-L', 10], @@ -113,7 +114,7 @@ def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit @pytest.mark.authenticated -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'me', '-L', 10, '--folder-scheme', ''], )) @@ -127,7 +128,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'python', 
'-L', 10, '--search-existing'], )) @@ -142,7 +143,7 @@ def test_cli_download_search_existing(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'tumblr', '-L', '25', '--skip', 'png', '--skip', 'jpg'], )) @@ -157,7 +158,7 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'all', '-L', '100', '--sort', 'new'], )) @@ -170,7 +171,7 @@ def test_cli_download_long(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-l', 'gstd4hk'], ['-l', 'm2601g'], @@ -185,7 +186,7 @@ def test_cli_archive_single(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'Mindustry', '-L', 25], ['--subreddit', 'Mindustry', '-L', 25, '--format', 'xml'], @@ -204,7 +205,7 @@ def test_cli_archive_subreddit(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'me', '--authenticate', '--all-comments', '-L', '10'], )) @@ -218,7 +219,7 @@ def test_cli_archive_all_user_comments(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--subreddit', 'all', '-L', 100], ['--subreddit', 'all', '-L', 100, '--sort', 'new'], @@ -234,7 +235,7 @@ def test_cli_archive_long(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit @pytest.mark.slow -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--user', 'sdclhgsolgjeroij', '--submitted', '-L', 10], ['--user', 'me', '--upvoted', '-L', 10], @@ -250,7 +251,7 @@ def test_cli_download_soft_fail(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit 
@pytest.mark.slow -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--time', 'random'], ['--sort', 'random'], @@ -271,7 +272,7 @@ def test_cli_download_use_default_config(tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-l', 'm2601g', '--exclude-id', 'm2601g'], )) @@ -286,7 +287,7 @@ def test_cli_download_links_exclusion(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['-l', 'm2601g', '--skip-subreddit', 'trollxchromosomes'], ['-s', 'trollxchromosomes', '--skip-subreddit', 'trollxchromosomes', '-L', '3'], @@ -302,7 +303,7 @@ def test_cli_download_subreddit_exclusion(test_args: list[str], tmp_path: Path): @pytest.mark.online @pytest.mark.reddit -@pytest.mark.skipif(Path('test_config.cfg') is False, reason='A test config file is required for integration tests') +@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests') @pytest.mark.parametrize('test_args', ( ['--file-scheme', '{TITLE}'], ['--file-scheme', '{TITLE}_test_{SUBREDDIT}'], From e8abec43a616b26742a7efd4889ff4b0e7430140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sat, 17 Apr 2021 17:35:50 +0300 Subject: [PATCH 264/276] Adds continuous integration with Github Actions --- .github/workflows/test.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..80e6ee7 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,34 @@ +name: Python Test + +on: + push: + branches: [ v2 ] + pull_request: + branches: [ v2 ] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Setup config file + run: | + cp bdfr/default_config.cfg ./test_config.cfg + echo "" >> ./test_config.cfg + echo "user_token = ${{ secrets.REDDIT_TEST_TOKEN }}" >> ./test_config.cfg + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics + - name: Test with pytest + run: | + pytest -m 'not slow' --verbose From e78ecd562606914b1aee78edcafe4cb0d8b34c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sat, 17 Apr 2021 17:49:44 +0300 Subject: [PATCH 265/276] README.md: Adds CI status badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d83da1a..ce35e26 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Bulk Downloader for Reddit v2 \[BETA\] +[![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. From 0d407d7a39bea3b8cb7ea5c22e6505d822e22221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Sun, 18 Apr 2021 03:26:07 +0300 Subject: [PATCH 266/276] Enhance docs --- README.md | 75 ++++++++++++++++++++++++++++---------------- docs/CONTRIBUTING.md | 32 ++++++++++++++++--- 2 files changed, 76 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index ce35e26..a825624 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,48 @@ # Bulk Downloader for Reddit v2 \[BETA\] [![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) -This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. - -Some quick reference commands are: - -```bash -python3 -m bdfr download --subreddit Python -L 10 -python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}' -python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links -python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme '' -``` +This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources) If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate. +## Installation +*Bulk Downloader for Reddit* **requires** Python 3.9.x and it is distributed via `pip`. Install it as such: +```bash +pip install bdfr +``` + +If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development) + ## Usage The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user. 
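For contributors who want to mirror the CI job added in PATCH 264 above before pushing, a rough local equivalent of its steps might look like this (a sketch only; `<your_refresh_token>` is a placeholder for a personal token, not a value from these patches):

```bash
# Approximate the GitHub Actions test job locally (illustrative sketch).
python3 -m pip install --upgrade pip flake8 pytest
pip install -r requirements.txt

# Build the test configuration the same way the workflow does.
cp bdfr/default_config.cfg ./test_config.cfg
echo "user_token = <your_refresh_token>" >> ./test_config.cfg

# Lint for syntax errors and undefined names, then run the fast tests.
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
pytest -m 'not slow' --verbose
```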
There are two modes to the BDFR: download, and archive. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as well as all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML.
 
-Many websites and links are supported for the downloader:
+After installation, run the program from any directory as shown below:
+```bash
+python3 -m bdfr download
+```
+```bash
+python3 -m bdfr archive
+```
 
-  - Direct links (links leading to a file)
-  - Erome
-  - Gfycat
-  - Gif Delivery Network
-  - Imgur
-  - Reddit Galleries
-  - Reddit Text Posts
-  - Reddit Videos
-  - Redgifs
-  - YouTube
+However, these commands alone are not enough. You should chain the parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:
 
-### Options
+```bash
+python3 -m bdfr download --subreddit Python -L 10
+```
+```bash
+python3 -m bdfr download --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'
+```
+```bash
+python3 -m bdfr download --subreddit 'Python, all, mindustry' -L 10 --make-hard-links
+```
+```bash
+python3 -m bdfr archive --subreddit all --format yaml -L 500 --folder-scheme ''
+```
+
+## Options
 
 The following options are common between both the `archive` and `download` commands of the BDFR.
 
@@ -103,7 +112,7 @@ The following options are common between both the `archive` and `download` comma
   - Increases the verbosity of the program
   - Can be specified multiple times
 
-#### Downloader Options
+### Downloader Options
 
 The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
 
@@ -145,7 +154,7 @@ The following options apply only to the `download` command. This command downloa
   - Can be specified multiple times
   - Also accepts CSV subreddit names
 
-#### Archiver Options
+### Archiver Options
 
 The following options are for the `archive` command specifically.
 
@@ -198,8 +207,7 @@ It is highly recommended that the file name scheme contain the parameter `{POSTI
 ## Configuration
 
 The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be:
-  - `C:\Documents and Settings\<User>\Application Data\Local Settings\BDFR\bdfr` or
-  - `C:\Documents and Settings\<User>\Application Data\BDFR\bdfr`
+  - `C:\Users\<User>\AppData\Local\BDFR\bdfr`
 
 On Mac OSX, this will be:
   - `~/Library/Application Support/bdfr`.
 
@@ -223,7 +231,7 @@ All of these should not be modified unless you know what you're doing, as the de
 
 Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count`, however, has to do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over".
If you want more records of past runs, increase this number.
 
-#### Rate Limiting
+### Rate Limiting
 
 The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases.
 
 To this end, the BDFR will sleep for a time before retrying the download, giving the remote site some time to recover. The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move on. **Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded i.e. specifying a max value of 300 (5 minutes) can make the BDFR pause for 15 minutes on one submission, not 5, in the worst case.
 
+## List of currently supported sources
+
+  - Direct links (links leading to a file)
+  - Erome
+  - Gfycat
+  - Gif Delivery Network
+  - Imgur
+  - Reddit Galleries
+  - Reddit Text Posts
+  - Reddit Videos
+  - Redgifs
+  - YouTube
+
 ## Contributing
 
 If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
index 54fc234..94ce748 100644
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -2,17 +2,21 @@
 
 When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised.
 
-If you have a question, **please don't open an issue on GitHub**. There is a discussion tab on the repository's GitHub where you can interact with the developers and ask questions. If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
+**Please don't open an issue on GitHub** unless you are reporting a bug or proposing a feature. For questions, there is a discussion tab on the repository's GitHub page where you can interact with the developers and ask questions. If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
 
 All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.
 
 ## Opening an Issue
 
+**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to that issue.
+
+### Bugs
 When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
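Fetching that log file for a bug report might look like the following (a sketch; the Linux path is an assumption based on common config locations, while the macOS path is the one the README states):

```bash
# Copy the log described above into your bug report (paths are illustrative).
cat ~/.config/bdfr/log_output.txt                        # common Linux location (assumed)
cat ~/Library/Application\ Support/bdfr/log_output.txt   # macOS, per the README
```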
If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be requested in the thread of the bug.
 
-In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
+### Feature requests
+In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has very little chance of being accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
 
 ## Pull Requests
 
@@ -23,12 +27,32 @@ Once you have done both of these, the below list shows the path that should be f
   1. If an issue does not already exist, open one that will relate to the PR.
   2. Ensure that any changes fit into the architecture specified above.
   3. Ensure that you have written tests that cover the new code.
-  4. Ensure that no existing tests fail, unless there is a good reason for them to do so. If there is, note which tests fail and why this is expected and okay in the PR.
+  4. Ensure that no existing tests fail, unless there is a good reason for them to do so.
   5. If needed, update any documentation with changes.
   6. Open a pull request that references the relevant issue.
   7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
 
-Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
+Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
+
+## Preparing the environment for development
+
+Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.
+
+BDfR is built in a way that it can be packaged and installed via `pip`. This places BDfR next to other Python packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.
+
+To install the program, clone the repository and run pip inside the project's root directory:
+```bash
+$ git clone https://github.com/aliparlakci/bulk-downloader-for-reddit.git
+$ cd ./bulk-downloader-for-reddit
+$ python3 -m pip install -e .
+```
+
+The **`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immediately. So, when developing, you can be sure that the package is not stale and Python is always running your latest changes.
(Due to this linking, moving/removing/renaming the folder might break it)
+
+Then, you can run the program from anywhere on your disk, like so:
+```bash
+$ python3 -m bdfr
+```
 
 ## Style Guide

From aefe8b79b63d850e4203259fc7be02e147e44697 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Sun, 18 Apr 2021 12:04:51 +0300
Subject: [PATCH 267/276] templates: add issue templates (#276)

---
 .github/ISSUE_TEMPLATE/bug_report.md | 29 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.md | 15 ++++++++++
 .../ISSUE_TEMPLATE/site-support-request.md | 18 ++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 create mode 100644 .github/ISSUE_TEMPLATE/site-support-request.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..87364f4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,29 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: bug
+assignees: ''
+
+---
+
+- [ ] I am reporting a bug.
+- [ ] I am running the latest version of BDfR
+- [ ] I have read the [Opening an issue](../../docs/CONTRIBUTING.md#opening-an-issue)
+
+## Description
+A clear and concise description of what the bug is.
+
+## Command
+```
+Paste here the command(s) that cause the bug
+```
+
+## Environment (please complete the following information):
+ - OS: [e.g. Windows 10]
+ - Python version: [e.g. 3.9.4]
+
+## Logs
+```
+Paste the log output here.
+```
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..fbf7f6b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,15 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: "[FEATURE]"
+labels: ''
+assignees: ''
+
+---
+
+- [ ] I am requesting a feature.
+- [ ] I am running the latest version of BDfR
+- [ ] I have read the [Opening an issue](../../docs/CONTRIBUTING.md#opening-an-issue)
+
+## Description
+Clearly state the current situation and issues you experience. Then, explain how this feature would solve these issues and make life easier. Also, explain the feature with as much detail as possible.
diff --git a/.github/ISSUE_TEMPLATE/site-support-request.md b/.github/ISSUE_TEMPLATE/site-support-request.md
new file mode 100644
index 0000000..8524bd8
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/site-support-request.md
@@ -0,0 +1,18 @@
+---
+name: Site Support request
+about: Request support for a new site
+title: "[SITE]"
+labels: ''
+assignees: ''
+
+---
+
+- [ ] I am requesting site support.
+- [ ] I am running the latest version of BDfR
+- [ ] I have read the [Opening an issue](../../docs/CONTRIBUTING.md#opening-an-issue)
+
+## Site
+Provide a URL to the domain of the site.
+
+## Example posts
+Provide example reddit posts from the domain.
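As a follow-up to the editable install described in the CONTRIBUTING changes above, pip still tracks the package like any other, so it can be inspected or undone with standard pip commands (a sketch, not text from the patches):

```bash
# The editable install is a tracked package; 'Location:' points into the clone.
pip show bdfr

# Removes the link without deleting the cloned folder itself.
pip uninstall bdfr
```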
From b37ff0714f899395145d140a8d4ec19636a22f5a Mon Sep 17 00:00:00 2001 From: Serene <33189705+Serene-Arc@users.noreply.github.com> Date: Sun, 18 Apr 2021 21:24:11 +1000 Subject: [PATCH 268/276] Fix time filters (#279) --- bdfr/downloader.py | 52 ++++++++------ bdfr/resource.py | 6 +- bdfr/site_downloaders/download_factory.py | 35 ++++++---- .../site_downloaders/test_download_factory.py | 11 +++ bdfr/tests/test_downloader.py | 70 +++++++++++++------ bdfr/tests/test_integration.py | 1 - bdfr/tests/test_resource.py | 8 ++- 7 files changed, 121 insertions(+), 62 deletions(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 4897831..c24b5cd 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -41,19 +41,20 @@ def _calc_hash(existing_file: Path): class RedditTypes: class SortType(Enum): - HOT = auto() - RISING = auto() CONTROVERSIAL = auto() + HOT = auto() NEW = auto() RELEVENCE = auto() + RISING = auto() + TOP = auto() class TimeType(Enum): - HOUR = auto() - DAY = auto() - WEEK = auto() - MONTH = auto() - YEAR = auto() - ALL = auto() + ALL = 'all' + DAY = 'day' + HOUR = 'hour' + MONTH = 'month' + WEEK = 'week' + YEAR = 'year' class RedditDownloader: @@ -229,16 +230,16 @@ class RedditDownloader: try: reddit = self.reddit_instance.subreddit(reddit) if self.args.search: - out.append( - reddit.search( - self.args.search, - sort=self.sort_filter.name.lower(), - limit=self.args.limit, - )) + out.append(reddit.search( + self.args.search, + sort=self.sort_filter.name.lower(), + limit=self.args.limit, + time_filter=self.time_filter.value, + )) logger.debug( f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"') else: - out.append(sort_function(reddit, limit=self.args.limit)) + out.append(self._create_filtered_listing_generator(reddit)) logger.debug(f'Added submissions from subreddit {reddit}') except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e: logger.error(f'Failed to get submissions for subreddit {reddit}: {e}') @@ -271,6 +272,8 @@ class RedditDownloader: sort_function = praw.models.Subreddit.rising elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL: sort_function = praw.models.Subreddit.controversial + elif self.sort_filter is RedditTypes.SortType.TOP: + sort_function = praw.models.Subreddit.top else: sort_function = praw.models.Subreddit.hot return sort_function @@ -278,13 +281,12 @@ class RedditDownloader: def _get_multireddits(self) -> list[Iterator]: if self.args.multireddit: out = [] - sort_function = self._determine_sort_function() for multi in self._split_args_input(self.args.multireddit): try: multi = self.reddit_instance.multireddit(self.args.user, multi) if not multi.subreddits: raise errors.BulkDownloaderException - out.append(sort_function(multi, limit=self.args.limit)) + out.append(self._create_filtered_listing_generator(multi)) logger.debug(f'Added submissions from multireddit {multi}') except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e: logger.error(f'Failed to get submissions for multireddit {multi}: {e}') @@ -292,6 +294,13 @@ class RedditDownloader: else: return [] + def _create_filtered_listing_generator(self, reddit_source) -> Iterator: + sort_function = self._determine_sort_function() + if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL): + return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value) + else: + return sort_function(reddit_source, limit=self.args.limit) + def 
_get_user_data(self) -> list[Iterator]: if any([self.args.submitted, self.args.upvoted, self.args.saved]): if self.args.user: @@ -299,14 +308,11 @@ class RedditDownloader: logger.error(f'User {self.args.user} does not exist') return [] generators = [] - sort_function = self._determine_sort_function() if self.args.submitted: logger.debug(f'Retrieving submitted posts of user {self.args.user}') - generators.append( - sort_function( - self.reddit_instance.redditor(self.args.user).submissions, - limit=self.args.limit, - )) + generators.append(self._create_filtered_listing_generator( + self.reddit_instance.redditor(self.args.user).submissions, + )) if not self.authenticated and any((self.args.upvoted, self.args.saved)): logger.warning('Accessing user lists requires authentication') else: diff --git a/bdfr/resource.py b/bdfr/resource.py index be6aaaf..966f5ba 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -6,6 +6,7 @@ import logging import re import time from typing import Optional +import urllib.parse import _hashlib import requests @@ -64,7 +65,8 @@ class Resource: self.hash = hashlib.md5(self.content) def _determine_extension(self) -> Optional[str]: - extension_pattern = re.compile(r'.*(\..{3,5})(?:\?.*)?(?:#.*)?$') - match = re.search(extension_pattern, self.url) + extension_pattern = re.compile(r'.*(\..{3,5})$') + stripped_url = urllib.parse.urlsplit(self.url).path + match = re.search(extension_pattern, stripped_url) if match: return match.group(1) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 8a39413..4bd6225 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -2,6 +2,7 @@ # coding=utf-8 import re +import urllib.parse from typing import Type from bdfr.exceptions import NotADownloadableLinkError @@ -21,30 +22,38 @@ from bdfr.site_downloaders.youtube import Youtube class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: - url_beginning = r'\s*(https?://(www\.)?)' - if re.match(url_beginning + r'(i\.)?imgur.*\.gifv$', url): + sanitised_url = DownloadFactory._sanitise_url(url) + if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url): return Imgur - elif re.match(url_beginning + r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', url): + elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url): return Direct - elif re.match(url_beginning + r'erome\.com.*', url): + elif re.match(r'erome\.com.*', sanitised_url): return Erome - elif re.match(url_beginning + r'reddit\.com/gallery/.*', url): + elif re.match(r'reddit\.com/gallery/.*', sanitised_url): return Gallery - elif re.match(url_beginning + r'gfycat\.', url): + elif re.match(r'gfycat\.', sanitised_url): return Gfycat - elif re.match(url_beginning + r'gifdeliverynetwork', url): + elif re.match(r'gifdeliverynetwork', sanitised_url): return GifDeliveryNetwork - elif re.match(url_beginning + r'(m\.)?imgur.*', url): + elif re.match(r'(m\.)?imgur.*', sanitised_url): return Imgur - elif re.match(url_beginning + r'redgifs.com', url): + elif re.match(r'redgifs.com', sanitised_url): return Redgifs - elif re.match(url_beginning + r'reddit\.com/r/', url): + elif re.match(r'reddit\.com/r/', sanitised_url): return SelfPost - elif re.match(url_beginning + r'v\.redd\.it', url): + elif re.match(r'v\.redd\.it', sanitised_url): return VReddit - elif re.match(url_beginning + r'(m\.)?youtu\.?be', url): + elif re.match(r'(m\.)?youtu\.?be', sanitised_url): return Youtube - elif re.match(url_beginning + r'i\.redd\.it.*', url): + elif 
re.match(r'i\.redd\.it.*', sanitised_url):
             return Direct
         else:
             raise NotADownloadableLinkError(f'No downloader module exists for url {url}')
+
+    @staticmethod
+    def _sanitise_url(url: str) -> str:
+        beginning_regex = re.compile(r'\s*(www\.?)?')
+        split_url = urllib.parse.urlsplit(url)
+        split_url = split_url.netloc + split_url.path
+        split_url = re.sub(beginning_regex, '', split_url)
+        return split_url
diff --git a/bdfr/tests/site_downloaders/test_download_factory.py b/bdfr/tests/site_downloaders/test_download_factory.py
index 5d6260e..65625b7 100644
--- a/bdfr/tests/site_downloaders/test_download_factory.py
+++ b/bdfr/tests/site_downloaders/test_download_factory.py
@@ -58,3 +58,14 @@ def test_factory_lever_bad(test_url: str):
     with pytest.raises(NotADownloadableLinkError):
         DownloadFactory.pull_lever(test_url)
+
+
+@pytest.mark.parametrize(('test_url', 'expected'), (
+    ('www.test.com/test.png', 'test.com/test.png'),
+    ('www.test.com/test.png?test_value=random', 'test.com/test.png'),
+    ('https://youtube.com/watch?v=Gv8Wz74FjVA', 'youtube.com/watch'),
+    ('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'),
+))
+def test_sanitise_url(test_url: str, expected: str):
+    result = DownloadFactory._sanitise_url(test_url)
+    assert result == expected
diff --git a/bdfr/tests/test_downloader.py b/bdfr/tests/test_downloader.py
index 9a4f051..0d609ef 100644
--- a/bdfr/tests/test_downloader.py
+++ b/bdfr/tests/test_downloader.py
@@ -148,54 +148,71 @@ def test_get_submissions_from_link(
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.parametrize(('test_subreddits', 'limit'), (
-    (('Futurology',), 10),
-    (('Futurology', 'Mindustry, Python'), 10),
-    (('Futurology',), 20),
-    (('Futurology', 'Python'), 10),
-    (('Futurology',), 100),
-    (('Futurology',), 0),
+@pytest.mark.parametrize(('test_subreddits', 'limit', 'sort_type', 'time_filter', 'max_expected_len'), (
+    (('Futurology',), 10, 'hot', 'all', 10),
+    (('Futurology', 'Mindustry, Python'), 10, 'hot', 'all', 30),
+    (('Futurology',), 20, 'hot', 'all', 20),
+    (('Futurology', 'Python'), 10, 'hot', 'all', 20),
+    (('Futurology',), 100, 'hot', 'all', 100),
+    (('Futurology',), 0, 'hot', 'all', 0),
+    (('Futurology',), 10, 'top', 'all', 10),
+    (('Futurology',), 10, 'top', 'week', 10),
+    (('Futurology',), 10, 'hot', 'week', 10),
 ))
 def test_get_subreddit_normal(
         test_subreddits: list[str],
         limit: int,
+        sort_type: str,
+        time_filter: str,
+        max_expected_len: int,
         downloader_mock: MagicMock,
-        reddit_instance: praw.Reddit):
+        reddit_instance: praw.Reddit,
+):
     downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot
     downloader_mock.args.limit = limit
+    downloader_mock.args.sort = sort_type
     downloader_mock.args.subreddit = test_subreddits
     downloader_mock.reddit_instance = reddit_instance
-    downloader_mock.sort_filter = RedditTypes.SortType.HOT
+    downloader_mock.sort_filter = RedditDownloader._create_sort_filter(downloader_mock)
     results = RedditDownloader._get_subreddits(downloader_mock)
     test_subreddits = downloader_mock._split_args_input(test_subreddits)
-    results = assert_all_results_are_submissions(
-        (limit * len(test_subreddits)) if limit else None, results)
+    results = [sub for res1 in results for sub in res1]
+    assert all([isinstance(res1, praw.models.Submission) for res1 in results])
     assert all([res.subreddit.display_name in test_subreddits for res in results])
+    assert len(results) <= max_expected_len
 
 
 @pytest.mark.online
 @pytest.mark.reddit
-@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit'), ( - (('Python',), 'scraper', 10), - (('Python',), '', 10), - (('Python',), 'djsdsgewef', 0), +@pytest.mark.parametrize(('test_subreddits', 'search_term', 'limit', 'time_filter', 'max_expected_len'), ( + (('Python',), 'scraper', 10, 'all', 10), + (('Python',), '', 10, 'all', 10), + (('Python',), 'djsdsgewef', 10, 'all', 0), + (('Python',), 'scraper', 10, 'year', 10), + (('Python',), 'scraper', 10, 'hour', 1), )) def test_get_subreddit_search( test_subreddits: list[str], search_term: str, + time_filter: str, limit: int, + max_expected_len: int, downloader_mock: MagicMock, - reddit_instance: praw.Reddit): + reddit_instance: praw.Reddit, +): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.args.limit = limit downloader_mock.args.search = search_term downloader_mock.args.subreddit = test_subreddits downloader_mock.reddit_instance = reddit_instance downloader_mock.sort_filter = RedditTypes.SortType.HOT + downloader_mock.args.time = time_filter + downloader_mock.time_filter = RedditDownloader._create_time_filter(downloader_mock) results = RedditDownloader._get_subreddits(downloader_mock) - results = assert_all_results_are_submissions( - (limit * len(test_subreddits)) if limit else None, results) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) assert all([res.subreddit.display_name in test_subreddits for res in results]) + assert len(results) <= max_expected_len @pytest.mark.online @@ -210,15 +227,23 @@ def test_get_multireddits_public( test_multireddits: list[str], limit: int, reddit_instance: praw.Reddit, - downloader_mock: MagicMock): + downloader_mock: MagicMock, +): downloader_mock._determine_sort_function.return_value = praw.models.Subreddit.hot downloader_mock.sort_filter = RedditTypes.SortType.HOT downloader_mock.args.limit = limit downloader_mock.args.multireddit = test_multireddits downloader_mock.args.user = test_user downloader_mock.reddit_instance = reddit_instance + downloader_mock._create_filtered_listing_generator.return_value = \ + RedditDownloader._create_filtered_listing_generator( + downloader_mock, + reddit_instance.multireddit(test_user, test_multireddits[0]), + ) results = RedditDownloader._get_multireddits(downloader_mock) - assert_all_results_are_submissions((limit * len(test_multireddits)) if limit else None, results) + results = [sub for res in results for sub in res] + assert all([isinstance(res, praw.models.Submission) for res in results]) + assert len(results) == limit @pytest.mark.online @@ -236,6 +261,11 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic downloader_mock.args.user = test_user downloader_mock.authenticated = False downloader_mock.reddit_instance = reddit_instance + downloader_mock._create_filtered_listing_generator.return_value = \ + RedditDownloader._create_filtered_listing_generator( + downloader_mock, + reddit_instance.redditor(test_user).submissions, + ) results = RedditDownloader._get_user_data(downloader_mock) results = assert_all_results_are_submissions(limit, results) assert all([res.author.name == test_user for res in results]) diff --git a/bdfr/tests/test_integration.py b/bdfr/tests/test_integration.py index 47b5229..396025b 100644 --- a/bdfr/tests/test_integration.py +++ b/bdfr/tests/test_integration.py @@ -101,7 +101,6 @@ def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Pa ['--user', 
'djnish', '--submitted', '-L', 10], ['--user', 'djnish', '--submitted', '-L', 10, '--time', 'month'], ['--user', 'djnish', '--submitted', '-L', 10, '--sort', 'controversial'], - ['--user', 'djnish', '--submitted', '-L', 10, '--sort', 'controversial', '--time', 'month'], )) def test_cli_download_user_data_good(test_args: list[str], tmp_path: Path): runner = CliRunner() diff --git a/bdfr/tests/test_resource.py b/bdfr/tests/test_resource.py index de6030b..272c457 100644 --- a/bdfr/tests/test_resource.py +++ b/bdfr/tests/test_resource.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 # coding=utf-8 -import pytest from unittest.mock import MagicMock +import pytest + from bdfr.resource import Resource @@ -15,8 +16,9 @@ from bdfr.resource import Resource ('https://www.resource.com/test/example.jpg', '.jpg'), ('hard.png.mp4', '.mp4'), ('https://preview.redd.it/7zkmr1wqqih61.png?width=237&format=png&auto=webp&s=19de214e634cbcad99', '.png'), - ('test.jpg#test','.jpg'), - ('test.jpg?width=247#test','.jpg'), + ('test.jpg#test', '.jpg'), + ('test.jpg?width=247#test', '.jpg'), + ('https://www.test.com/test/test2/example.png?random=test#thing', '.png'), )) def test_resource_get_extension(test_url: str, expected: str): test_resource = Resource(MagicMock(), test_url) From ee9dec16bf5d90a8ac04bc05604be153215851b4 Mon Sep 17 00:00:00 2001 From: vlad doster Date: Mon, 19 Apr 2021 15:50:08 -0500 Subject: [PATCH 269/276] (feat) test multiple python versions --- .github/workflows/test.yml | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 80e6ee7..20f106f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,27 +8,36 @@ on: jobs: test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + steps: + - uses: actions/checkout@v2 - - name: Set up Python 3.9 + - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest + python -m pip install --upgrade pip flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Setup config file + + - name: Setup test configuration run: | cp bdfr/default_config.cfg ./test_config.cfg - echo "" >> ./test_config.cfg - echo "user_token = ${{ secrets.REDDIT_TEST_TOKEN }}" >> ./test_config.cfg - - name: Lint with flake8 + echo -e "\nuser_token = ${{ secrets.REDDIT_TEST_TOKEN }}" >> ./test_config.cfg + + - name: Lint w/ flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics
-    - name: Test with pytest
+
+    - name: Test w/ PyTest
       run: |
         pytest -m 'not slow' --verbose

From 71664dc70a7ae5639a7e38781516780177ae1f57 Mon Sep 17 00:00:00 2001
From: Ali Parlakci
Date: Tue, 20 Apr 2021 14:32:59 +0300
Subject: [PATCH 270/276] test.yml: remove unsupported Python versions

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 20f106f..43e14a9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,7 +13,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.9]
 
     steps:
 

From 61489dc73dd4a95b2463afd8eddd1d44686a6819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Tue, 20 Apr 2021 16:04:41 +0300
Subject: [PATCH 271/276] README.md: clarify Python requirement

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a825624..c6a4c9c 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@ This is a tool to download submissions or submission data from Reddit. It can be
 If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate.
 
 ## Installation
-*Bulk Downloader for Reddit* **requires** Python 3.9.x and it is distributed via `pip`. Install it as such:
+*Bulk Downloader for Reddit* requires Python 3.9 or above. Please update your Python version to meet this requirement before installing. Then, you can install it as such:
 ```bash
-pip install bdfr
+python3 -m pip install bdfr
 ```
 
 If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development)

From b355fb6500f367d26d7d19bdd2722fa28c7a4ab6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Tue, 20 Apr 2021 16:43:25 +0300
Subject: [PATCH 272/276] test.yml: add coverage report (#282)

---
 .github/workflows/test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 43e14a9..426bdca 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
 
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip flake8 pytest
+        python -m pip install --upgrade pip flake8 pytest pytest-cov
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 
     - name: Setup test configuration
@@ -40,4 +40,4 @@ jobs:
 
     - name: Test w/ PyTest
       run: |
-        pytest -m 'not slow' --verbose
+        pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered

From 92c7f9bcff3a52526f52bd6bbe6c1b32980c5778 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Tue, 20 Apr 2021 17:15:26 +0300
Subject: [PATCH 273/276] test.yml: upload coverage report (#283)

---
 .github/workflows/test.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 426bdca..d6bfa4b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -40,4 +40,10 @@ jobs:
 
     - name: Test w/ PyTest
       run: |
-        pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered
+        pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html
+
+    - name: Upload coverage report
+      uses: 
actions/upload-artifact@v2 + with: + name: coverage_report + path: htmlcov/ From 298509c7fa3991a5b4b8d9c8ce823f842653bcae Mon Sep 17 00:00:00 2001 From: Ali Parlakci Date: Tue, 20 Apr 2021 18:57:02 +0300 Subject: [PATCH 274/276] setup.cfg: add more metadata --- setup.cfg | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 76bcca2..3b57d7a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,23 @@ [metadata] -name = Bulk Downloader for Reddit +name = bdfr +description_file = README.md +description_content_type = text/markdown +home_page = https://github.com/aliparlakci/bulk-downloader-for-reddit +keywords = reddit, download, archive +version = 2.0.3 author = Ali Parlakci -author-email = parlakciali@gmail.com -python_requires = >=3.9 +author_email = parlakciali@gmail.com +maintainer = Serene Arc +maintainer_email = serenical@gmail.com +license = GPLv3 +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: GNU General Public License v3 (GPLv3) + Natural Language :: English + Environment :: Console + Operating System :: OS Independent +requires_python = >=3.9 +platforms = any [files] packages = bdfr From c0cef487bd68b528edd430c55bf27117a0533e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Tue, 20 Apr 2021 19:01:32 +0300 Subject: [PATCH 275/276] README.md: drop the beta --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c6a4c9c..2af82ca 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Bulk Downloader for Reddit v2 \[BETA\] +# Bulk Downloader for Reddit v2 [![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources) From 44ad34901c93edeb1e8e92e1ce853f44977adbd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?= Date: Tue, 20 Apr 2021 19:01:57 +0300 Subject: [PATCH 276/276] README.md: drop v2 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2af82ca..25057ff 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Bulk Downloader for Reddit v2 +# Bulk Downloader for Reddit [![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=v2)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml) This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources)