* IMGUR API is no longer used

* --skip now accepts file types instead of domain

* --skip-domain added

* --no-download added

* --no-dupe now supports YouTube

* Duplicates of older posts will not be downloaded if the --no-dupe and --downloaded-posts options are given together

* Characters that are invalid on macOS and Linux platforms are removed from filenames

* Bug fixes
This commit is contained in:
Ali Parlakçı
2020-06-03 18:10:25 +03:00
committed by GitHub
parent 540e1e8a6e
commit af1f9fd365
14 changed files with 277 additions and 298 deletions

View File

@@ -102,6 +102,13 @@ class Arguments:
type=str)
parser.add_argument("--skip",
nargs="+",
help="Skip posts with given type",
type=str,
choices=["images","videos","gifs","self"],
default=[])
parser.add_argument("--skip-domain",
nargs="+",
help="Skip posts with given domain",
type=str,
@@ -140,7 +147,13 @@ class Arguments:
parser.add_argument("--downloaded-posts",
help="Use a hash file to keep track of downloaded files",
type=str
)
)
parser.add_argument("--no-download",
action="store_true",
help="Just saved posts into a the POSTS.json file without downloading"
)
if arguments == []:
return parser.parse_args()

View File

@@ -5,6 +5,7 @@ import random
from src.reddit import Reddit
from src.jsonHelper import JsonFile
from src.utils import nameCorrector
class Config():
@@ -36,7 +37,7 @@ For example: {FLAIR}_{SUBREDDIT}_{REDDITOR}
Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"])
filename = input(">> ").upper()
filename = nameCorrector(input(">> ").upper())
self.file.add({
"filename": filename
})
@@ -68,7 +69,7 @@ For example: {REDDITOR}/{SUBREDDIT}/{FLAIR}
Existing folder structure""", None if "folderpath" not in self.file.read() else self.file.read()["folderpath"])
folderpath = input(">> ").strip("\\").strip("/").upper()
folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper())
self.file.add({
"folderpath": folderpath
@@ -105,8 +106,6 @@ Existing default options:""", None if "options" not in self.file.read() else sel
def _validateCredentials(self):
"""Read credentials from config.json file"""
keys = ['imgur_client_id',
'imgur_client_secret']
try:
content = self.file.read()["credentials"]
except:
@@ -119,25 +118,7 @@ Existing default options:""", None if "options" not in self.file.read() else sel
pass
else:
Reddit().begin()
if not all(content.get(key,False) for key in keys):
print(
"---Setting up the Imgur API---\n\n" \
"Go to this URL and fill the form:\n" \
"https://api.imgur.com/oauth2/addclient\n" \
"Then, enter the client id and client secret here\n" \
"Press Enter to open the link in the browser"
)
input()
webbrowser.open("https://api.imgur.com/oauth2/addclient",new=2)
for key in keys:
try:
if content[key] == "":
raise KeyError
except KeyError:
self.file.add({key:input("\t"+key+": ")},
"credentials")
print()
def setDefaultDirectory(self):

View File

@@ -1,137 +1,142 @@
import urllib
import json
import os
import time
import requests
import imgurpython
from src.downloaders.downloaderUtils import getExtension, getFile
from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError,
FileNameTooLong)
from src.utils import GLOBAL, nameCorrector
from src.utils import printToFile as print
from src.downloaders.Direct import Direct
from src.downloaders.downloaderUtils import getFile
from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, ExtensionError, NotADownloadableLinkError, TypeInSkip
class Imgur:
def __init__(self,directory,post):
self.imgurClient = self.initImgur()
imgurID = self.getId(post['CONTENTURL'])
content = self.getLink(imgurID)
IMGUR_IMAGE_DOMAIN = "https://i.imgur.com/"
if not os.path.exists(directory): os.makedirs(directory)
def __init__(self,directory, post):
if content['type'] == 'image':
link = post['CONTENTURL']
try:
post['MEDIAURL'] = content['object'].mp4
except AttributeError:
post['MEDIAURL'] = content['object'].link
if link.endswith(".gifv"):
link = link.replace(".gifv",".mp4")
Direct(directory, {**post, 'CONTENTURL': link})
return None
post['EXTENSION'] = getExtension(post['MEDIAURL'])
self.rawData = self.getData(link)
filename = GLOBAL.config['filename'].format(**post)+post["EXTENSION"]
shortFilename = post['POSTID']+post['EXTENSION']
getFile(filename,shortFilename,directory,post['MEDIAURL'])
elif content['type'] == 'album':
images = content['object'].images
imagesLenght = len(images)
howManyDownloaded = imagesLenght
duplicates = 0
filename = GLOBAL.config['filename'].format(**post)
print(filename)
folderDir = directory / filename
try:
if not os.path.exists(folderDir):
os.makedirs(folderDir)
except FileNotFoundError:
folderDir = directory / post['POSTID']
os.makedirs(folderDir)
for i in range(imagesLenght):
try:
imageURL = images[i]['mp4']
except KeyError:
imageURL = images[i]['link']
images[i]['Ext'] = getExtension(imageURL)
filename = (str(i+1)
+ "_"
+ nameCorrector(str(images[i]['title']))
+ "_"
+ images[i]['id'])
shortFilename = (str(i+1) + "_" + images[i]['id'])
print("\n ({}/{})".format(i+1,imagesLenght))
try:
getFile(filename,shortFilename,folderDir,imageURL,indent=2)
print()
except FileAlreadyExistsError:
print(" The file already exists" + " "*10,end="\n\n")
duplicates += 1
howManyDownloaded -= 1
except Exception as exception:
print("\n Could not get the file")
print(
" "
+ "{class_name}: {info}".format(
class_name=exception.__class__.__name__,
info=str(exception)
)
+ "\n"
)
howManyDownloaded -= 1
if duplicates == imagesLenght:
raise FileAlreadyExistsError
elif howManyDownloaded + duplicates < imagesLenght:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)
@staticmethod
def initImgur():
"""Initialize imgur api"""
config = GLOBAL.config
return imgurpython.ImgurClient(
config["credentials"]['imgur_client_id'],
config["credentials"]['imgur_client_secret']
)
def getId(self,submissionURL):
"""Extract imgur post id
and determine if its a single image or album
"""
if submissionURL[-1] == "/":
submissionURL = submissionURL[:-1]
if "a/" in submissionURL or "gallery/" in submissionURL:
albumId = submissionURL.split("/")[-1]
return {'id':albumId, 'type':'album'}
self.directory = directory
self.post = post
if self.isAlbum:
if self.rawData["album_images"]["count"] != 1:
self.downloadAlbum(self.rawData["album_images"])
else:
self.download(self.rawData["album_images"]["images"][0])
else:
url = submissionURL.replace('.','/').split('/')
imageId = url[url.index('com')+1]
return {'id':imageId, 'type':'image'}
self.download(self.rawData)
def getLink(self,identity):
"""Request imgur object from imgur api
"""
def downloadAlbum(self, images):
    """Download every image of an album into a folder of its own.

    The folder is named after the configured filename template; if that
    path cannot be created, the post id is used as the folder name.

    Args:
        images: Album description. It is read through its "count" and
            "images" keys; each image entry is read through its "ext",
            "hash" and "title" keys.

    Raises:
        FileAlreadyExistsError: Every image turned out to be a duplicate.
        AlbumNotDownloadedCompletely: At least one image could not be
            downloaded.
    """
    folderName = GLOBAL.config['filename'].format(**self.post)
    folderDir = self.directory / folderName

    imagesLenght = images["count"]
    howManyDownloaded = 0
    duplicates = 0

    try:
        if not os.path.exists(folderDir):
            os.makedirs(folderDir)
    except FileNotFoundError:
        # Fall back to the post id when the templated name is unusable.
        folderDir = self.directory / self.post['POSTID']
        os.makedirs(folderDir)

    print(folderName)

    for i in range(imagesLenght):
        image = images["images"][i]
        position = str(i+1)

        extension = self.validateExtension(image["ext"])
        imageURL = self.IMGUR_IMAGE_DOMAIN + image["hash"] + extension

        # str() guards against a missing (None) title: nameCorrector needs
        # a string, and a failure here would abort the whole album because
        # it happens outside the per-image try block below.
        filename = "_".join([
            position, nameCorrector(str(image['title'])), image['hash']
        ]) + extension
        shortFilename = position + "_" + image['hash']

        print("\n ({}/{})".format(i+1,imagesLenght))

        try:
            getFile(filename,shortFilename,folderDir,imageURL,indent=2)
            howManyDownloaded += 1
            print()
        except FileAlreadyExistsError:
            print(" The file already exists" + " "*10,end="\n\n")
            duplicates += 1
        except TypeInSkip:
            # A skipped type counts as handled, not as a failure.
            print(" Skipping...")
            howManyDownloaded += 1
        except Exception as exception:
            print("\n Could not get the file")
            print(
                " "
                + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
                    class_name=exception.__class__.__name__,
                    info=str(exception)
                )
                + "\n"
            )
            print(GLOBAL.log_stream.getvalue(),noPrint=True)

    if duplicates == imagesLenght:
        raise FileAlreadyExistsError
    elif howManyDownloaded + duplicates < imagesLenght:
        raise AlbumNotDownloadedCompletely(
            "Album Not Downloaded Completely"
        )
def download(self, image):
    """Download one image.

    Args:
        image: Image description, read through its "ext" and "hash" keys.
    """
    ext = self.validateExtension(image["ext"])
    url = self.IMGUR_IMAGE_DOMAIN + image["hash"] + ext

    # Long name from the configured template, short name from the post id.
    longName = GLOBAL.config['filename'].format(**self.post) + ext
    shortName = self.post['POSTID'] + ext

    getFile(longName, shortName, self.directory, url)
@property
def isAlbum(self):
    """Whether the parsed page data contains an "album_images" entry."""
    pageData = self.rawData
    return "album_images" in pageData
@staticmethod
def getData(link):
    """Fetch an imgur page and return the JSON object embedded in it.

    The object is the text found between the "image : " and "group :"
    markers of the page source.

    Args:
        link: URL of the page to read.

    Returns:
        The decoded JSON data.

    Raises:
        ImageNotFound: The server did not answer with status 200.
        NotADownloadableLinkError: The markers are not in the page source.
    """
    cookies = {"over18": "1"}
    res = requests.get(link, cookies=cookies)
    if res.status_code != 200:
        raise ImageNotFound(f"Server responded with {res.status_code} to {link}")

    # Reuse the response that was just checked instead of requesting the
    # page a second time.
    pageSource = res.text

    STARTING_STRING = "image : "
    ENDING_STRING = "group :"

    try:
        startIndex = pageSource.index(STARTING_STRING) + len(STARTING_STRING)
        # Look for the end marker only after the start marker.
        endIndex = pageSource.index(ENDING_STRING, startIndex)
    except ValueError:
        raise NotADownloadableLinkError(f"Could not read the page source on {link}")

    # The last character of the stripped slice is dropped before decoding
    # (presumably the trailing comma after the object -- TODO confirm).
    data = pageSource[startIndex:endIndex].strip()[:-1]

    return json.loads(data)
if identity['type'] == 'image':
return {'object':self.imgurClient.get_image(identity['id']),
'type':'image'}
elif identity['type'] == 'album':
return {'object':self.imgurClient.get_album(identity['id']),
'type':'album'}
@staticmethod
def get_credits():
return Imgur.initImgur().get_credits()
def validateExtension(string):
    """Return the first known extension that occurs inside *string*.

    Raises:
        ExtensionError: None of the known extensions occurs in *string*.
    """
    POSSIBLE_EXTENSIONS = [".jpg", ".png", ".mp4", ".gif"]

    # Substring match, checked in list order.
    found = next(
        (known for known in POSSIBLE_EXTENSIONS if known in string),
        None,
    )
    if found is None:
        raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.")
    return found

View File

@@ -8,7 +8,7 @@ import hashlib
from src.utils import nameCorrector, GLOBAL
from src.utils import printToFile as print
from src.errors import FileAlreadyExistsError, FileNameTooLong, FailedToDownload, DomainInSkip
from src.errors import FileAlreadyExistsError, FileNameTooLong, FailedToDownload, TypeInSkip, DomainInSkip
def dlProgress(count, blockSize, totalSize):
"""Function for writing download progress to console
@@ -37,7 +37,18 @@ def getExtension(link):
def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):
if any(domain in imageURL for domain in GLOBAL.arguments.skip):
FORMATS = {
"videos": [".mp4", ".webm"],
"images": [".jpg",".jpeg",".png",".bmp"],
"gifs": [".gif"]
}
for type in GLOBAL.arguments.skip:
for extension in FORMATS[type]:
if extension in filename:
raise TypeInSkip
if any(domain in imageURL for domain in GLOBAL.arguments.skip_domain):
raise DomainInSkip
headers = [
@@ -52,13 +63,13 @@ def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):
("Connection", "keep-alive")
]
if not os.path.exists(folderDir): os.makedirs(folderDir)
opener = urllib.request.build_opener()
if not "imgur" in imageURL:
opener.addheaders = headers
urllib.request.install_opener(opener)
filename = nameCorrector(filename)
if not silent: print(" "*indent + str(folderDir),
" "*indent + str(filename),
sep="\n")
@@ -74,12 +85,12 @@ def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):
tempDir,
reporthook=dlProgress)
fileHash = createHash(tempDir)
if GLOBAL.arguments.no_dupes:
fileHash = createHash(tempDir)
if fileHash in GLOBAL.hashList:
if fileHash in GLOBAL.downloadedPosts():
os.remove(tempDir)
raise FileAlreadyExistsError
GLOBAL.hashList.add(fileHash)
GLOBAL.downloadedPosts.add(fileHash)
os.rename(tempDir,fileDir)
if not silent: print(" "*indent+"Downloaded"+" "*10)

View File

@@ -2,7 +2,7 @@ import io
import os
from pathlib import Path
from src.errors import FileAlreadyExistsError
from src.errors import FileAlreadyExistsError, TypeInSkip
from src.utils import GLOBAL
VanillaPrint = print
@@ -10,6 +10,9 @@ from src.utils import printToFile as print
class SelfPost:
def __init__(self,directory,post):
if "self" in GLOBAL.arguments.skip: raise TypeInSkip
if not os.path.exists(directory): os.makedirs(directory)
filename = GLOBAL.config['filename'].format(**post)

View File

@@ -36,10 +36,10 @@ class Youtube:
fileHash = createHash(location)
except FileNotFoundError:
return None
if fileHash in GLOBAL.hashList:
if fileHash in GLOBAL.downloadedPosts():
os.remove(location)
raise FileAlreadyExistsError
GLOBAL.hashList.add(fileHash)
GLOBAL.downloadedPosts.add(fileHash)
@staticmethod
def _hook(d):

View File

@@ -99,5 +99,14 @@ class InvalidJSONFile(Exception):
class FailedToDownload(Exception):
    """Signals a failed download (raise sites are not visible here -- confirm)."""
class TypeInSkip(Exception):
    """Raised when a post's type is listed in the --skip option."""
class DomainInSkip(Exception):
    """Raised when a URL contains a domain listed in --skip-domain."""
class ImageNotFound(Exception):
    """Raised when the server does not answer an image page request with 200."""
class ExtensionError(Exception):
    """Raised when a string holds none of the recognised file extensions."""

View File

@@ -201,7 +201,7 @@ def extractDetails(posts,SINGLE_POST=False):
"""
postList = []
postCount = 0
postCount = 1
allPosts = {}
@@ -227,18 +227,17 @@ def extractDetails(posts,SINGLE_POST=False):
except AttributeError:
pass
result = matchWithDownloader(submission)
if not any(domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
result = matchWithDownloader(submission)
if result is not None:
details = {**details, **result}
postList.append(details)
postsFile.add({postCount:details})
if result is not None:
details = {**details, **result}
postList.append(details)
postsFile.add({postCount:details})
else:
try:
for submission in posts:
postCount += 1
if postCount % 100 == 0:
sys.stdout.write("")
@@ -264,13 +263,18 @@ def extractDetails(posts,SINGLE_POST=False):
except AttributeError:
continue
result = matchWithDownloader(submission)
if details['POSTID'] in GLOBAL.downloadedPosts(): continue
if result is not None:
details = {**details, **result}
postList.append(details)
if not any(domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
result = matchWithDownloader(submission)
allPosts[postCount] = details
if result is not None:
details = {**details, **result}
postList.append(details)
allPosts[postCount] = details
postCount += 1
except KeyboardInterrupt:
print("\nKeyboardInterrupt",noPrint=True)
@@ -284,6 +288,11 @@ def extractDetails(posts,SINGLE_POST=False):
def matchWithDownloader(submission):
directLink = extractDirectLink(submission.url)
if directLink:
return {'TYPE': 'direct',
'CONTENTURL': directLink}
if 'v.redd.it' in submission.domain:
bitrates = ["DASH_1080","DASH_720","DASH_600", \
"DASH_480","DASH_360","DASH_240"]
@@ -291,7 +300,7 @@ def matchWithDownloader(submission):
for bitrate in bitrates:
videoURL = submission.url+"/"+bitrate
try:
try:
responseCode = urllib.request.urlopen(videoURL).getcode()
except urllib.error.HTTPError:
responseCode = 0
@@ -327,12 +336,6 @@ def matchWithDownloader(submission):
return {'TYPE': 'self',
'CONTENT': submission.selftext}
try:
return {'TYPE': 'direct',
'CONTENTURL': extractDirectLink(submission.url)}
except DirectLinkNotFound:
return None
def extractDirectLink(URL):
"""Check if link is a direct image link.
If so, return URL,
@@ -346,26 +349,8 @@ def extractDirectLink(URL):
if "i.reddituploads.com" in URL:
return URL
elif "v.redd.it" in URL:
bitrates = ["DASH_1080","DASH_720","DASH_600", \
"DASH_480","DASH_360","DASH_240"]
for bitrate in bitrates:
videoURL = URL+"/"+bitrate
try:
responseCode = urllib.request.urlopen(videoURL).getcode()
except urllib.error.HTTPError:
responseCode = 0
if responseCode == 200:
return videoURL
else:
raise DirectLinkNotFound
for extension in imageTypes:
if extension in URL.split("/")[-1]:
return URL
else:
raise DirectLinkNotFound
return None

View File

@@ -17,8 +17,8 @@ class Store:
# Calling the store returns the list of entries it currently holds.
def __call__(self):
return self.list
def add(self, data):
    """Remember *data* and, when a backing file is set, append it there.

    Args:
        data: Entry to store; it is written to the file as one line using
            its string form.
    """
    self.list.append(data)

    # Without a directory the store is in-memory only.
    if self.directory:
        with open(self.directory, 'a') as f:
            f.write("{data}\n".format(data=data))

View File

@@ -18,10 +18,11 @@ class GLOBAL:
configDirectory = ""
reddit_client_id = "U-6gk4ZCh3IeNQ"
reddit_client_secret = "7CZHY6AmKweZME5s50SfDGylaPg"
hashList = set()
downloadedPosts = lambda: []
printVanilla = print
log_stream= None
def createLogFile(TITLE):
"""Create a log file with given name
inside a folder time stampt in its name and
@@ -63,34 +64,32 @@ def printToFile(*args, noPrint=False,**kwargs):
) as FILE:
print(*args, file=FILE, **kwargs)
def nameCorrector(string,reference=None):
    """Swap strange characters from given string
    with underscore (_) and shorten it.

    Args:
        string: Text to clean up.
        reference: Optional string whose length is counted against the
            limit together with *string* (presumably the path the name
            will be joined to -- confirm with callers).

    Returns:
        The shortened string with spaces and bad characters replaced by
        underscores and newlines removed.
    """
    LIMIT = 247
    # NOTE(review): two entries of this list were empty strings in the
    # reviewed text (likely non-ASCII quote characters lost in transit);
    # an empty string can never equal a single character, so they were
    # no-ops and are left out here.
    BAD_CHARS = ['\\','/',':','*','?','"','<','>','|','#', '.', '@', '\'', '!']

    # A missing reference contributes nothing to the total length.
    referenceLength = len(reference) if reference else 0

    if len(string) + referenceLength > LIMIT:
        # Never let the bound go negative: a negative slice end would cut
        # from the tail instead of shortening to nothing.
        limit = max(LIMIT - referenceLength - 1, 0)
        string = string[:limit]

    # One pass: spaces and bad characters -> "_", newlines dropped.
    table = {ord(char): "_" for char in BAD_CHARS}
    table[ord(" ")] = "_"
    table[ord("\n")] = None

    return string.translate(table)