v1.9.0 (#114)

* IMGUR API is no longer used * --skip now accepts file types instead of domains * --skip-domain added * --no-download added * --no-dupe now supports YouTube * Duplicates of older posts will not be downloaded if --no-dupe and --downloaded-posts options are given together * Invalid characters in macOS and Linux platforms are removed from filenames * Bug fixes
2020-06-03 18:10:25 +03:00
parent 540e1e8a6e
commit af1f9fd365
14 changed files with 277 additions and 298 deletions
--- a/src/arguments.py
+++ b/src/arguments.py
@@ -102,6 +102,13 @@ class Arguments:
                            type=str)

        parser.add_argument("--skip",
+                            nargs="+",
+                            help="Skip posts with given type",
+                            type=str,
+                            choices=["images","videos","gifs","self"],
+                            default=[])   
+
+        parser.add_argument("--skip-domain",
                            nargs="+",
                            help="Skip posts with given domain",
                            type=str,
@@ -140,7 +147,13 @@ class Arguments:
        parser.add_argument("--downloaded-posts",
                            help="Use a hash file to keep track of downloaded files",
                            type=str
-                            ) 
+                            )
+
+        parser.add_argument("--no-download",
+                            action="store_true",
+                            help="Just saved posts into a the POSTS.json file without downloading"
+                            )
+   

        if arguments == []:
            return parser.parse_args()
--- a/src/config.py
+++ b/src/config.py
@@ -5,6 +5,7 @@ import random

 from src.reddit import Reddit
 from src.jsonHelper import JsonFile
+from src.utils import nameCorrector

 class Config():

@@ -36,7 +37,7 @@ For example: {FLAIR}_{SUBREDDIT}_{REDDITOR}

 Existing filename template:""", None if "filename" not in self.file.read() else self.file.read()["filename"])

-        filename = input(">> ").upper()
+        filename = nameCorrector(input(">> ").upper())
        self.file.add({
            "filename": filename
        })
@@ -68,7 +69,7 @@ For example: {REDDITOR}/{SUBREDDIT}/{FLAIR}

 Existing folder structure""", None if "folderpath" not in self.file.read() else self.file.read()["folderpath"])

-        folderpath = input(">> ").strip("\\").strip("/").upper()
+        folderpath = nameCorrector(input(">> ").strip("\\").strip("/").upper())

        self.file.add({
            "folderpath": folderpath
@@ -105,8 +106,6 @@ Existing default options:""", None if "options" not in self.file.read() else sel
    def _validateCredentials(self):
        """Read credentials from config.json file"""

-        keys = ['imgur_client_id',
-                'imgur_client_secret']
        try:
            content = self.file.read()["credentials"]
        except:
@@ -119,25 +118,7 @@ Existing default options:""", None if "options" not in self.file.read() else sel
            pass
        else:
            Reddit().begin()
-
-        if not all(content.get(key,False) for key in keys):
-            print(
-                "---Setting up the Imgur API---\n\n" \
-                "Go to this URL and fill the form:\n" \
-                "https://api.imgur.com/oauth2/addclient\n" \
-                "Then, enter the client id and client secret here\n" \
-                "Press Enter to open the link in the browser"
-            )
-            input()
-            webbrowser.open("https://api.imgur.com/oauth2/addclient",new=2)
-
-        for key in keys:
-            try:
-                if content[key] == "":
-                    raise KeyError
-            except KeyError:
-                self.file.add({key:input("\t"+key+": ")},
-                              "credentials")
+            
        print()

    def setDefaultDirectory(self):
--- a/src/downloaders/Imgur.py
+++ b/src/downloaders/Imgur.py
@@ -1,137 +1,142 @@
+import urllib
+import json
 import os
+import time
+import requests

-import imgurpython
-
-from src.downloaders.downloaderUtils import getExtension, getFile
-from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError,
-                        FileNameTooLong)
 from src.utils import GLOBAL, nameCorrector
 from src.utils import printToFile as print
-
+from src.downloaders.Direct import Direct
+from src.downloaders.downloaderUtils import getFile
+from src.errors import FileNotFoundError, FileAlreadyExistsError, AlbumNotDownloadedCompletely, ImageNotFound, ExtensionError, NotADownloadableLinkError, TypeInSkip

 class Imgur:
-    def __init__(self,directory,post):
-        self.imgurClient = self.initImgur()

-        imgurID = self.getId(post['CONTENTURL'])
-        content = self.getLink(imgurID)
+    IMGUR_IMAGE_DOMAIN = "https://i.imgur.com/"

-        if not os.path.exists(directory): os.makedirs(directory)
+    def __init__(self,directory, post):

-        if content['type'] == 'image':
+        link = post['CONTENTURL']

-            try:
-                post['MEDIAURL'] = content['object'].mp4
-            except AttributeError:
-                post['MEDIAURL'] = content['object'].link
+        if link.endswith(".gifv"):
+            link = link.replace(".gifv",".mp4")
+            Direct(directory, {**post, 'CONTENTURL': link})
+            return None

-            post['EXTENSION'] = getExtension(post['MEDIAURL'])
+        self.rawData = self.getData(link)

-            filename = GLOBAL.config['filename'].format(**post)+post["EXTENSION"]
-            shortFilename = post['POSTID']+post['EXTENSION']
-            
-            getFile(filename,shortFilename,directory,post['MEDIAURL'])
-
-        elif content['type'] == 'album':
-            images = content['object'].images
-            imagesLenght = len(images)
-            howManyDownloaded = imagesLenght
-            duplicates = 0
-
-            filename = GLOBAL.config['filename'].format(**post)
-
-            print(filename)
-
-            folderDir = directory / filename
-
-            try:
-                if not os.path.exists(folderDir):
-                    os.makedirs(folderDir)
-            except FileNotFoundError:
-                folderDir = directory / post['POSTID']
-                os.makedirs(folderDir)
-
-            for i in range(imagesLenght):
-                try:
-                    imageURL = images[i]['mp4']
-                except KeyError:
-                    imageURL = images[i]['link']
-
-                images[i]['Ext'] = getExtension(imageURL)
-
-                filename = (str(i+1)
-                            + "_"
-                            + nameCorrector(str(images[i]['title']))
-                            + "_"
-                            + images[i]['id'])
-
-                shortFilename = (str(i+1) + "_" + images[i]['id'])
-
-                print("\n  ({}/{})".format(i+1,imagesLenght))
-
-                try:
-                    getFile(filename,shortFilename,folderDir,imageURL,indent=2)
-                    print()
-                except FileAlreadyExistsError:
-                    print("  The file already exists" + " "*10,end="\n\n")
-                    duplicates += 1
-                    howManyDownloaded -= 1
-
-                except Exception as exception:
-                    print("\n  Could not get the file")
-                    print(
-                        "  "
-                        + "{class_name}: {info}".format(
-                            class_name=exception.__class__.__name__,
-                            info=str(exception)
-                        )
-                        + "\n"
-                    )
-                    howManyDownloaded -= 1
-
-            if duplicates == imagesLenght:
-                raise FileAlreadyExistsError
-            elif howManyDownloaded + duplicates < imagesLenght:
-                raise AlbumNotDownloadedCompletely(
-                    "Album Not Downloaded Completely"
-                )
-    
-    @staticmethod
-    def initImgur():
-        """Initialize imgur api"""
-
-        config = GLOBAL.config
-        return imgurpython.ImgurClient(
-            config["credentials"]['imgur_client_id'],
-            config["credentials"]['imgur_client_secret']
-        )
-    def getId(self,submissionURL):
-        """Extract imgur post id
-        and determine if its a single image or album
-        """
-
-        if submissionURL[-1] == "/":
-            submissionURL = submissionURL[:-1]
-
-        if "a/" in submissionURL or "gallery/" in submissionURL:
-            albumId = submissionURL.split("/")[-1]
-            return {'id':albumId, 'type':'album'}
+        self.directory = directory
+        self.post = post

+        if self.isAlbum:
+            if self.rawData["album_images"]["count"] != 1:
+                self.downloadAlbum(self.rawData["album_images"])
+            else:
+                self.download(self.rawData["album_images"]["images"][0])
        else:
-            url = submissionURL.replace('.','/').split('/')
-            imageId = url[url.index('com')+1]
-            return {'id':imageId, 'type':'image'}
+            self.download(self.rawData)

-    def getLink(self,identity):
-        """Request imgur object from imgur api
-        """
+    def downloadAlbum(self, images):
+        folderName = GLOBAL.config['filename'].format(**self.post)
+        folderDir = self.directory / folderName
+
+        imagesLenght = images["count"]
+        howManyDownloaded = 0
+        duplicates = 0
+
+        try:
+            if not os.path.exists(folderDir):
+                os.makedirs(folderDir)
+        except FileNotFoundError:
+            folderDir = self.directory / self.post['POSTID']
+            os.makedirs(folderDir)
+
+        print(folderName)
+
+        for i in range(imagesLenght):
+
+            extension = self.validateExtension(images["images"][i]["ext"])
+
+            imageURL = self.IMGUR_IMAGE_DOMAIN + images["images"][i]["hash"] + extension
+
+            filename = "_".join([
+                str(i+1), nameCorrector(images["images"][i]['title']), images["images"][i]['hash']
+            ]) + extension
+            shortFilename = str(i+1) + "_" + images["images"][i]['hash']
+
+            print("\n  ({}/{})".format(i+1,imagesLenght))
+
+            try:
+                getFile(filename,shortFilename,folderDir,imageURL,indent=2)
+                howManyDownloaded += 1
+                print()
+
+            except FileAlreadyExistsError:
+                print("  The file already exists" + " "*10,end="\n\n")
+                duplicates += 1
+
+            except TypeInSkip:
+                print("  Skipping...")
+                howManyDownloaded += 1
+
+            except Exception as exception:
+                print("\n  Could not get the file")
+                print(
+                    "  "
+                    + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
+                        class_name=exception.__class__.__name__,
+                        info=str(exception)
+                    )
+                    + "\n"
+                )
+                print(GLOBAL.log_stream.getvalue(),noPrint=True)
+
+        if duplicates == imagesLenght:
+            raise FileAlreadyExistsError
+        elif howManyDownloaded + duplicates < imagesLenght:
+            raise AlbumNotDownloadedCompletely(
+                "Album Not Downloaded Completely"
+            )           
+
+    def download(self, image):        
+        extension = self.validateExtension(image["ext"])
+        imageURL = self.IMGUR_IMAGE_DOMAIN + image["hash"] + extension
+
+        filename = GLOBAL.config['filename'].format(**self.post) + extension
+        shortFilename = self.post['POSTID']+extension
+        
+        getFile(filename,shortFilename,self.directory,imageURL)
+
+    @property
+    def isAlbum(self):
+        return "album_images" in self.rawData
+
+    @staticmethod 
+    def getData(link):
+        
+        cookies = {"over18": "1"}
+        res = requests.get(link, cookies=cookies)
+        if res.status_code != 200: raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
+        pageSource = requests.get(link, cookies=cookies).text
+
+        STARTING_STRING = "image               : "
+        ENDING_STRING = "group               :"
+
+        STARTING_STRING_LENGHT = len(STARTING_STRING)
+        try:
+            startIndex = pageSource.index(STARTING_STRING) + STARTING_STRING_LENGHT
+            endIndex = pageSource.index(ENDING_STRING)
+        except ValueError:
+            raise NotADownloadableLinkError(f"Could not read the page source on {link}")
+
+        data = pageSource[startIndex:endIndex].strip()[:-1]
+
+        return json.loads(data)

-        if identity['type'] == 'image':
-            return {'object':self.imgurClient.get_image(identity['id']),
-                    'type':'image'}
-        elif identity['type'] == 'album':
-            return {'object':self.imgurClient.get_album(identity['id']),
-                    'type':'album'}
    @staticmethod
-    def get_credits():
-        return Imgur.initImgur().get_credits()
+    def validateExtension(string):
+        POSSIBLE_EXTENSIONS = [".jpg", ".png", ".mp4", ".gif"]
+
+        for extension in POSSIBLE_EXTENSIONS:
+            if extension in string: return extension
+        else: raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.")
--- a/src/downloaders/downloaderUtils.py
+++ b/src/downloaders/downloaderUtils.py
@@ -8,7 +8,7 @@ import hashlib

 from src.utils import nameCorrector, GLOBAL
 from src.utils import printToFile as print
-from src.errors import FileAlreadyExistsError, FileNameTooLong, FailedToDownload, DomainInSkip
+from src.errors import FileAlreadyExistsError, FileNameTooLong, FailedToDownload, TypeInSkip, DomainInSkip

 def dlProgress(count, blockSize, totalSize):
    """Function for writing download progress to console
@@ -37,7 +37,18 @@ def getExtension(link):

 def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):

-    if any(domain in imageURL for domain in GLOBAL.arguments.skip):
+    FORMATS = {
+        "videos": [".mp4", ".webm"],
+        "images": [".jpg",".jpeg",".png",".bmp"],
+        "gifs": [".gif"]
+    }
+
+    for type in GLOBAL.arguments.skip:
+        for extension in FORMATS[type]:
+            if extension in filename:
+                raise TypeInSkip
+
+    if any(domain in imageURL for domain in GLOBAL.arguments.skip_domain):
        raise DomainInSkip

    headers = [
@@ -52,13 +63,13 @@ def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):
        ("Connection", "keep-alive")
    ]

+    if not os.path.exists(folderDir): os.makedirs(folderDir)
+
    opener = urllib.request.build_opener()
    if not "imgur" in imageURL:
        opener.addheaders = headers
    urllib.request.install_opener(opener)

-    filename = nameCorrector(filename)
-
    if not silent: print(" "*indent + str(folderDir),
                         " "*indent + str(filename),
                         sep="\n")
@@ -74,12 +85,12 @@ def getFile(filename,shortFilename,folderDir,imageURL,indent=0, silent=False):
                                           tempDir,
                                           reporthook=dlProgress)

+                fileHash = createHash(tempDir)
                if GLOBAL.arguments.no_dupes:
-                    fileHash = createHash(tempDir)
-                    if fileHash in GLOBAL.hashList:
+                    if fileHash in GLOBAL.downloadedPosts():
                        os.remove(tempDir)
                        raise FileAlreadyExistsError
-                    GLOBAL.hashList.add(fileHash)
+                GLOBAL.downloadedPosts.add(fileHash)

                os.rename(tempDir,fileDir)
                if not silent: print(" "*indent+"Downloaded"+" "*10)
--- a/src/downloaders/selfPost.py
+++ b/src/downloaders/selfPost.py
@@ -2,7 +2,7 @@ import io
 import os
 from pathlib import Path

-from src.errors import FileAlreadyExistsError
+from src.errors import FileAlreadyExistsError, TypeInSkip
 from src.utils import GLOBAL

 VanillaPrint = print
@@ -10,6 +10,9 @@ from src.utils import printToFile as print

 class SelfPost:
    def __init__(self,directory,post):
+
+        if "self" in GLOBAL.arguments.skip: raise TypeInSkip
+
        if not os.path.exists(directory): os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post)
--- a/src/downloaders/youtube.py
+++ b/src/downloaders/youtube.py
@@ -36,10 +36,10 @@ class Youtube:
                fileHash = createHash(location)
            except FileNotFoundError:
                return None
-            if fileHash in GLOBAL.hashList:
+            if fileHash in GLOBAL.downloadedPosts():
                os.remove(location)
                raise FileAlreadyExistsError
-            GLOBAL.hashList.add(fileHash)
+            GLOBAL.downloadedPosts.add(fileHash)
        
    @staticmethod
    def _hook(d):
--- a/src/errors.py
+++ b/src/errors.py
@@ -99,5 +99,14 @@ class InvalidJSONFile(Exception):
 class FailedToDownload(Exception):
    pass

+class TypeInSkip(Exception):
+    pass
+
 class DomainInSkip(Exception):
+    pass
+
+class ImageNotFound(Exception):
+    pass
+
+class ExtensionError(Exception):
    pass
--- a/src/searcher.py
+++ b/src/searcher.py
@@ -201,7 +201,7 @@ def extractDetails(posts,SINGLE_POST=False):
    """

    postList = []
-    postCount = 0
+    postCount = 1

    allPosts = {}

@@ -227,18 +227,17 @@ def extractDetails(posts,SINGLE_POST=False):
        except AttributeError:
            pass

-        result = matchWithDownloader(submission)
+        if not any(domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
+            result = matchWithDownloader(submission)

-        if result is not None:
-            details = {**details, **result}
-            postList.append(details)
-
-        postsFile.add({postCount:details})
+            if result is not None:
+                details = {**details, **result}
+                postList.append(details)
+                postsFile.add({postCount:details})

    else:
        try:
            for submission in posts:
-                postCount += 1

                if postCount % 100 == 0:
                    sys.stdout.write("• ")
@@ -264,13 +263,18 @@ def extractDetails(posts,SINGLE_POST=False):
                except AttributeError:
                    continue

-                result = matchWithDownloader(submission)
+                if details['POSTID'] in GLOBAL.downloadedPosts(): continue

-                if result is not None:
-                    details = {**details, **result}
-                    postList.append(details)
+                if not any(domain in submission.domain for domain in GLOBAL.arguments.skip_domain):
+                    result = matchWithDownloader(submission)

-                allPosts[postCount] = details
+                    if result is not None:
+                        details = {**details, **result}
+                        postList.append(details)
+                    
+                    allPosts[postCount] = details
+                    postCount += 1
+                
        except KeyboardInterrupt:
            print("\nKeyboardInterrupt",noPrint=True)
        
@@ -284,6 +288,11 @@ def extractDetails(posts,SINGLE_POST=False):

 def matchWithDownloader(submission):

+    directLink = extractDirectLink(submission.url)
+    if directLink:
+         return {'TYPE': 'direct',
+                 'CONTENTURL': directLink}
+
    if 'v.redd.it' in submission.domain:
        bitrates = ["DASH_1080","DASH_720","DASH_600", \
                    "DASH_480","DASH_360","DASH_240"]
@@ -291,7 +300,7 @@ def matchWithDownloader(submission):
        for bitrate in bitrates:
            videoURL = submission.url+"/"+bitrate

-            try:
+            try:    
                responseCode = urllib.request.urlopen(videoURL).getcode()
            except urllib.error.HTTPError:
                responseCode = 0
@@ -327,12 +336,6 @@ def matchWithDownloader(submission):
        return {'TYPE': 'self',
                'CONTENT': submission.selftext}

-    try:
-        return {'TYPE': 'direct',
-                'CONTENTURL': extractDirectLink(submission.url)}
-    except DirectLinkNotFound:
-        return None        
-
 def extractDirectLink(URL):
    """Check if link is a direct image link.
    If so, return URL,
@@ -346,26 +349,8 @@ def extractDirectLink(URL):
    if "i.reddituploads.com" in URL:
        return URL

-    elif "v.redd.it" in URL:
-        bitrates = ["DASH_1080","DASH_720","DASH_600", \
-                    "DASH_480","DASH_360","DASH_240"]
-                    
-        for bitrate in bitrates:
-            videoURL = URL+"/"+bitrate
-
-            try:
-                responseCode = urllib.request.urlopen(videoURL).getcode()
-            except urllib.error.HTTPError:
-                responseCode = 0
-
-            if responseCode == 200:
-                return videoURL
-
-        else:
-            raise DirectLinkNotFound
-
    for extension in imageTypes:
        if extension in URL.split("/")[-1]:
            return URL
    else:
-        raise DirectLinkNotFound
+        return  None
--- a/src/store.py
+++ b/src/store.py
@@ -17,8 +17,8 @@ class Store:
    def __call__(self):
        return self.list

-    def add(self, filehash):
-        self.list.append(filehash)
+    def add(self, data):
+        self.list.append(data)
        if self.directory:
            with open(self.directory, 'a') as f:
-                f.write("{filehash}\n".format(filehash=filehash))
+                f.write("{data}\n".format(data=data))
--- a/src/utils.py
+++ b/src/utils.py
@@ -18,10 +18,11 @@ class GLOBAL:
    configDirectory = ""
    reddit_client_id = "U-6gk4ZCh3IeNQ"
    reddit_client_secret = "7CZHY6AmKweZME5s50SfDGylaPg"
-    hashList = set()
    downloadedPosts = lambda: []
    printVanilla = print

+    log_stream= None
+
 def createLogFile(TITLE):
    """Create a log file with given name
    inside a folder time stampt in its name and
@@ -63,34 +64,32 @@ def printToFile(*args, noPrint=False,**kwargs):
        ) as FILE:
            print(*args, file=FILE, **kwargs) 

-def nameCorrector(string):
+def nameCorrector(string,reference=None):
    """Swap strange characters from given string 
    with underscore (_) and shorten it.
    Return the string
    """

-    stringLenght = len(string)
-    if stringLenght > 200:
-        string = string[:200]
-    stringLenght = len(string)
-    spacesRemoved = []
+    LIMIT = 247

-    for b in range(stringLenght):
-        if string[b] == " ":
-            spacesRemoved.append("_")
-        else:
-            spacesRemoved.append(string[b])
-    
-    string = ''.join(spacesRemoved)
+    stringLength = len(string)
+
+    if reference:
+        referenceLenght = len(reference)
+        totalLenght = referenceLenght
+    else:
+        totalLenght = stringLength
+
+    if totalLenght > LIMIT:
+        limit = LIMIT - referenceLenght
+        string = string[:limit-1]
+
+    string = string.replace(" ", "_")
    
    if len(string.split('\n')) > 1:
        string = "".join(string.split('\n'))
    
-    BAD_CHARS = ['\\','/',':','*','?','"','<','>','|','#']
-    
-    if any(x in string for x in BAD_CHARS):
-        for char in string:
-            if char in BAD_CHARS:
-                string = string.replace(char,"_")
+    BAD_CHARS = ['\\','/',':','*','?','"','<','>','|','#', '.', '@' ,'“', '’', '\'', '!']
+    string = "".join([i if i not in BAD_CHARS else "_" for i in string])

    return string