BDFR_Web/bulkredditdownloader/downloaders/Erome.py

import os
import pathlib
import urllib.error
import urllib.request
from html.parser import HTMLParser

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


class Erome:
    def __init__(self, directory: pathlib.Path, post: dict):
        try:
            images = self.getLinks(post['CONTENTURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        images_length = len(images)
        how_many_downloaded = images_length
        duplicates = 0

        if images_length == 1:
            extension = getExtension(images[0])

            # Filenames are declared here
            filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
            short_filename = post['POSTID'] + extension

            image_url = images[0]
            if 'https://' not in image_url and 'http://' not in image_url:
                image_url = "https://" + image_url

            getFile(filename, short_filename, directory, image_url)

        else:
            filename = GLOBAL.config['filename'].format(**post)
            print(filename)

            folder_dir = directory / filename

            try:
                if not os.path.exists(folder_dir):
                    os.makedirs(folder_dir)
            except FileNotFoundError:
                # Fall back to the post ID when the formatted name is not a valid path
                folder_dir = directory / post['POSTID']
                os.makedirs(folder_dir)

            for i in range(images_length):
                extension = getExtension(images[i])

                filename = str(i + 1) + extension
                image_url = images[i]
                if 'https://' not in image_url and 'http://' not in image_url:
                    image_url = "https://" + image_url

                print("  ({}/{})".format(i + 1, images_length))
                print("  {}".format(filename))

                try:
                    getFile(filename, filename, folder_dir, image_url, indent=2)
                    print()
                except FileAlreadyExistsError:
                    print("  The file already exists" + " " * 10, end="\n\n")
                    duplicates += 1
                    how_many_downloaded -= 1
                except Exception as exception:
                    # raise exception
                    print("\n  Could not get the file")
                    print(
                        "  "
                        + "{class_name}: {info}".format(class_name=exception.__class__.__name__,
                                                        info=str(exception))
                        + "\n"
                    )
                    how_many_downloaded -= 1

            if duplicates == images_length:
                raise FileAlreadyExistsError
            elif how_many_downloaded + duplicates < images_length:
                raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")

    def getLinks(self, url: str) -> list[str]:
        content = []
        line_number = None

        class EromeParser(HTMLParser):
            tag = None

            def handle_starttag(self, tag, attrs):
                self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}

        page_source = urllib.request.urlopen(url).read().decode().split('\n')

        # Find where the album starts in order not to pick up unrelated links
        for i in range(len(page_source)):
            obj = EromeParser()
            obj.feed(page_source[i])
            tag = obj.tag

            if tag is not None and "div" in tag:
                if tag["div"].get("id") == "album":
                    line_number = i
                    break

        # If no album marker was found, line_number stays None and the slice
        # below scans the page from the beginning.
        for line in page_source[line_number:]:
            obj = EromeParser()
            obj.feed(line)
            tag = obj.tag

            if tag is not None:
                if "img" in tag and tag["img"].get("class") == "img-front":
                    content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])

        # Keep image links as they are; for videos, keep only the 480p rendition
        return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
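

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical example of how this downloader might be invoked
# directly. It assumes the main program has already initialised GLOBAL
# (only the 'filename' format string is set here) and that the post dict
# carries the keys read above: CONTENTURL, POSTID and EXTENSION. The album
# URL and the values below are placeholders, not real data.
if __name__ == "__main__":
    GLOBAL.config = {'filename': '{POSTID}'}

    example_post = {
        'CONTENTURL': 'https://www.erome.com/a/example',  # hypothetical album URL
        'POSTID': 'abc123',
        'EXTENSION': '.jpg',
    }

    try:
        Erome(pathlib.Path('downloads'), example_post)
    except NotADownloadableLinkError as error:
        print(error)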