Move to different program structure

Serene-Arc
2021-02-11 09:10:40 +10:00
committed by Ali Parlakci
parent a72abd6603
commit a7f1db14e5
24 changed files with 504 additions and 2133 deletions

bulkredditdownloader/site_downloaders/base_downloader.py

@@ -1,105 +1,46 @@
 #!/usr/bin/env python3
 # coding=utf-8

-import hashlib
 import logging
 import re
 from abc import ABC, abstractmethod
 from pathlib import Path

 import requests
+from praw.models import Submission

-from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
-from bulkredditdownloader.utils import GLOBAL
+from bulkredditdownloader.errors import FailedToDownload
+from bulkredditdownloader.resource import Resource

 logger = logging.getLogger(__name__)


 class BaseDownloader(ABC):
-    def __init__(self, directory: Path, post: dict):
+    def __init__(self, directory: Path, post: Submission):
         self.directory = directory
         self.post = post
-        self.hashes = []

     @abstractmethod
-    def download(self):
+    def download(self) -> list[Resource]:
         raise NotImplementedError

-    @staticmethod
-    def _create_hash(content: bytes) -> str:
-        hash_md5 = hashlib.md5(content)
-        return hash_md5.hexdigest()
-
-    @staticmethod
-    def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
-        formats = {
-            "videos": [".mp4", ".webm"],
-            "images": [".jpg", ".jpeg", ".png", ".bmp"],
-            "gifs": [".gif"],
-            "self": []
-        }
-        for file_type in GLOBAL.arguments.skip:
-            for extension in formats[file_type]:
-                if extension in filename:
-                    raise TypeInSkip
-        if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
-            raise DomainInSkip
-        headers = [
-            ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
-                           "Safari/537.36 OPR/54.0.2952.64"),
-            ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
-            ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
-            ("Accept-Encoding", "none"),
-            ("Accept-Language", "en-US,en;q=0.8"),
-            ("Connection", "keep-alive")
-        ]
-        folder_dir.mkdir(exist_ok=True)
-        if "imgur" not in image_url:
-            addheaders = headers
-        else:
-            addheaders = None
-        if not silent:
-            logger.info(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")
-        # Loop to attempt download 3 times
-        for i in range(3):
-            file_path = Path(folder_dir) / filename
-            if file_path.is_file():
-                raise FileAlreadyExistsError
-            else:
-                try:
-                    download_content = requests.get(image_url, headers=addheaders).content
-                except ConnectionResetError:
-                    raise FailedToDownload
-                file_hash = BaseDownloader._create_hash(download_content)
-                if GLOBAL.arguments.no_dupes:
-                    if file_hash in GLOBAL.downloadedPosts():
-                        raise FileAlreadyExistsError
-                GLOBAL.downloadedPosts.add(file_hash)
-                with open(file_path, 'wb') as file:
-                    file.write(download_content)
-                if not silent:
-                    logger.info(" " * indent + "Downloaded" + " " * 10)
-                return
-        raise FailedToDownload
+    def _download_resource(self, resource_url: str):
+        headers = {
+            "User-Agent":
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
+                "Safari/537.36 OPR/54.0.2952.64",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
+            "Accept-Encoding": "none",
+            "Accept-Language": "en-US,en;q=0.8",
+            "Connection": "keep-alive",
+        }
+        try:
+            download_content = requests.get(resource_url, headers=headers).content
+        except ConnectionResetError:
+            raise FailedToDownload
+        return Resource(self.post, resource_url, download_content)

     @staticmethod
     def _get_extension(url: str) -> str:
         pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
         if results := re.search(pattern, url):
             if len(results.groups()) > 1:
                 return results[0]
         if "v.redd.it" not in url:
             return '.jpg'
         else:
             return '.mp4'
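
Note: the diff above changes the downloader contract from writing files as a side effect to returning Resource objects. A minimal conforming subclass might look like the sketch below. Only the Resource(post, url, content) call shape and the new _download_resource come from this diff; the SingleDownloader class itself is a hypothetical illustration, not part of the commit.

# Hypothetical illustration of the new contract; not part of this commit.
from pathlib import Path

from praw.models import Submission

from bulkredditdownloader.resource import Resource
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader


class SingleDownloader(BaseDownloader):
    def __init__(self, directory: Path, post: Submission):
        super().__init__(directory, post)

    def download(self) -> list[Resource]:
        # _download_resource now fetches the bytes and wraps them in a
        # Resource instead of writing a file itself
        return [self._download_resource(self.post.url)]

Persisting the returned resources to disk becomes the caller's responsibility, which appears to be the point of the restructure.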

bulkredditdownloader/site_downloaders/direct.py

@@ -2,18 +2,14 @@
 import pathlib

+from praw.models import Submission
+
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.utils import GLOBAL


 class Direct(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
-        self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL'])
-        self.directory.mkdir(exist_ok=True)
-        filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
-        self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL'])
+        return [self._download_resource(self.post.url)]

bulkredditdownloader/site_downloaders/erome.py

@@ -7,77 +7,39 @@ import urllib.error
 import urllib.request
 from html.parser import HTMLParser

+from praw.models import Submission
+
+from bulkredditdownloader.errors import NotADownloadableLinkError
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
-from bulkredditdownloader.utils import GLOBAL

 logger = logging.getLogger(__name__)


 class Erome(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
         try:
-            images = self._get_links(self.post['CONTENTURL'])
+            images = self._get_links(self.post.url)
         except urllib.error.HTTPError:
             raise NotADownloadableLinkError("Not a downloadable link")

-        images_length = len(images)
-        how_many_downloaded = len(images)
-        duplicates = 0
-
-        if images_length == 1:
-            """Filenames are declared here"""
-            filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
+        if len(images) == 1:
             image = images[0]
             if not re.match(r'https?://.*', image):
                 image = "https://" + image
-            self._download_resource(filename, self.directory, image)
+            return [self._download_resource(image)]
         else:
-            filename = GLOBAL.config['filename'].format(**self.post)
-            logger.info(filename)
-            folder_dir = self.directory / filename
-            folder_dir.mkdir(exist_ok=True)
+            out = []
             for i, image in enumerate(images):
-                extension = self._get_extension(image)
-                filename = str(i + 1) + extension
                 if not re.match(r'https?://.*', image):
                     image = "https://" + image
-                logger.info(" ({}/{})".format(i + 1, images_length))
-                logger.info(" {}".format(filename))
-                try:
-                    self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2)
-                except FileAlreadyExistsError:
-                    logger.info(" The file already exists" + " " * 10, end="\n\n")
-                    duplicates += 1
-                    how_many_downloaded -= 1
-                except Exception as exception:
-                    # raise exception
-                    logger.error("\n Could not get the file")
-                    logger.error(
-                        " "
-                        + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))
-                        + "\n"
-                    )
-                    how_many_downloaded -= 1
-            if duplicates == images_length:
-                raise FileAlreadyExistsError
-            elif how_many_downloaded + duplicates < images_length:
-                raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
+                out.append(self._download_resource(image))
+            return out

     @staticmethod
     def _get_links(url: str) -> list[str]:
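
With Erome (and the album downloaders below) now returning a flat list of Resource objects, the old per-file bookkeeping (duplicate counting, AlbumNotDownloadedCompletely) has to move to the caller. A hedged sketch of what a caller-side save loop could look like; the resource.url and resource.content attribute names are assumed from the Resource(post, url, content) constructor calls in this diff, and the file-naming scheme is invented for illustration.

# Caller-side persistence sketch; attribute names are assumptions.
from pathlib import Path

from bulkredditdownloader.resource import Resource


def save_resources(resources: list[Resource], directory: Path) -> None:
    directory.mkdir(exist_ok=True)
    for index, resource in enumerate(resources, start=1):
        # illustrative naming scheme: 1.jpg, 2.mp4, ...
        destination = directory / f"{index}{Path(resource.url).suffix}"
        destination.write_bytes(resource.content)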

bulkredditdownloader/site_downloaders/gallery.py

@@ -1,26 +1,23 @@
 #!/usr/bin/env python3
 import json
-import pathlib
 import logging
 import urllib.parse
+import pathlib

 import requests
+from praw.models import Submission

+from bulkredditdownloader.errors import ImageNotFound, NotADownloadableLinkError
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound,
-                                         NotADownloadableLinkError, TypeInSkip)
-from bulkredditdownloader.utils import GLOBAL

 logger = logging.getLogger(__name__)


 class Gallery(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        link = self.post['CONTENTURL']
+        link = self.post.url
         self.raw_data = self._get_data(link)
-        self.download()

     def download(self):
         images = {}
@@ -37,7 +34,7 @@ class Gallery(BaseDownloader):
             except KeyError:
                 continue
-        self._download_album(images, count)
+        return [self._download_album(images)]

 @staticmethod
 def _get_data(link: str) -> dict:
@@ -63,44 +60,9 @@ class Gallery(BaseDownloader):
         data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
         return data

-    def _download_album(self, images: dict, count: int):
-        folder_name = GLOBAL.config['filename'].format(**self.post)
-        folder_dir = self.directory / folder_name
-        how_many_downloaded = 0
-        duplicates = 0
-        folder_dir.mkdir(exist_ok=True)
-        logger.info(folder_name)
+    def _download_album(self, images: dict):
+        out = []
         for i, image in enumerate(images):
-            path = urllib.parse.urlparse(image['url']).path
-            extension = pathlib.Path(path).suffix
-            filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension)
-            logger.info("\n ({}/{})".format(i + 1, count))
-            try:
-                self._download_resource(filename, folder_dir, image['url'], indent=2)
-                how_many_downloaded += 1
-            except FileAlreadyExistsError:
-                logger.info(" The file already exists" + " " * 10, end="\n\n")
-                duplicates += 1
-            except TypeInSkip:
-                logger.info(" Skipping...")
-                how_many_downloaded += 1
-            except Exception as exception:
-                logger.info("\n Could not get the file")
-                logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
-                    class_name=exception.__class__.__name__, info=str(exception)) + "\n"
-                )
-                logger.info(GLOBAL.log_stream.getvalue(), no_print=True)
-        if duplicates == count:
-            raise FileAlreadyExistsError
-        elif how_many_downloaded + duplicates < count:
-            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
+            out.append(self._download_resource(image['url']))
+        return out

bulkredditdownloader/site_downloaders/gfycat.py

@@ -6,14 +6,14 @@ import re
 import urllib.request

 from bs4 import BeautifulSoup
+from praw.models import Submission

 from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork


 class Gfycat(GifDeliveryNetwork):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
         super().download()

bulkredditdownloader/site_downloaders/gif_delivery_network.py

@@ -4,29 +4,23 @@ import pathlib
 import urllib.request

 from bs4 import BeautifulSoup
+from praw.models import Submission

-from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.errors import NotADownloadableLinkError
-from bulkredditdownloader.utils import GLOBAL
+from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader


 class GifDeliveryNetwork(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
         try:
-            self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL'])
+            media_url = self._get_link(self.post.url)
         except IndexError:
             raise NotADownloadableLinkError("Could not read the page source")

-        self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL'])
-        self.directory.mkdir(exist_ok=True)
-        filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
-        self._download_resource(filename, self.directory, self.post['MEDIAURL'])
+        return [self._download_resource(media_url)]

     @staticmethod
     def _get_link(url: str) -> str:

bulkredditdownloader/site_downloaders/imgur.py

@@ -1,16 +1,15 @@
 #!/usr/bin/env python3
 import json
-import pathlib
 import logging
+import pathlib

 import requests
+from praw.models import Submission

+from bulkredditdownloader.errors import ExtensionError, ImageNotFound, NotADownloadableLinkError
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
 from bulkredditdownloader.site_downloaders.direct import Direct
-from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError,
-                                         ImageNotFound, NotADownloadableLinkError, TypeInSkip)
-from bulkredditdownloader.utils import GLOBAL, nameCorrector

 logger = logging.getLogger(__name__)
@@ -19,85 +18,43 @@ class Imgur(BaseDownloader):
     imgur_image_domain = "https://i.imgur.com/"

-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
         self.raw_data = {}
-        self.download()

     def download(self):
-        link = self.post['CONTENTURL']
+        link = self.post.url
         if link.endswith(".gifv"):
             link = link.replace(".gifv", ".mp4")
-            Direct(self.directory, {**self.post, 'CONTENTURL': link})
-            return
+            direct_thing = Direct(self.directory, self.post)
+            return direct_thing.download()

         self.raw_data = self._get_data(link)

         if self._is_album:
             if self.raw_data["album_images"]["count"] != 1:
-                self._download_album(self.raw_data["album_images"])
+                out = self._download_album(self.raw_data["album_images"])
             else:
-                self._download_image(self.raw_data["album_images"]["images"][0])
+                out = self._download_image(self.raw_data["album_images"]["images"][0])
         else:
-            self._download_image(self.raw_data)
+            out = self._download_image(self.raw_data)
+        return out

     def _download_album(self, images: dict):
-        folder_name = GLOBAL.config['filename'].format(**self.post)
-        folder_dir = self.directory / folder_name
         images_length = images["count"]
-        how_many_downloaded = 0
-        duplicates = 0
-        folder_dir.mkdir(exist_ok=True)
-        logger.info(folder_name)
+        out = []
         for i in range(images_length):
             extension = self._validate_extension(images["images"][i]["ext"])
             image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
-            filename = pathlib.Path("_".join([str(i + 1),
-                                              nameCorrector(images["images"][i]['title']),
-                                              images["images"][i]['hash']]) + extension)
-            logger.info("\n ({}/{})".format(i + 1, images_length))
-            try:
-                self._download_resource(filename, folder_dir, image_url, indent=2)
-                how_many_downloaded += 1
-            except FileAlreadyExistsError:
-                logger.info(" The file already exists" + " " * 10, end="\n\n")
-                duplicates += 1
-            except TypeInSkip:
-                logger.info(" Skipping...")
-                how_many_downloaded += 1
-            except Exception as exception:
-                logger.info("\n Could not get the file")
-                logger.info(
-                    " "
-                    + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
-                        class_name=exception.__class__.__name__,
-                        info=str(exception)
-                    )
-                    + "\n"
-                )
-                logger.info(GLOBAL.log_stream.getvalue(), no_print=True)
-        if duplicates == images_length:
-            raise FileAlreadyExistsError
-        elif how_many_downloaded + duplicates < images_length:
-            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
+            out.append(self._download_resource(image_url))
+        return out

     def _download_image(self, image: dict):
         extension = self._validate_extension(image["ext"])
         image_url = self.imgur_image_domain + image["hash"] + extension
-        filename = GLOBAL.config['filename'].format(**self.post) + extension
-        self._download_resource(filename, self.directory, image_url)
+        return [self._download_resource(image_url)]

     def _is_album(self) -> bool:
         return "album_images" in self.raw_data
@@ -134,9 +91,8 @@ class Imgur(BaseDownloader):
     @staticmethod
     def _validate_extension(extension_suffix: str) -> str:
         possible_extensions = [".jpg", ".png", ".mp4", ".gif"]
         for extension in possible_extensions:
             if extension in extension_suffix:
                 return extension
-        else:
-            raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.")
+        raise ExtensionError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')
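
For reference, the slimmed-down _download_album builds direct i.imgur.com URLs out of the raw album data. An illustrative, self-contained reduction of that logic; the "count"/"images"/"hash"/"ext" field names come from the diff, the values are invented, and the _validate_extension step is skipped for brevity.

# Invented sample payload in the shape _download_album expects.
album_images = {
    "count": 2,
    "images": [
        {"hash": "abc123", "ext": ".jpg"},
        {"hash": "def456", "ext": ".mp4"},
    ],
}

imgur_image_domain = "https://i.imgur.com/"
urls = [imgur_image_domain + image["hash"] + image["ext"]
        for image in album_images["images"]]
# -> ['https://i.imgur.com/abc123.jpg', 'https://i.imgur.com/def456.mp4']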

bulkredditdownloader/site_downloaders/redgifs.py

@@ -5,24 +5,22 @@ import pathlib
 import urllib.request

 from bs4 import BeautifulSoup
+from praw.models import Submission

-from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
 from bulkredditdownloader.errors import NotADownloadableLinkError
+from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork


 class Redgifs(GifDeliveryNetwork):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
         super().download()

     @staticmethod
     def _get_link(url: str) -> str:
-        """Extract direct link to the video from page's source
-        and return it
-        """
+        """Extract direct link to the video from page's source and return it"""
         if '.webm' in url or '.mp4' in url or '.gif' in url:
             return url

bulkredditdownloader/site_downloaders/self_post.py

@@ -1,64 +1,39 @@
 #!/usr/bin/env python3
-import io
 import logging
 import pathlib
-from pathlib import Path

+from praw.models import Submission
+
+from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
-from bulkredditdownloader.utils import GLOBAL

 logger = logging.getLogger(__name__)


 class SelfPost(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
-        if "self" in GLOBAL.arguments.skip:
-            raise TypeInSkip
-
-        self.directory.mkdir(exist_ok=True)
-        filename = GLOBAL.config['filename'].format(**self.post)
-        file_dir = self.directory / (filename + ".md")
-        logger.info(file_dir)
-        logger.info(filename + ".md")
-
-        if Path.is_file(file_dir):
-            raise FileAlreadyExistsError
-
-        try:
-            self._write_to_file(file_dir, self.post)
-        except FileNotFoundError:
-            file_dir = self.post['POSTID'] + ".md"
-            file_dir = self.directory / file_dir
-            self._write_to_file(file_dir, self.post)
+        return Resource(self.post, self.post.url, bytes(self.export_to_string()))

-    @staticmethod
-    def _write_to_file(directory: pathlib.Path, post: dict):
+    def export_to_string(self) -> str:
         """Self posts are formatted here"""
         content = ("## ["
-                   + post["TITLE"]
+                   + self.post.fullname
                    + "]("
-                   + post["CONTENTURL"]
+                   + self.post.url
                    + ")\n"
-                   + post["CONTENT"]
+                   + self.post.selftext
                    + "\n\n---\n\n"
                    + "submitted to [r/"
-                   + post["SUBREDDIT"]
+                   + self.post.subreddit.title
                    + "](https://www.reddit.com/r/"
-                   + post["SUBREDDIT"]
+                   + self.post.subreddit.title
                    + ") by [u/"
-                   + post["REDDITOR"]
+                   + self.post.author.name
                    + "](https://www.reddit.com/user/"
-                   + post["REDDITOR"]
+                   + self.post.author.name
                    + ")")
-        with io.open(directory, "w", encoding="utf-8") as FILE:
-            print(content, file=FILE)
-        logger.info("Downloaded")
+        return content
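
One caveat worth flagging: in Python 3, bytes(self.export_to_string()) raises TypeError, because converting a str to bytes requires an explicit encoding. If that line is exercised as written, something like the following conversion is presumably what is intended.

# Demonstration of the str-to-bytes conversion download() needs.
markdown_text = "## [title](https://example.com/post)\n\nbody text"

payload = markdown_text.encode('utf-8')  # bytes(markdown_text) alone raises TypeError
assert payload.decode('utf-8') == markdown_text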

bulkredditdownloader/site_downloaders/vreddit.py

@@ -4,61 +4,49 @@ import logging
 import os
 import pathlib
 import subprocess
+import tempfile

+import requests
+from praw.models import Submission

+from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.utils import GLOBAL

 logger = logging.getLogger(__name__)


 class VReddit(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
-        extension = ".mp4"
-        self.directory.mkdir(exist_ok=True)
-        filename = GLOBAL.config['filename'].format(**self.post) + extension
         try:
             fnull = open(os.devnull, 'w')
             subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
-        except Exception:
-            self._download_resource(filename, self.directory, self.post['CONTENTURL'])
+        except subprocess.SubprocessError:
             logger.info("FFMPEG library not found, skipping merging video and audio")
+            return self._download_resource(self.post.url)
         else:
-            video_name = self.post['POSTID'] + "_video"
-            video_url = self.post['CONTENTURL']
-            audio_name = self.post['POSTID'] + "_audio"
+            video_url = self.post.url
             audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
-            logger.info(self.directory, filename, sep="\n")
-            self._download_resource(video_name, self.directory, video_url, silent=True)
-            self._download_resource(audio_name, self.directory, audio_url, silent=True)
-            try:
-                self._merge_audio(video_name, audio_name, filename, self.directory)
-            except KeyboardInterrupt:
-                (self.directory / filename).unlink()
-                (self.directory / audio_name).unlink()
-                (self.directory / video_name).unlink()
-                (self.directory / filename).unlink()
+            with tempfile.TemporaryDirectory() as temp_dir:
+                video = requests.get(video_url).content
+                audio = requests.get(audio_url).content
+                with open(temp_dir / 'video', 'wb') as file:
+                    file.write(video)
+                with open(temp_dir / 'audio', 'wb') as file:
+                    file.write(audio)
+                self._merge_audio(temp_dir)
+                with open(temp_dir / 'output.mp4', 'rb') as file:
+                    content = file.read()
+            return Resource(self.post, self.post.url, content)

     @staticmethod
-    def _merge_audio(
-            video: pathlib.Path,
-            audio: pathlib.Path,
-            filename: pathlib.Path,
-            directory: pathlib.Path):
-        input_video = str(directory / video)
-        input_audio = str(directory / audio)
+    def _merge_audio(working_directory: pathlib.Path):
+        input_video = working_directory / 'video'
+        input_audio = working_directory / 'audio'
         fnull = open(os.devnull, 'w')
         cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format(
-            input_audio, input_video, str(directory / filename))
+            input_audio, input_video, str(working_directory / 'output.mp4'))
         subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT)
-        (directory / video).unlink()
-        (directory / audio).unlink()
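
The reworked VReddit flow buffers the DASH video and audio streams in a temporary directory and shells out to ffmpeg to merge them. One wrinkle: tempfile.TemporaryDirectory() yields a str, so the temp_dir / 'video' joins above presuppose a Path. A standalone sketch of the same flow with that wrapping made explicit; it assumes ffmpeg is available on PATH.

# Hedged sketch of the DASH merge flow; assumes ffmpeg is on PATH.
import pathlib
import subprocess
import tempfile

import requests


def merge_dash(video_url: str) -> bytes:
    # v.redd.it serves the audio track alongside the video
    audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
    with tempfile.TemporaryDirectory() as raw_dir:
        temp_dir = pathlib.Path(raw_dir)  # TemporaryDirectory() returns a str
        (temp_dir / 'video').write_bytes(requests.get(video_url).content)
        (temp_dir / 'audio').write_bytes(requests.get(audio_url).content)
        subprocess.call(
            ['ffmpeg', '-i', str(temp_dir / 'audio'), '-i', str(temp_dir / 'video'),
             '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
             str(temp_dir / 'output.mp4')],
            stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        return (temp_dir / 'output.mp4').read_bytes()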

bulkredditdownloader/site_downloaders/youtube.py

@@ -1,64 +1,37 @@
 #!/usr/bin/env python3
 import logging
-import os
 import pathlib
-import sys
+import tempfile

 import youtube_dl
+from praw.models import Submission

+from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
-from bulkredditdownloader.errors import FileAlreadyExistsError
-from bulkredditdownloader.utils import GLOBAL

 logger = logging.getLogger(__name__)


 class Youtube(BaseDownloader):
-    def __init__(self, directory: pathlib.Path, post: dict):
+    def __init__(self, directory: pathlib.Path, post: Submission):
         super().__init__(directory, post)
-        self.download()

     def download(self):
-        self.directory.mkdir(exist_ok=True)
-        filename = GLOBAL.config['filename'].format(**self.post)
-        logger.info(filename)
-        self._download_video(filename, self.directory, self.post['CONTENTURL'])
+        return self._download_video()

-    def _download_video(self, filename: str, directory: pathlib.Path, url: str):
-        ydl_opts = {
-            "format": "best",
-            "outtmpl": str(directory / (filename + ".%(ext)s")),
-            "progress_hooks": [self._hook],
-            "playlistend": 1,
-            "nooverwrites": True,
-            "quiet": True
-        }
-        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
-        location = directory / (filename + ".mp4")
-        with open(location, 'rb') as file:
-            content = file.read()
-        if GLOBAL.arguments.no_dupes:
-            try:
-                file_hash = self._create_hash(content)
-            except FileNotFoundError:
-                return None
-            if file_hash in GLOBAL.downloadedPosts():
-                os.remove(location)
-                raise FileAlreadyExistsError
-            GLOBAL.downloadedPosts.add(file_hash)
-
-    @staticmethod
-    def _hook(d):
-        if d['status'] == 'finished':
-            return logger.info("Downloaded")
-        downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6)))
-        file_size = int(d['total_bytes'] * (10**(-6)))
-        sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size))
-        sys.stdout.flush()
+    def _download_video(self) -> Resource:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            ydl_opts = {
+                "format": "best",
+                "outtmpl": str(temp_dir / "test.%(ext)s"),
+                "playlistend": 1,
+                "nooverwrites": True,
+                "quiet": True
+            }
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([self.post.url])
+            with open(temp_dir / 'test.mp4', 'rb') as file:
+                content = file.read()
+        return Resource(self.post, self.post.url, content)
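
The same str-versus-Path caveat applies to the Youtube flow above (temp_dir / "test.%(ext)s" needs a Path), and the read-back of test.mp4 relies on the selected "best" format actually materialising as an mp4, as the diff assumes. A self-contained sketch with the Path wrapping made explicit:

# Hedged sketch of the youtube_dl temp-directory flow.
import pathlib
import tempfile

import youtube_dl


def fetch_video_bytes(url: str) -> bytes:
    with tempfile.TemporaryDirectory() as raw_dir:
        temp_dir = pathlib.Path(raw_dir)  # TemporaryDirectory() returns a str
        ydl_opts = {
            "format": "best",
            "outtmpl": str(temp_dir / "test.%(ext)s"),
            "playlistend": 1,
            "nooverwrites": True,
            "quiet": True,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        # assumes the "best" format downloaded as an mp4
        return (temp_dir / "test.mp4").read_bytes()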