Rename folder

Serene-Arc
2021-02-07 17:08:24 +10:00
committed by Ali Parlakci
parent 5ef58f147f
commit f2415b6bd0
14 changed files with 22 additions and 22 deletions

bulkredditdownloader/site_downloaders/base_downloader.py
View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# coding=utf-8
import hashlib
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path
import requests
from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class BaseDownloader(ABC):
def __init__(self, directory: Path, post: dict):
self.directory = directory
self.post = post
@abstractmethod
def download(self):
raise NotImplementedError
@staticmethod
def _create_hash(content: bytes) -> str:
hash_md5 = hashlib.md5(content)
return hash_md5.hexdigest()
@staticmethod
def _download_resource(filename: Path, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
formats = {
"videos": [".mp4", ".webm"],
"images": [".jpg", ".jpeg", ".png", ".bmp"],
"gifs": [".gif"],
"self": []
}
for file_type in GLOBAL.arguments.skip:
for extension in formats[file_type]:
                if extension in str(filename):
                    raise TypeInSkip
if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
raise DomainInSkip
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                          "Safari/537.36 OPR/54.0.2952.64",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Accept-Encoding": "none",
            "Accept-Language": "en-US,en;q=0.8",
            "Connection": "keep-alive",
        }
folder_dir.mkdir(exist_ok=True)
if "imgur" not in image_url:
addheaders = headers
else:
addheaders = None
        if not silent:
            logger.info(" " * indent + str(folder_dir))
            logger.info(" " * indent + str(filename))
# Loop to attempt download 3 times
for i in range(3):
            file_path = Path(folder_dir) / filename
            if file_path.is_file():
                raise FileAlreadyExistsError
            try:
                download_content = requests.get(image_url, headers=addheaders).content
            except (ConnectionResetError, requests.exceptions.ConnectionError):
                # Transient network failure; let the loop retry, FailedToDownload is raised after it
                continue
file_hash = BaseDownloader._create_hash(download_content)
if GLOBAL.arguments.no_dupes:
if file_hash in GLOBAL.downloadedPosts():
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(file_hash)
with open(file_path, 'wb') as file:
file.write(download_content)
if not silent:
logger.info(" " * indent + "Downloaded" + " " * 10)
return
raise FailedToDownload
@staticmethod
def _get_extension(url: str) -> str:
pattern = re.compile(r'(\.(jpg|jpeg|png|mp4|webm|gif))')
        if results := re.search(pattern, url):
            return results[0]
        # No recognisable extension in the URL; guess from the host
if "v.redd.it" not in url:
return '.jpg'
else:
return '.mp4'
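
For orientation, the extension fallbacks above behave like this (a hedged sketch; the sample URLs are made up):

# Hedged illustration of _get_extension's fallbacks (sample URLs are made up)
assert BaseDownloader._get_extension("https://i.imgur.com/abc.png") == ".png"   # regex hit
assert BaseDownloader._get_extension("https://example.com/page") == ".jpg"      # generic fallback
assert BaseDownloader._get_extension("https://v.redd.it/xyz123") == ".mp4"      # v.redd.it fallback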

bulkredditdownloader/site_downloaders/direct.py
View File

@@ -0,0 +1,19 @@
#!/usr/bin/env python3
import pathlib
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.utils import GLOBAL
class Direct(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
self.post['EXTENSION'] = self._get_extension(self.post['CONTENTURL'])
self.directory.mkdir(exist_ok=True)
filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
self._download_resource(pathlib.Path(filename), self.directory, self.post['CONTENTURL'])

bulkredditdownloader/site_downloaders/erome.py
View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import logging
import pathlib
import re
import urllib.error
import urllib.request
from html.parser import HTMLParser
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class Erome(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
try:
images = self._get_links(self.post['CONTENTURL'])
except urllib.error.HTTPError:
raise NotADownloadableLinkError("Not a downloadable link")
images_length = len(images)
how_many_downloaded = len(images)
duplicates = 0
if images_length == 1:
"""Filenames are declared here"""
filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
image = images[0]
if not re.match(r'https?://.*', image):
image = "https://" + image
            self._download_resource(pathlib.Path(filename), self.directory, image)
else:
filename = GLOBAL.config['filename'].format(**self.post)
logger.info(filename)
folder_dir = self.directory / filename
folder_dir.mkdir(exist_ok=True)
for i, image in enumerate(images):
extension = self._get_extension(image)
filename = str(i + 1) + extension
if not re.match(r'https?://.*', image):
image = "https://" + image
logger.info(" ({}/{})".format(i + 1, images_length))
logger.info(" {}".format(filename))
try:
self._download_resource(pathlib.Path(filename), folder_dir, image, indent=2)
except FileAlreadyExistsError:
logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
how_many_downloaded -= 1
except Exception as exception:
logger.error("\n Could not get the file")
logger.error(
" "
+ "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))
+ "\n"
)
how_many_downloaded -= 1
if duplicates == images_length:
raise FileAlreadyExistsError
elif how_many_downloaded + duplicates < images_length:
raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
@staticmethod
def _get_links(url: str) -> list[str]:
content = []
line_number = None
# TODO: move to bs4 and requests
class EromeParser(HTMLParser):
tag = None
def handle_starttag(self, tag, attrs):
self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}
page_source = (urllib.request.urlopen(url).read().decode().split('\n'))
""" FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
for i in range(len(page_source)):
obj = EromeParser()
obj.feed(page_source[i])
tag = obj.tag
if tag is not None:
if "div" in tag:
if "id" in tag["div"]:
if tag["div"]["id"] == "album":
line_number = i
break
for line in page_source[line_number:]:
obj = EromeParser()
obj.feed(line)
tag = obj.tag
            if tag is not None:
                if "img" in tag and tag["img"].get("class") == "img-front":
                    content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])
return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
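
The final filter above keeps every non-video link but only the 480p rendition of each video, so albums do not yield duplicate resolutions. A small illustration (the sample links are made up):

# Illustration of the rendition filter (sample links are made up)
links = ["a_480p.mp4", "a_720p.mp4", "b.jpg"]
kept = [link for link in links if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
assert kept == ["a_480p.mp4", "b.jpg"]   # one rendition per video, all images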

bulkredditdownloader/site_downloaders/gallery.py
View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
import json
import pathlib
import logging
import urllib.parse
import requests
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound,
NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class Gallery(BaseDownloader):
def __init__(self, directory: pathlib.Path, post):
super().__init__(directory, post)
link = self.post['CONTENTURL']
self.raw_data = self._get_data(link)
self.download()
def download(self):
images = {}
count = 0
for model in self.raw_data['posts']['models']:
try:
for item in self.raw_data['posts']['models'][model]['media']['gallery']['items']:
try:
                        images[count] = {
                            'id': item['mediaId'],
                            'url': self.raw_data['posts']['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u'],
                        }
                        count += 1
except KeyError:
continue
except KeyError:
continue
self._download_album(images, count)
@staticmethod
def _get_data(link: str) -> dict:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
}
res = requests.get(link, headers=headers)
if res.status_code != 200:
raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
page_source = res.text
starting_string = "_r = {"
ending_string = "</script>"
        starting_string_length = len(starting_string)
        try:
            start_index = page_source.index(starting_string) + starting_string_length
end_index = page_source.index(ending_string, start_index)
except ValueError:
raise NotADownloadableLinkError(f"Could not read the page source on {link}")
        # start_index - 1 re-includes the opening "{"; the final [:-1] trims the
        # character the slice picks up from "</script>"
        data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
return data
def _download_album(self, images: dict, count: int):
folder_name = GLOBAL.config['filename'].format(**self.post)
folder_dir = self.directory / folder_name
how_many_downloaded = 0
duplicates = 0
folder_dir.mkdir(exist_ok=True)
logger.info(folder_name)
        for i, image in enumerate(images.values()):
path = urllib.parse.urlparse(image['url']).path
extension = pathlib.Path(path).suffix
filename = pathlib.Path("_".join([str(i + 1), image['id']]) + extension)
logger.info("\n ({}/{})".format(i + 1, count))
try:
self._download_resource(filename, folder_dir, image['url'], indent=2)
how_many_downloaded += 1
except FileAlreadyExistsError:
logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
except TypeInSkip:
logger.info(" Skipping...")
how_many_downloaded += 1
except Exception as exception:
logger.info("\n Could not get the file")
logger.info(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__, info=str(exception)) + "\n"
)
                logger.debug(GLOBAL.log_stream.getvalue())
if duplicates == count:
raise FileAlreadyExistsError
elif how_many_downloaded + duplicates < count:
raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
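
The marker-slicing trick in _get_data is easier to see on a toy input (entirely synthetic page source):

# Toy illustration of the marker-slicing in _get_data (synthetic page source)
import json

page = 'window.junk; _r = {"posts": {"models": {}}}</script>'
start = page.index("_r = {") + len("_r = {")
end = page.index("</script>", start)
blob = page[start - 1:end + 1].strip()[:-1]
assert json.loads(blob) == {"posts": {"models": {}}}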

bulkredditdownloader/site_downloaders/gfycat.py
View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import json
import pathlib
import re
import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
class Gfycat(GifDeliveryNetwork):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
super().download()
@staticmethod
def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source
and return it
"""
if re.match(r'\.(webm|mp4|gif)$', url):
return url
if url.endswith('/'):
url = url[:-1]
url = "https://gfycat.com/" + url.split('/')[-1]
page_source = (urllib.request.urlopen(url).read().decode())
soup = BeautifulSoup(page_source, "html.parser")
attributes = {"data-react-helmet": "true", "type": "application/ld+json"}
content = soup.find("script", attrs=attributes)
if content is None:
return super()._get_link(url)
return json.loads(content.contents[0])["video"]["contentUrl"]

bulkredditdownloader/site_downloaders/gif_delivery_network.py
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
import pathlib
import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
class GifDeliveryNetwork(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
try:
self.post['MEDIAURL'] = self._get_link(self.post['CONTENTURL'])
except IndexError:
raise NotADownloadableLinkError("Could not read the page source")
self.post['EXTENSION'] = self._get_extension(self.post['MEDIAURL'])
self.directory.mkdir(exist_ok=True)
filename = GLOBAL.config['filename'].format(**self.post) + self.post["EXTENSION"]
self._download_resource(filename, self.directory, self.post['MEDIAURL'])
@staticmethod
def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source
and return it
"""
if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
return url
if url[-1:] == '/':
url = url[:-1]
url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1]
page_source = (urllib.request.urlopen(url).read().decode())
soup = BeautifulSoup(page_source, "html.parser")
attributes = {"id": "mp4Source", "type": "video/mp4"}
content = soup.find("source", attrs=attributes)
if content is None:
raise NotADownloadableLinkError("Could not read the page source")
return content["src"]

bulkredditdownloader/site_downloaders/imgur.py
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
import json
import pathlib
import logging
import requests
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.site_downloaders.direct import Direct
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError,
ImageNotFound, NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL, nameCorrector
logger = logging.getLogger(__name__)
class Imgur(BaseDownloader):
imgur_image_domain = "https://i.imgur.com/"
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.raw_data = {}
self.download()
def download(self):
link = self.post['CONTENTURL']
if link.endswith(".gifv"):
link = link.replace(".gifv", ".mp4")
Direct(self.directory, {**self.post, 'CONTENTURL': link})
return
self.raw_data = self._get_data(link)
        if self._is_album():
if self.raw_data["album_images"]["count"] != 1:
self._download_album(self.raw_data["album_images"])
else:
self._download_image(self.raw_data["album_images"]["images"][0])
else:
self._download_image(self.raw_data)
def _download_album(self, images: dict):
folder_name = GLOBAL.config['filename'].format(**self.post)
folder_dir = self.directory / folder_name
images_length = images["count"]
how_many_downloaded = 0
duplicates = 0
folder_dir.mkdir(exist_ok=True)
logger.info(folder_name)
for i in range(images_length):
extension = self._validate_extension(images["images"][i]["ext"])
image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
filename = pathlib.Path("_".join([str(i + 1),
nameCorrector(images["images"][i]['title']),
images["images"][i]['hash']]) + extension)
logger.info("\n ({}/{})".format(i + 1, images_length))
try:
self._download_resource(filename, folder_dir, image_url, indent=2)
how_many_downloaded += 1
except FileAlreadyExistsError:
logger.info(" The file already exists" + " " * 10, end="\n\n")
duplicates += 1
except TypeInSkip:
logger.info(" Skipping...")
how_many_downloaded += 1
except Exception as exception:
logger.info("\n Could not get the file")
logger.info(
" "
+ "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
class_name=exception.__class__.__name__,
info=str(exception)
)
+ "\n"
)
                logger.debug(GLOBAL.log_stream.getvalue())
if duplicates == images_length:
raise FileAlreadyExistsError
elif how_many_downloaded + duplicates < images_length:
raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
def _download_image(self, image: dict):
extension = self._validate_extension(image["ext"])
image_url = self.imgur_image_domain + image["hash"] + extension
filename = GLOBAL.config['filename'].format(**self.post) + extension
self._download_resource(filename, self.directory, image_url)
def _is_album(self) -> bool:
return "album_images" in self.raw_data
@staticmethod
def _get_data(link: str) -> dict:
cookies = {"over18": "1", "postpagebeta": "0"}
        res = requests.get(link, cookies=cookies)
        if res.status_code != 200:
            raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
        page_source = res.text
starting_string = "image : "
ending_string = "group :"
        starting_string_length = len(starting_string)
        try:
            start_index = page_source.index(starting_string) + starting_string_length
end_index = page_source.index(ending_string, start_index)
except ValueError:
raise NotADownloadableLinkError(
f"Could not read the page source on {link}")
while page_source[end_index] != "}":
end_index -= 1
try:
data = page_source[start_index:end_index + 2].strip()[:-1]
except IndexError:
page_source[end_index + 1] = '}'
data = page_source[start_index:end_index + 3].strip()[:-1]
return json.loads(data)
@staticmethod
def _validate_extension(extension_suffix: str) -> str:
possible_extensions = [".jpg", ".png", ".mp4", ".gif"]
        for extension in possible_extensions:
            if extension in extension_suffix:
                return extension
        raise ExtensionError(f"\"{extension_suffix}\" is not recognized as a valid extension.")
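
Two details worth noting: imgur's .gifv pages are thin wrappers around an MP4, so download() rewrites the URL and hands it to Direct, and _validate_extension matches by substring, which tolerates trailing query residue. Toy checks (sample values are made up):

# Toy checks for the .gifv rewrite and _validate_extension (sample values are made up)
link = "https://i.imgur.com/abc123.gifv"
assert link.replace(".gifv", ".mp4") == "https://i.imgur.com/abc123.mp4"
assert Imgur._validate_extension(".jpg?1") == ".jpg"   # substring match tolerates trailing junk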

bulkredditdownloader/site_downloaders/redgifs.py
View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
import json
import pathlib
import urllib.request
from bs4 import BeautifulSoup
from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
from bulkredditdownloader.errors import NotADownloadableLinkError
class Redgifs(GifDeliveryNetwork):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
super().download()
@staticmethod
def _get_link(url: str) -> str:
"""Extract direct link to the video from page's source
and return it
"""
if '.webm' in url or '.mp4' in url or '.gif' in url:
return url
        if url.endswith('/'):
            url = url[:-1]
        request = urllib.request.Request("https://redgifs.com/watch/" + url.split('/')[-1])
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')
        page_source = urllib.request.urlopen(request).read().decode()
soup = BeautifulSoup(page_source, "html.parser")
attributes = {"data-react-helmet": "true", "type": "application/ld+json"}
content = soup.find("script", attrs=attributes)
if content is None:
raise NotADownloadableLinkError("Could not read the page source")
return json.loads(content.contents[0])["video"]["contentUrl"]

bulkredditdownloader/site_downloaders/self_post.py
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import logging
import pathlib
from pathlib import Path
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class SelfPost(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
if "self" in GLOBAL.arguments.skip:
raise TypeInSkip
self.directory.mkdir(exist_ok=True)
filename = GLOBAL.config['filename'].format(**self.post)
file_dir = self.directory / (filename + ".md")
logger.info(file_dir)
logger.info(filename + ".md")
        if file_dir.is_file():
raise FileAlreadyExistsError
try:
self._write_to_file(file_dir, self.post)
        except FileNotFoundError:
            # Fall back to the post ID when the formatted title is not a valid filename
            file_dir = self.directory / (self.post['POSTID'] + ".md")
            self._write_to_file(file_dir, self.post)
@staticmethod
def _write_to_file(directory: pathlib.Path, post: dict):
"""Self posts are formatted here"""
content = ("## ["
+ post["TITLE"]
+ "]("
+ post["CONTENTURL"]
+ ")\n"
+ post["CONTENT"]
+ "\n\n---\n\n"
+ "submitted to [r/"
+ post["SUBREDDIT"]
+ "](https://www.reddit.com/r/"
+ post["SUBREDDIT"]
+ ") by [u/"
+ post["REDDITOR"]
+ "](https://www.reddit.com/user/"
+ post["REDDITOR"]
+ ")")
        with open(directory, "w", encoding="utf-8") as file:
            file.write(content + "\n")
logger.info("Downloaded")

bulkredditdownloader/site_downloaders/vreddit.py
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import logging
import pathlib
import subprocess
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class VReddit(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
extension = ".mp4"
self.directory.mkdir(exist_ok=True)
filename = GLOBAL.config['filename'].format(**self.post) + extension
        try:
            # Probe for ffmpeg on the PATH; a missing binary raises and trips the fallback
            subprocess.call("ffmpeg", stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        except Exception:
            self._download_resource(filename, self.directory, self.post['CONTENTURL'])
            logger.info("ffmpeg not found, skipping merging of video and audio")
else:
video_name = self.post['POSTID'] + "_video"
video_url = self.post['CONTENTURL']
audio_name = self.post['POSTID'] + "_audio"
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
logger.info(self.directory, filename, sep="\n")
self._download_resource(video_name, self.directory, video_url, silent=True)
self._download_resource(audio_name, self.directory, audio_url, silent=True)
try:
self._merge_audio(video_name, audio_name, filename, self.directory)
            except KeyboardInterrupt:
                # Remove the partial output and the temporary streams
                (self.directory / filename).unlink(missing_ok=True)
                (self.directory / audio_name).unlink(missing_ok=True)
                (self.directory / video_name).unlink(missing_ok=True)
@staticmethod
def _merge_audio(
video: pathlib.Path,
audio: pathlib.Path,
filename: pathlib.Path,
directory: pathlib.Path):
input_video = str(directory / video)
input_audio = str(directory / audio)
        # Build the command as a list so filenames containing spaces survive
        cmd = [
            "ffmpeg", "-i", input_audio, "-i", input_video,
            "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
            str(directory / filename),
        ]
        subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
(directory / video).unlink()
(directory / audio).unlink()
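
Reddit serves v.redd.it media as separate DASH video and audio streams, which is why the audio URL is derived from the video URL above. A toy check of that derivation (the sample URL is made up):

# Toy check of the DASH audio URL derivation (sample URL is made up)
video_url = "https://v.redd.it/abc123/DASH_720.mp4"
audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'
assert audio_url == "https://v.redd.it/abc123/DASH_audio.mp4"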

bulkredditdownloader/site_downloaders/youtube.py
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import logging
import os
import pathlib
import sys
import youtube_dl
from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
from bulkredditdownloader.errors import FileAlreadyExistsError
from bulkredditdownloader.utils import GLOBAL
logger = logging.getLogger(__name__)
class Youtube(BaseDownloader):
def __init__(self, directory: pathlib.Path, post: dict):
super().__init__(directory, post)
self.download()
def download(self):
self.directory.mkdir(exist_ok=True)
filename = GLOBAL.config['filename'].format(**self.post)
logger.info(filename)
self._download_video(filename, self.directory, self.post['CONTENTURL'])
def _download_video(self, filename: str, directory: pathlib.Path, url: str):
ydl_opts = {
"format": "best",
"outtmpl": str(directory / (filename + ".%(ext)s")),
"progress_hooks": [self._hook],
"playlistend": 1,
"nooverwrites": True,
"quiet": True
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
location = directory / (filename + ".mp4")
with open(location, 'rb') as file:
content = file.read()
if GLOBAL.arguments.no_dupes:
try:
file_hash = self._create_hash(content)
except FileNotFoundError:
return None
if file_hash in GLOBAL.downloadedPosts():
os.remove(location)
raise FileAlreadyExistsError
GLOBAL.downloadedPosts.add(file_hash)
@staticmethod
def _hook(d):
if d['status'] == 'finished':
return logger.info("Downloaded")
downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6)))
file_size = int(d['total_bytes'] * (10**(-6)))
sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size))
sys.stdout.flush()
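
For reference, the options driving youtube_dl here can be exercised standalone; a minimal sketch (the output template and URL are placeholders):

# Minimal standalone sketch of the youtube_dl options used above (placeholders)
import youtube_dl

ydl_opts = {
    "format": "best",                       # single best pre-merged stream
    "outtmpl": "downloads/%(title)s.%(ext)s",
    "playlistend": 1,                       # stop after the first playlist entry
    "nooverwrites": True,
    "quiet": True,
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=PLACEHOLDER"])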