Move to standard module structure
bulkredditdownloader/downloaders/Direct.py (Normal file, 17 lines)
@@ -0,0 +1,17 @@
import os
import pathlib

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.utils import GLOBAL


class Direct:
    def __init__(self, directory: pathlib.Path, post: dict):
        post['EXTENSION'] = getExtension(post['CONTENTURL'])
        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
        short_filename = post['POSTID'] + post['EXTENSION']

        getFile(filename, short_filename, directory, post['CONTENTURL'])
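For context, a minimal sketch (not part of this commit) of how a downloader such as Direct is driven: the caller hands it a target directory and a post dict carrying CONTENTURL, POSTID and whatever keys the GLOBAL.config['filename'] template references. All values below are hypothetical, and GLOBAL.arguments must already have been populated by the CLI before getFile runs.

import pathlib

from bulkredditdownloader.downloaders.Direct import Direct
from bulkredditdownloader.utils import GLOBAL

GLOBAL.config = {'filename': '{REDDITOR}_{TITLE}_{POSTID}'}  # assumed template; keys are illustrative
post = {
    'POSTID': 'abc123',                                      # hypothetical post data
    'REDDITOR': 'example_user',
    'TITLE': 'example_title',
    'CONTENTURL': 'https://i.imgur.com/example.jpg',
}
Direct(pathlib.Path('./downloads'), post)                    # fetches the file into ./downloads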
bulkredditdownloader/downloaders/Erome.py (Normal file, 121 lines)
@@ -0,0 +1,121 @@
import os
import pathlib
import urllib.error
import urllib.request
from html.parser import HTMLParser

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.errors import AlbumNotDownloadedCompletely, FileAlreadyExistsError, NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


class Erome:
    def __init__(self, directory: pathlib.Path, post: dict):
        try:
            images = self.getLinks(post['CONTENTURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        images_length = len(images)
        how_many_downloaded = images_length
        duplicates = 0

        if images_length == 1:
            extension = getExtension(images[0])

            """Filenames are declared here"""
            filename = GLOBAL.config['filename'].format(**post) + extension
            short_filename = post['POSTID'] + extension

            image_url = images[0]
            # prepend a scheme only when the link has none
            if 'https://' not in image_url and 'http://' not in image_url:
                image_url = "https://" + image_url

            getFile(filename, short_filename, directory, image_url)

        else:
            filename = GLOBAL.config['filename'].format(**post)
            print(filename)

            folder_dir = directory / filename

            try:
                if not os.path.exists(folder_dir):
                    os.makedirs(folder_dir)
            except FileNotFoundError:
                folder_dir = directory / post['POSTID']
                os.makedirs(folder_dir)

            for i in range(images_length):
                extension = getExtension(images[i])

                filename = str(i + 1) + extension
                image_url = images[i]
                if 'https://' not in image_url and 'http://' not in image_url:
                    image_url = "https://" + image_url

                print(" ({}/{})".format(i + 1, images_length))
                print(" {}".format(filename))

                try:
                    getFile(filename, filename, folder_dir, image_url, indent=2)
                    print()
                except FileAlreadyExistsError:
                    print(" The file already exists" + " " * 10, end="\n\n")
                    duplicates += 1
                    how_many_downloaded -= 1

                except Exception as exception:
                    # raise exception
                    print("\n Could not get the file")
                    print(
                        " "
                        + "{class_name}: {info}".format(class_name=exception.__class__.__name__, info=str(exception))
                        + "\n"
                    )
                    how_many_downloaded -= 1

            if duplicates == images_length:
                raise FileAlreadyExistsError
            elif how_many_downloaded + duplicates < images_length:
                raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")

    def getLinks(self, url: str) -> list[str]:
        content = []
        line_number = None

        class EromeParser(HTMLParser):
            tag = None

            def handle_starttag(self, tag, attrs):
                self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}

        page_source = (urllib.request.urlopen(url).read().decode().split('\n'))

        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
        for i in range(len(page_source)):
            obj = EromeParser()
            obj.feed(page_source[i])
            tag = obj.tag

            if tag is not None:
                if "div" in tag:
                    if "id" in tag["div"]:
                        if tag["div"]["id"] == "album":
                            line_number = i
                            break

        for line in page_source[line_number:]:
            obj = EromeParser()
            obj.feed(line)
            tag = obj.tag
            if tag is not None:
                if "img" in tag:
                    if "class" in tag["img"]:
                        if tag["img"]["class"] == "img-front":
                            content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])

        return [link for link in content if link.endswith("_480p.mp4") or not link.endswith(".mp4")]
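As a quick illustration (not part of the commit), this is what the inner EromeParser records for a single line of page source; the HTML line below is made up.

from html.parser import HTMLParser


class EromeParser(HTMLParser):
    tag = None

    def handle_starttag(self, tag, attrs):
        # store the last start tag seen as {tag_name: {attr: value, ...}}
        self.tag = {tag: {attr[0]: attr[1] for attr in attrs}}


parser = EromeParser()
parser.feed('<img class="img-front" src="example.com/img/1.jpg">')  # fabricated line
print(parser.tag)  # {'img': {'class': 'img-front', 'src': 'example.com/img/1.jpg'}}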
bulkredditdownloader/downloaders/Gfycat.py (Normal file, 54 lines)
@@ -0,0 +1,54 @@
import json
import os
import pathlib
import urllib.request

from bs4 import BeautifulSoup

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.downloaders.gifDeliveryNetwork import GifDeliveryNetwork
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL


class Gfycat:
    def __init__(self, directory: pathlib.Path, post: dict):
        try:
            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")

        post['EXTENSION'] = getExtension(post['MEDIAURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
        short_filename = post['POSTID'] + post['EXTENSION']

        getFile(filename, short_filename, directory, post['MEDIAURL'])

    @staticmethod
    def getLink(url: str) -> str:
        """Extract direct link to the video from page's source
        and return it
        """
        if '.webm' in url or '.mp4' in url or '.gif' in url:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = "https://gfycat.com/" + url.split('/')[-1]

        page_source = (urllib.request.urlopen(url).read().decode())

        soup = BeautifulSoup(page_source, "html.parser")
        attributes = {"data-react-helmet": "true", "type": "application/ld+json"}
        content = soup.find("script", attrs=attributes)

        if content is None:
            return GifDeliveryNetwork.getLink(url)

        return json.loads(content.contents[0])["video"]["contentUrl"]
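For illustration (not from the commit), the scraping pattern getLink relies on: find the JSON-LD script block and read video.contentUrl. The page source below is fabricated.

import json

from bs4 import BeautifulSoup

page_source = (
    '<html><head>'
    '<script data-react-helmet="true" type="application/ld+json">'
    '{"video": {"contentUrl": "https://example.com/clip.mp4"}}'
    '</script></head></html>'
)  # fabricated page source

soup = BeautifulSoup(page_source, "html.parser")
attributes = {"data-react-helmet": "true", "type": "application/ld+json"}
content = soup.find("script", attrs=attributes)
print(json.loads(content.contents[0])["video"]["contentUrl"])  # https://example.com/clip.mp4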
bulkredditdownloader/downloaders/Imgur.py (Normal file, 147 lines)
@@ -0,0 +1,147 @@
import json
import os
import pathlib

import requests

from bulkredditdownloader.downloaders.Direct import Direct
from bulkredditdownloader.downloaders.downloaderUtils import getFile
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, ExtensionError, FileAlreadyExistsError, ImageNotFound,
                                         NotADownloadableLinkError, TypeInSkip)
from bulkredditdownloader.utils import GLOBAL, nameCorrector
from bulkredditdownloader.utils import printToFile as print


class Imgur:

    imgur_image_domain = "https://i.imgur.com/"

    def __init__(self, directory: pathlib.Path, post: dict):
        link = post['CONTENTURL']

        if link.endswith(".gifv"):
            link = link.replace(".gifv", ".mp4")
            Direct(directory, {**post, 'CONTENTURL': link})
            return

        self.raw_data = self.getData(link)

        self.directory = directory
        self.post = post

        if self.isAlbum:
            if self.raw_data["album_images"]["count"] != 1:
                self.downloadAlbum(self.raw_data["album_images"])
            else:
                self.download(self.raw_data["album_images"]["images"][0])
        else:
            self.download(self.raw_data)

    def downloadAlbum(self, images: dict):
        folder_name = GLOBAL.config['filename'].format(**self.post)
        folder_dir = self.directory / folder_name

        images_length = images["count"]
        how_many_downloaded = 0
        duplicates = 0

        try:
            if not os.path.exists(folder_dir):
                os.makedirs(folder_dir)
        except FileNotFoundError:
            folder_dir = self.directory / self.post['POSTID']
            os.makedirs(folder_dir)

        print(folder_name)

        for i in range(images_length):
            extension = self.validateExtension(images["images"][i]["ext"])
            image_url = self.imgur_image_domain + images["images"][i]["hash"] + extension
            filename = "_".join([str(i + 1),
                                 nameCorrector(images["images"][i]['title']),
                                 images["images"][i]['hash']]) + extension
            short_filename = str(i + 1) + "_" + images["images"][i]['hash']

            print("\n ({}/{})".format(i + 1, images_length))

            try:
                getFile(filename, short_filename, folder_dir, image_url, indent=2)
                how_many_downloaded += 1
                print()

            except FileAlreadyExistsError:
                print(" The file already exists" + " " * 10, end="\n\n")
                duplicates += 1

            except TypeInSkip:
                print(" Skipping...")
                how_many_downloaded += 1

            except Exception as exception:
                print("\n Could not get the file")
                print(
                    " " +
                    "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
                        class_name=exception.__class__.__name__,
                        info=str(exception)
                    )
                    + "\n"
                )
                print(GLOBAL.log_stream.getvalue(), no_print=True)

        if duplicates == images_length:
            raise FileAlreadyExistsError
        elif how_many_downloaded + duplicates < images_length:
            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")

    def download(self, image: dict):
        extension = self.validateExtension(image["ext"])
        image_url = self.imgur_image_domain + image["hash"] + extension

        filename = GLOBAL.config['filename'].format(**self.post) + extension
        short_filename = self.post['POSTID'] + extension

        getFile(filename, short_filename, self.directory, image_url)

    @property
    def isAlbum(self) -> bool:
        return "album_images" in self.raw_data

    @staticmethod
    def getData(link: str) -> dict:
        cookies = {"over18": "1", "postpagebeta": "0"}
        res = requests.get(link, cookies=cookies)
        if res.status_code != 200:
            raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
        page_source = res.text

        starting_string = "image : "
        ending_string = "group :"

        starting_string_length = len(starting_string)
        try:
            start_index = page_source.index(starting_string) + starting_string_length
            end_index = page_source.index(ending_string, start_index)
        except ValueError:
            raise NotADownloadableLinkError(
                f"Could not read the page source on {link}")

        while page_source[end_index] != "}":
            end_index -= 1
        try:
            data = page_source[start_index:end_index + 2].strip()[:-1]
        except Exception:
            # strings are immutable, so rebuild the source with a closing brace instead of assigning in place
            page_source = page_source[:end_index + 1] + "}" + page_source[end_index + 2:]
            data = page_source[start_index:end_index + 3].strip()[:-1]

        return json.loads(data)

    @staticmethod
    def validateExtension(string: str) -> str:
        possible_extensions = [".jpg", ".png", ".mp4", ".gif"]

        for extension in possible_extensions:
            if extension in string:
                return extension
        else:
            raise ExtensionError(f"\"{string}\" is not recognized as a valid extension.")
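A toy demonstration (not from the commit) of the string slicing getData relies on: Imgur's old post pages embedded the image metadata between "image : " and "group :" in an inline script, and the code carves that JSON out by index. The page fragment below is invented.

import json

# invented fragment of an old-style Imgur post page
page_source = 'window.post = { image : {"hash": "abcdefg", "ext": ".jpg", "title": "example"}, group : null };'

starting_string = "image : "
ending_string = "group :"

start_index = page_source.index(starting_string) + len(starting_string)
end_index = page_source.index(ending_string, start_index)

# walk back to the closing brace of the image object, as getData does
while page_source[end_index] != "}":
    end_index -= 1

data = page_source[start_index:end_index + 2].strip()[:-1]
print(json.loads(data))  # {'hash': 'abcdefg', 'ext': '.jpg', 'title': 'example'}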
bulkredditdownloader/downloaders/__init__.py (Normal file, empty)
bulkredditdownloader/downloaders/downloaderUtils.py (Normal file, 109 lines)
@@ -0,0 +1,109 @@
import hashlib
import os
import sys
import urllib.request
from pathlib import Path

from bulkredditdownloader.errors import DomainInSkip, FailedToDownload, FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


def dlProgress(count: int, block_size: int, total_size: int):
    """Function for writing download progress to console
    """
    download_mbs = int(count * block_size * (10 ** (-6)))
    file_size = int(total_size * (10 ** (-6)))
    sys.stdout.write("{}Mb/{}Mb\r".format(download_mbs, file_size))
    sys.stdout.flush()


def getExtension(link: str):
    """Extract file extension from image link.
    If didn't find any, return '.jpg'
    """
    image_types = ['jpg', 'png', 'mp4', 'webm', 'gif']
    parsed = link.split('.')
    for fileType in image_types:
        if fileType in parsed:
            return "." + parsed[-1]
    else:
        if "v.redd.it" not in link:
            return '.jpg'
        else:
            return '.mp4'


def getFile(filename: str, short_filename: str, folder_dir: Path, image_url: str, indent: int = 0, silent: bool = False):
    formats = {
        "videos": [".mp4", ".webm"],
        "images": [".jpg", ".jpeg", ".png", ".bmp"],
        "gifs": [".gif"],
        "self": []
    }

    for file_type in GLOBAL.arguments.skip:
        for extension in formats[file_type]:
            if extension in filename:
                raise TypeInSkip

    if any(domain in image_url for domain in GLOBAL.arguments.skip_domain):
        raise DomainInSkip

    headers = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "
                       "Safari/537.36 OPR/54.0.2952.64"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
        ("Accept-Encoding", "none"),
        ("Accept-Language", "en-US,en;q=0.8"),
        ("Connection", "keep-alive")
    ]

    if not os.path.exists(folder_dir):
        os.makedirs(folder_dir)

    opener = urllib.request.build_opener()
    if "imgur" not in image_url:
        opener.addheaders = headers
    urllib.request.install_opener(opener)

    if not silent:
        print(" " * indent + str(folder_dir), " " * indent + str(filename), sep="\n")

    for i in range(3):
        file_dir = Path(folder_dir) / filename
        temp_dir = Path(folder_dir) / (filename + ".tmp")

        if not (os.path.isfile(file_dir)):
            try:
                urllib.request.urlretrieve(image_url, temp_dir, reporthook=dlProgress)

                file_hash = createHash(temp_dir)
                if GLOBAL.arguments.no_dupes:
                    if file_hash in GLOBAL.downloadedPosts():
                        os.remove(temp_dir)
                        raise FileAlreadyExistsError
                GLOBAL.downloadedPosts.add(file_hash)

                os.rename(temp_dir, file_dir)
                if not silent:
                    print(" " * indent + "Downloaded" + " " * 10)
                return None
            except ConnectionResetError:
                raise FailedToDownload
            except FileNotFoundError:
                filename = short_filename
        else:
            raise FileAlreadyExistsError
    raise FailedToDownload


def createHash(filename: str) -> str:
    hash_md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
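A quick sanity check (not part of the commit) of getExtension's behaviour, including the '.jpg' fallback and the v.redd.it special case:

from bulkredditdownloader.downloaders.downloaderUtils import getExtension

print(getExtension("https://i.imgur.com/example.png"))  # .png
print(getExtension("https://example.com/some/page"))     # .jpg  (fallback when no known extension is found)
print(getExtension("https://v.redd.it/abcdefg"))         # .mp4  (v.redd.it links are DASH video)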
bulkredditdownloader/downloaders/gallery.py (Normal file, 111 lines)
@@ -0,0 +1,111 @@
import json
import os
import pathlib
import urllib.parse

import requests

from bulkredditdownloader.downloaders.downloaderUtils import getFile
from bulkredditdownloader.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError, ImageNotFound, NotADownloadableLinkError,
                                         TypeInSkip)
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


class Gallery:
    def __init__(self, directory: pathlib.Path, post):
        link = post['CONTENTURL']
        self.raw_data = self.getData(link)

        self.directory = directory
        self.post = post

        images = {}
        count = 0
        for model in self.raw_data['posts']['models']:
            try:
                for item in self.raw_data['posts']['models'][model]['media']['gallery']['items']:
                    try:
                        images[count] = {'id': item['mediaId'], 'url': self.raw_data['posts']
                                         ['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']}
                        count += 1
                    except Exception:
                        continue
            except Exception:
                continue

        self.downloadAlbum(images, count)

    @staticmethod
    def getData(link: str) -> dict:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
        res = requests.get(link, headers=headers)
        if res.status_code != 200:
            raise ImageNotFound(f"Server responded with {res.status_code} to {link}")
        page_source = res.text

        starting_string = "_r = {"
        ending_string = "</script>"

        starting_string_length = len(starting_string)
        try:
            start_index = page_source.index(starting_string) + starting_string_length
            end_index = page_source.index(ending_string, start_index)
        except ValueError:
            raise NotADownloadableLinkError(f"Could not read the page source on {link}")

        data = json.loads(page_source[start_index - 1:end_index + 1].strip()[:-1])
        return data

    def downloadAlbum(self, images: dict, count: int):
        folder_name = GLOBAL.config['filename'].format(**self.post)
        folder_dir = self.directory / folder_name

        how_many_downloaded = 0
        duplicates = 0

        try:
            if not os.path.exists(folder_dir):
                os.makedirs(folder_dir)
        except FileNotFoundError:
            folder_dir = self.directory / self.post['POSTID']
            os.makedirs(folder_dir)

        print(folder_name)

        for i in range(count):
            path = urllib.parse.urlparse(images[i]['url']).path
            extension = os.path.splitext(path)[1]

            filename = "_".join([str(i + 1), images[i]['id']]) + extension
            short_filename = str(i + 1) + "_" + images[i]['id']

            print("\n ({}/{})".format(i + 1, count))

            try:
                getFile(filename, short_filename, folder_dir, images[i]['url'], indent=2)
                how_many_downloaded += 1
                print()

            except FileAlreadyExistsError:
                print(" The file already exists" + " " * 10, end="\n\n")
                duplicates += 1

            except TypeInSkip:
                print(" Skipping...")
                how_many_downloaded += 1

            except Exception as exception:
                print("\n Could not get the file")
                print(" " + "{class_name}: {info}\nSee CONSOLE_LOG.txt for more information".format(
                    class_name=exception.__class__.__name__, info=str(exception)) + "\n"
                )
                print(GLOBAL.log_stream.getvalue(), no_print=True)

        if duplicates == count:
            raise FileAlreadyExistsError
        elif how_many_downloaded + duplicates < count:
            raise AlbumNotDownloadedCompletely("Album Not Downloaded Completely")
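For reference (not part of the commit), the shape of the "_r" payload the constructor walks; the identifiers below are invented, but the nesting matches the lookups above.

raw_data = {  # invented example of the parsed "_r" object
    'posts': {
        'models': {
            't3_example': {
                'media': {
                    'gallery': {'items': [{'mediaId': 'm1'}, {'mediaId': 'm2'}]},
                    'mediaMetadata': {
                        'm1': {'s': {'u': 'https://preview.redd.it/m1.jpg'}},
                        'm2': {'s': {'u': 'https://preview.redd.it/m2.jpg'}},
                    },
                }
            }
        }
    }
}

images = {}
count = 0
for model in raw_data['posts']['models']:
    for item in raw_data['posts']['models'][model]['media']['gallery']['items']:
        images[count] = {'id': item['mediaId'],
                         'url': raw_data['posts']['models'][model]['media']['mediaMetadata'][item['mediaId']]['s']['u']}
        count += 1

print(images)  # {0: {'id': 'm1', 'url': 'https://preview.redd.it/m1.jpg'}, 1: {'id': 'm2', ...}}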
bulkredditdownloader/downloaders/gifDeliveryNetwork.py (Normal file, 50 lines)
@@ -0,0 +1,50 @@
import os
import pathlib
import urllib.request

from bs4 import BeautifulSoup

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL


class GifDeliveryNetwork:
    def __init__(self, directory: pathlib.Path, post: dict):
        try:
            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")

        post['EXTENSION'] = getExtension(post['MEDIAURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
        short_filename = post['POSTID'] + post['EXTENSION']

        getFile(filename, short_filename, directory, post['MEDIAURL'])

    @staticmethod
    def getLink(url: str) -> str:
        """Extract direct link to the video from page's source
        and return it
        """
        if '.webm' in url.split('/')[-1] or '.mp4' in url.split('/')[-1] or '.gif' in url.split('/')[-1]:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = "https://www.gifdeliverynetwork.com/" + url.split('/')[-1]
        page_source = (urllib.request.urlopen(url).read().decode())

        soup = BeautifulSoup(page_source, "html.parser")
        attributes = {"id": "mp4Source", "type": "video/mp4"}
        content = soup.find("source", attrs=attributes)

        if content is None:
            raise NotADownloadableLinkError("Could not read the page source")

        return content["src"]
bulkredditdownloader/downloaders/redgifs.py (Normal file, 57 lines)
@@ -0,0 +1,57 @@
import json
import os
import pathlib
import urllib.request

from bs4 import BeautifulSoup

from bulkredditdownloader.downloaders.downloaderUtils import getExtension, getFile
from bulkredditdownloader.errors import NotADownloadableLinkError
from bulkredditdownloader.utils import GLOBAL


class Redgifs:
    def __init__(self, directory: pathlib.Path, post: dict):
        try:
            post['MEDIAURL'] = self.getLink(post['CONTENTURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")

        post['EXTENSION'] = getExtension(post['MEDIAURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post) + post["EXTENSION"]
        short_filename = post['POSTID'] + post['EXTENSION']

        getFile(filename, short_filename, directory, post['MEDIAURL'])

    @staticmethod
    def getLink(url: str) -> str:
        """Extract direct link to the video from page's source
        and return it
        """
        if '.webm' in url or '.mp4' in url or '.gif' in url:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = urllib.request.Request(
            "https://redgifs.com/watch/" + url.split('/')[-1])

        url.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64')

        page_source = (urllib.request.urlopen(url).read().decode())

        soup = BeautifulSoup(page_source, "html.parser")
        attributes = {"data-react-helmet": "true", "type": "application/ld+json"}
        content = soup.find("script", attrs=attributes)

        if content is None:
            raise NotADownloadableLinkError("Could not read the page source")

        return json.loads(content.contents[0])["video"]["contentUrl"]
bulkredditdownloader/downloaders/selfPost.py (Normal file, 61 lines)
@@ -0,0 +1,61 @@
import io
import os
import pathlib
from pathlib import Path

from bulkredditdownloader.errors import FileAlreadyExistsError, TypeInSkip
from bulkredditdownloader.utils import GLOBAL

VanillaPrint = print  # keep a reference to the built-in print before it is shadowed by printToFile
from bulkredditdownloader.utils import printToFile as print


class SelfPost:
    def __init__(self, directory: pathlib.Path, post: dict):
        if "self" in GLOBAL.arguments.skip:
            raise TypeInSkip

        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post)

        file_dir = directory / (filename + ".md")
        print(file_dir)
        print(filename + ".md")

        if Path.is_file(file_dir):
            raise FileAlreadyExistsError

        try:
            self.writeToFile(file_dir, post)
        except FileNotFoundError:
            file_dir = post['POSTID'] + ".md"
            file_dir = directory / file_dir

            self.writeToFile(file_dir, post)

    @staticmethod
    def writeToFile(directory: pathlib.Path, post: dict):
        """Self posts are formatted here"""
        content = ("## ["
                   + post["TITLE"]
                   + "]("
                   + post["CONTENTURL"]
                   + ")\n"
                   + post["CONTENT"]
                   + "\n\n---\n\n"
                   + "submitted to [r/"
                   + post["SUBREDDIT"]
                   + "](https://www.reddit.com/r/"
                   + post["SUBREDDIT"]
                   + ") by [u/"
                   + post["REDDITOR"]
                   + "](https://www.reddit.com/user/"
                   + post["REDDITOR"]
                   + ")")

        with io.open(directory, "w", encoding="utf-8") as FILE:
            VanillaPrint(content, file=FILE)
        print("Downloaded")
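For reference, the markdown that writeToFile produces for a post looks roughly like this (all field values invented):

## [Example title](https://www.reddit.com/r/example/comments/abc123/)
Body text of the self post

---

submitted to [r/example](https://www.reddit.com/r/example) by [u/example_user](https://www.reddit.com/user/example_user)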
bulkredditdownloader/downloaders/vreddit.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
import os
import pathlib
import subprocess

from bulkredditdownloader.downloaders.downloaderUtils import getFile
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


class VReddit:
    def __init__(self, directory: pathlib.Path, post: dict):
        extension = ".mp4"
        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post) + extension
        short_filename = post['POSTID'] + extension

        try:
            fnull = open(os.devnull, 'w')
            subprocess.call("ffmpeg", stdout=fnull, stderr=subprocess.STDOUT)
        except Exception:
            getFile(filename, short_filename, directory, post['CONTENTURL'])
            print("FFMPEG library not found, skipping merging video and audio")
        else:
            video_name = post['POSTID'] + "_video"
            video_url = post['CONTENTURL']
            audio_name = post['POSTID'] + "_audio"
            audio_url = video_url[:video_url.rfind('/')] + '/DASH_audio.mp4'

            print(directory, filename, sep="\n")

            getFile(video_name, video_name, directory, video_url, silent=True)
            getFile(audio_name, audio_name, directory, audio_url, silent=True)
            try:
                self._mergeAudio(video_name, audio_name, filename, short_filename, directory)
            except KeyboardInterrupt:
                os.remove(directory / filename)
                os.remove(directory / audio_name)
                os.rename(directory / video_name, directory / filename)

    @staticmethod
    def _mergeAudio(
            video: pathlib.Path,
            audio: pathlib.Path,
            filename: pathlib.Path,
            short_filename,
            directory: pathlib.Path):
        input_video = str(directory / video)
        input_audio = str(directory / audio)

        fnull = open(os.devnull, 'w')
        cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format(
            input_audio, input_video, str(directory / filename))
        subprocess.call(cmd.split(), stdout=fnull, stderr=subprocess.STDOUT)

        os.remove(directory / video)
        os.remove(directory / audio)
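For clarity (not part of the commit), the command _mergeAudio ends up running for a hypothetical post, before the temporary video and audio files are removed:

import pathlib

directory = pathlib.Path("./downloads")  # hypothetical values
video, audio, filename = "abc123_video", "abc123_audio", "example_title_abc123.mp4"

cmd = "ffmpeg -i {} -i {} -c:v copy -c:a aac -strict experimental {}".format(
    str(directory / audio), str(directory / video), str(directory / filename))
print(cmd)
# ffmpeg -i downloads/abc123_audio -i downloads/abc123_video -c:v copy -c:a aac -strict experimental downloads/example_title_abc123.mp4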
bulkredditdownloader/downloaders/youtube.py (Normal file, 55 lines)
@@ -0,0 +1,55 @@
import os
import pathlib
import sys

import youtube_dl

from bulkredditdownloader.downloaders.downloaderUtils import createHash
from bulkredditdownloader.errors import FileAlreadyExistsError
from bulkredditdownloader.utils import GLOBAL
from bulkredditdownloader.utils import printToFile as print


class Youtube:
    def __init__(self, directory: pathlib.Path, post: dict):
        if not os.path.exists(directory):
            os.makedirs(directory)

        filename = GLOBAL.config['filename'].format(**post)
        print(filename)

        self.download(filename, directory, post['CONTENTURL'])

    def download(self, filename: str, directory: pathlib.Path, url: str):
        ydl_opts = {
            "format": "best",
            "outtmpl": str(directory / (filename + ".%(ext)s")),
            "progress_hooks": [self._hook],
            "playlistend": 1,
            "nooverwrites": True,
            "quiet": True
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        location = directory / (filename + ".mp4")

        if GLOBAL.arguments.no_dupes:
            try:
                file_hash = createHash(str(location))
            except FileNotFoundError:
                return None
            if file_hash in GLOBAL.downloadedPosts():
                os.remove(location)
                raise FileAlreadyExistsError
            GLOBAL.downloadedPosts.add(file_hash)

    @staticmethod
    def _hook(d):
        if d['status'] == 'finished':
            return print("Downloaded")
        downloaded_mbs = int(d['downloaded_bytes'] * (10**(-6)))
        file_size = int(d['total_bytes'] * (10**(-6)))
        sys.stdout.write("{}Mb/{}Mb\r".format(downloaded_mbs, file_size))
        sys.stdout.flush()