Rename module
This commit is contained in:
0
bdfr/site_downloaders/__init__.py
Normal file
0
bdfr/site_downloaders/__init__.py
Normal file
33
bdfr/site_downloaders/base_downloader.py
Normal file
33
bdfr/site_downloaders/base_downloader.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import ResourceNotFound
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseDownloader(ABC):
    """Abstract parent for every site-specific downloader module."""

    def __init__(self, post: Submission, typical_extension: Optional[str] = None):
        # The Reddit submission this downloader extracts resources from
        self.post = post
        # Extension hint for resources this site usually serves, if known
        self.typical_extension = typical_extension

    @abstractmethod
    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return list of all un-downloaded Resources from submission"""
        raise NotImplementedError

    @staticmethod
    def retrieve_url(
            url: str,
            cookies: dict = None,
            headers: dict = None,
            timeout: Optional[float] = 30,
    ) -> requests.Response:
        """Fetch *url* and return the response.

        A default timeout is applied because ``requests.get`` has none of its
        own, so a stalled server would otherwise hang the downloader forever.
        Pass ``timeout=None`` to restore the old unbounded behaviour.

        Raises:
            ResourceNotFound: when the server answers with a non-200 status.
        """
        res = requests.get(url, cookies=cookies, headers=headers, timeout=timeout)
        if res.status_code != 200:
            raise ResourceNotFound(f'Server responded with {res.status_code} to {url}')
        return res
||||
17
bdfr/site_downloaders/direct.py
Normal file
17
bdfr/site_downloaders/direct.py
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class Direct(BaseDownloader):
    """Downloader for links that already point straight at a media file."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """The submission URL itself is the resource; no scraping required."""
        resource = Resource(self.post, self.post.url)
        return [resource]
||||
50
bdfr/site_downloaders/download_factory.py
Normal file
50
bdfr/site_downloaders/download_factory.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
import re
|
||||
from typing import Type
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
from bdfr.site_downloaders.direct import Direct
|
||||
from bdfr.site_downloaders.erome import Erome
|
||||
from bdfr.site_downloaders.gallery import Gallery
|
||||
from bdfr.site_downloaders.gfycat import Gfycat
|
||||
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
|
||||
from bdfr.site_downloaders.imgur import Imgur
|
||||
from bdfr.site_downloaders.redgifs import Redgifs
|
||||
from bdfr.site_downloaders.self_post import SelfPost
|
||||
from bdfr.site_downloaders.vreddit import VReddit
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
|
||||
class DownloadFactory:
    """Maps a submission URL to the downloader class able to handle it."""

    @staticmethod
    def pull_lever(url: str) -> Type[BaseDownloader]:
        """Return the downloader class suited to *url*.

        Patterns are tested in order, so the more specific forms (e.g. the
        Imgur ``.gifv`` match) must stay ahead of the generic direct-file
        extension match.

        Raises:
            NotADownloadableLinkError: when no pattern matches the URL.
        """
        url_beginning = r'\s*(https?://(www\.)?)'
        if re.match(url_beginning + r'(i\.)?imgur.*\.gifv$', url):
            return Imgur
        elif re.match(url_beginning + r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', url):
            # Any URL whose path ends in a short file extension is a direct link
            return Direct
        elif re.match(url_beginning + r'erome\.com.*', url):
            return Erome
        elif re.match(url_beginning + r'reddit\.com/gallery/.*', url):
            return Gallery
        elif re.match(url_beginning + r'gfycat\.', url):
            return Gfycat
        elif re.match(url_beginning + r'gifdeliverynetwork', url):
            return GifDeliveryNetwork
        elif re.match(url_beginning + r'(m\.)?imgur.*', url):
            return Imgur
        elif re.match(url_beginning + r'redgifs\.com', url):
            # Dot escaped: previously `redgifs.com` matched any character
            # in place of the dot
            return Redgifs
        elif re.match(url_beginning + r'reddit\.com/r/', url):
            return SelfPost
        elif re.match(url_beginning + r'v\.redd\.it', url):
            return VReddit
        elif re.match(url_beginning + r'(m\.)?youtu\.?be', url):
            return Youtube
        elif re.match(url_beginning + r'i\.redd\.it.*', url):
            return Direct
        else:
            raise NotADownloadableLinkError(f'No downloader module exists for url {url}')
||||
45
bdfr/site_downloaders/erome.py
Normal file
45
bdfr/site_downloaders/erome.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Erome(BaseDownloader):
    """Downloader for erome.com album pages."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Scrape the album page and return one Resource per media link found."""
        found_links = self._get_links(self.post.url)

        if not found_links:
            raise SiteDownloaderError('Erome parser could not find any links')

        resources = []
        for found in found_links:
            # Some links in the markup are scheme-less; force HTTPS
            if not re.match(r'https?://.*', found):
                found = 'https://' + found
            resources.append(Resource(self.post, found))
        return resources

    @staticmethod
    def _get_links(url: str) -> set[str]:
        """Collect candidate image and video URLs from the album page.

        NOTE(review): 'lasyload' appears to be the class name actually used by
        the site's markup — do not "correct" the spelling without checking.
        """
        page = Erome.retrieve_url(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')

        images = soup.find_all('img', attrs={'class': 'lasyload'})
        collected = [image.get('data-src') for image in images]
        collected.extend(video.get('src') for video in soup.find_all('source'))

        return set(collected)
||||
40
bdfr/site_downloaders/gallery.py
Normal file
40
bdfr/site_downloaders/gallery.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Gallery(BaseDownloader):
    """Downloader for Reddit native gallery posts."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Scrape the gallery page and return one Resource per image found."""
        image_urls = self._get_links(self.post.url)
        if not image_urls:
            raise SiteDownloaderError('No images found in Reddit gallery')
        return [Resource(self.post, url) for url in image_urls]

    @staticmethod
    def _get_links(url: str) -> list[str]:
        """Return the preview image links from a Reddit gallery page.

        A browser-like User-Agent/Accept pair is sent because Reddit serves a
        different page to unrecognised clients.
        """
        resource_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }
        page = Gallery.retrieve_url(url, headers=resource_headers)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')

        # find_all replaces the deprecated camelCase findAll alias
        links = soup.find_all('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
        links = [link.get('href') for link in links]
        return links
||||
41
bdfr/site_downloaders/gfycat.py
Normal file
41
bdfr/site_downloaders/gfycat.py
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
|
||||
|
||||
|
||||
class Gfycat(GifDeliveryNetwork):
    """Downloader for gfycat.com links, delegating redirected content to
    GifDeliveryNetwork."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        return super().find_resources(authenticator)

    @staticmethod
    def _get_link(url: str) -> str:
        """Resolve a gfycat URL to its direct video content URL.

        Raises:
            SiteDownloaderError: when the gfycat ID cannot be extracted, or
                the page lacks the expected JSON-LD metadata.
        """
        try:
            # Last path component (ignoring a trailing slash) is the gfycat ID
            gfycat_id = re.match(r'.*/(.*?)/?$', url).group(1)
        except AttributeError:
            # Consistent with Redgifs: a malformed URL raises a site error
            # rather than an unhandled AttributeError
            raise SiteDownloaderError(f'Could not extract Gfycat ID from {url}')
        url = 'https://gfycat.com/' + gfycat_id

        response = Gfycat.retrieve_url(url)
        if 'gifdeliverynetwork' in response.url:
            # Some gfycat IDs redirect there; hand off to that downloader
            return GifDeliveryNetwork._get_link(url)

        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})

        # soup.find returns None when the tag is absent; guard so the
        # .contents access below cannot raise a bare AttributeError
        if content is None:
            raise SiteDownloaderError('Could not read the page source')

        try:
            out = json.loads(content.contents[0])['video']['contentUrl']
        except (IndexError, KeyError) as e:
            raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}')
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f'Did not receive valid JSON data: {e}')
        return out
||||
36
bdfr/site_downloaders/gif_delivery_network.py
Normal file
36
bdfr/site_downloaders/gif_delivery_network.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class GifDeliveryNetwork(BaseDownloader):
    """Downloader for gifdeliverynetwork.com hosted videos."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Resolve the page to its MP4 source and wrap it as a single Resource."""
        media_url = self._get_link(self.post.url)
        return [Resource(self.post, media_url, '.mp4')]

    @staticmethod
    def _get_link(url: str) -> str:
        """Extract the MP4 source URL from the page.

        Raises:
            SiteDownloaderError: when the expected <source> tag is missing or
                carries no usable ``src`` attribute.
        """
        page = GifDeliveryNetwork.retrieve_url(url)

        soup = BeautifulSoup(page.text, 'html.parser')
        content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})

        # soup.find returns None when the tag is absent; without this guard
        # the subscript below would raise an opaque TypeError instead of the
        # intended site error
        if content is None:
            raise SiteDownloaderError('Could not find source link')

        try:
            out = content['src']
            if not out:
                raise KeyError
        except KeyError:
            raise SiteDownloaderError('Could not find source link')

        return out
||||
79
bdfr/site_downloaders/imgur.py
Normal file
79
bdfr/site_downloaders/imgur.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class Imgur(BaseDownloader):
    """Downloader for imgur single images, albums, and .gifv links."""

    def __init__(self, post: Submission):
        super().__init__(post)
        # Parsed image metadata from the imgur page; filled by find_resources
        self.raw_data = {}

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return a Resource for every image in the post (album-aware)."""
        self.raw_data = self._get_data(self.post.url)

        out = []
        if 'album_images' in self.raw_data:
            images = self.raw_data['album_images']
            for image in images['images']:
                out.append(self._download_image(image))
        else:
            out.append(self._download_image(self.raw_data))
        return out

    def _download_image(self, image: dict) -> Resource:
        """Build the direct i.imgur.com URL for one image metadata entry."""
        image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
        return Resource(self.post, image_url)

    @staticmethod
    def _get_data(link: str) -> dict:
        """Scrape the imgur page and return the embedded image dictionary.

        Raises:
            SiteDownloaderError: when the page source or the expected inline
                gallery configuration cannot be parsed.
        """
        if re.match(r'.*\.gifv$', link):
            link = link.replace('i.imgur', 'imgur')
            # removesuffix, not rstrip: rstrip('.gifv') strips a character
            # SET, corrupting image IDs that end in '.', 'g', 'i', 'f' or 'v'
            link = link.removesuffix('.gifv')

        # over18 cookie bypasses the NSFW interstitial page
        res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})

        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
        scripts = [script.string.replace('\n', '') for script in scripts if script.string]

        # The gallery metadata lives in one specific inline script
        script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'')
        chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts))
        if len(chosen_script) != 1:
            raise SiteDownloaderError(f'Could not read page source from {link}')

        chosen_script = chosen_script[0]

        outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);')
        inner_regex = re.compile(r'image\s*:(.*),\s*group')
        try:
            image_dict = re.search(outer_regex, chosen_script).group(1)
            image_dict = re.search(inner_regex, image_dict).group(1)
        except AttributeError:
            raise SiteDownloaderError('Could not find image dictionary in page source')

        try:
            image_dict = json.loads(image_dict)
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}')

        return image_dict

    @staticmethod
    def _validate_extension(extension_suffix: str) -> str:
        """Return the extension unchanged if imgur serves it, else raise."""
        possible_extensions = ('.jpg', '.png', '.mp4', '.gif')
        if extension_suffix in possible_extensions:
            return extension_suffix
        raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')
||||
52
bdfr/site_downloaders/redgifs.py
Normal file
52
bdfr/site_downloaders/redgifs.py
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.gif_delivery_network import GifDeliveryNetwork
|
||||
|
||||
|
||||
class Redgifs(GifDeliveryNetwork):
    """Downloader for redgifs.com links."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        return super().find_resources(authenticator)

    @staticmethod
    def _get_link(url: str) -> str:
        """Resolve a redgifs URL to the direct video content URL via the
        JSON-LD metadata embedded in the watch page."""
        # Last path component (ignoring a trailing slash) is the clip ID
        id_match = re.match(r'.*/(.*?)/?$', url)
        if id_match is None:
            raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}')
        redgif_id = id_match.group(1)

        watch_url = 'https://redgifs.com/watch/' + redgif_id

        # Browser-like User-Agent; the site serves a different page otherwise
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
        }

        page = Redgifs.retrieve_url(watch_url, headers=browser_headers)

        soup = BeautifulSoup(page.text, 'html.parser')
        metadata_tag = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})

        if metadata_tag is None:
            raise SiteDownloaderError('Could not read the page source')

        try:
            return json.loads(metadata_tag.contents[0])['video']['contentUrl']
        except (IndexError, KeyError):
            raise SiteDownloaderError('Failed to find JSON data in page')
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f'Received data was not valid JSON: {e}')
||||
43
bdfr/site_downloaders/self_post.py
Normal file
43
bdfr/site_downloaders/self_post.py
Normal file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SelfPost(BaseDownloader):
    """Downloader that renders a text (self) post to a markdown .txt resource."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Package the formatted post body as a single UTF-8 .txt Resource."""
        resource = Resource(self.post, self.post.url, '.txt')
        resource.content = self.export_to_string().encode('utf-8')
        resource.create_hash()
        return [resource]

    def export_to_string(self) -> str:
        """Self posts are formatted here"""
        # Deleted accounts have no author object attached
        author = self.post.author.name if self.post.author else "DELETED"
        subreddit = self.post.subreddit.title
        return (
            f"## [{self.post.fullname}]({self.post.url})\n"
            f"{self.post.selftext}\n\n---\n\n"
            f"submitted to [r/{subreddit}](https://www.reddit.com/r/{subreddit})"
            f" by [u/{author}](https://www.reddit.com/user/{author})"
        )
||||
21
bdfr/site_downloaders/vreddit.py
Normal file
21
bdfr/site_downloaders/vreddit.py
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VReddit(Youtube):
    """Downloader for v.redd.it hosted videos, reusing the Youtube machinery."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Download with youtube-dl defaults; v.redd.it needs no extra options."""
        downloaded = super()._download_video({})
        return [downloaded]
||||
50
bdfr/site_downloaders/youtube.py
Normal file
50
bdfr/site_downloaders/youtube.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Youtube(BaseDownloader):
    """Downloader for YouTube (and other youtube-dl supported) video links."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Download the best single video for the post URL."""
        ytdl_options = {
            'format': 'best',
            # Only take the first entry if the URL points at a playlist
            'playlistend': 1,
            'nooverwrites': True,
        }
        out = self._download_video(ytdl_options)
        return [out]

    def _download_video(self, ytdl_options: dict) -> Resource:
        """Run youtube-dl into a temp directory and wrap the output file as a
        Resource, preserving the extension youtube-dl chose.

        Raises:
            SiteDownloaderError: when youtube-dl fails or produces no file.
        """
        ytdl_options['quiet'] = True
        with tempfile.TemporaryDirectory() as temp_dir:
            download_path = Path(temp_dir).resolve()
            # %(ext)s lets youtube-dl pick the container extension
            ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
            try:
                with youtube_dl.YoutubeDL(ytdl_options) as ydl:
                    ydl.download([self.post.url])
            except youtube_dl.DownloadError as e:
                raise SiteDownloaderError(f'Youtube download failed: {e}')

            downloaded_files = list(download_path.iterdir())
            # Guard against a "successful" run that produced no file, which
            # previously surfaced as an opaque IndexError
            if not downloaded_files:
                raise SiteDownloaderError(f'No file was downloaded for {self.post.url}')
            downloaded_file = downloaded_files[0]
            extension = downloaded_file.suffix
            content = downloaded_file.read_bytes()
        out = Resource(self.post, self.post.url, extension)
        out.content = content
        out.create_hash()
        return out
||||
Reference in New Issue
Block a user