Add defensive programming to site downloaders

Author: Serene-Arc
Date: 2021-04-06 11:04:08 +10:00
Committed by: Ali Parlakci
Parent: 9cb4dd4cf3
Commit: a291104144
6 changed files with 47 additions and 20 deletions
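
Across all six downloaders the change is the same: narrow exception types (NotADownloadableLinkError, ResourceNotFound) and unguarded parsing calls are replaced by try/except blocks that re-raise as SiteDownloaderError. A minimal sketch of the payoff at the call site, with SiteDownloaderError stubbed so the snippet runs standalone (the find_resources stub and URLs are hypothetical, not the project's API):

# Stub of bulkredditdownloader.exceptions.SiteDownloaderError.
class SiteDownloaderError(Exception):
    pass

def find_resources(url: str) -> list[str]:
    # Stand-in for a site downloader's find_resources(); after this commit,
    # parsing failures surface as SiteDownloaderError rather than raw
    # KeyError/AttributeError/JSONDecodeError crashes.
    if 'bad' in url:
        raise SiteDownloaderError('could not find any links')
    return [url]

for url in ('https://example.com/good', 'https://example.com/bad'):
    try:
        print(find_resources(url))
    except SiteDownloaderError as e:
        print(f'Skipping {url}: {e}')  # one except clause covers every downloader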

View File

@@ -7,7 +7,7 @@ from typing import Optional
 import bs4
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -21,8 +21,9 @@ class Erome(BaseDownloader):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         links = self._get_links(self.post.url)
         if not links:
-            raise NotADownloadableLinkError('Erome parser could not find any links')
+            raise SiteDownloaderError('Erome parser could not find any links')
 
         out = []
         for link in links:

View File

@@ -7,7 +7,7 @@ from typing import Optional
 import bs4
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import ResourceNotFound
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -22,7 +22,7 @@ class Gallery(BaseDownloader):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         image_urls = self._get_links(self.post.url)
         if not image_urls:
-            raise ResourceNotFound('No images found in Reddit gallery')
+            raise SiteDownloaderError('No images found in Reddit gallery')
         return [Resource(self.post, url) for url in image_urls]
 
     @staticmethod

View File

@@ -7,6 +7,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -31,5 +32,10 @@ class Gfycat(GifDeliveryNetwork):
         soup = BeautifulSoup(response.text, 'html.parser')
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
-        out = json.loads(content.contents[0]).get('video').get('contentUrl')
+        try:
+            out = json.loads(content.contents[0])['video']['contentUrl']
+        except (IndexError, KeyError) as e:
+            raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}')
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Did not receive valid JSON data: {e}')
         return out
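
The Gfycat hunk also swaps chained .get() calls, which return None and fail later with a confusing AttributeError, for direct indexing that fails immediately inside the try block (the IndexError covers content.contents[0] on an empty script tag). A standalone sketch of the failure modes the new handlers catch, using stand-in payloads rather than real Gfycat responses:

import json

# Stand-in payloads: malformed JSON, a document with no 'video' key,
# and one with no 'contentUrl' under 'video'.
for payload in ('not json at all', '{}', '{"video": {}}'):
    try:
        out = json.loads(payload)['video']['contentUrl']
    except (IndexError, KeyError) as e:
        print(f'missing key: {e}')
    except json.JSONDecodeError as e:
        print(f'invalid JSON: {e}')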

View File

@@ -5,7 +5,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -26,7 +26,11 @@ class GifDeliveryNetwork(BaseDownloader):
         soup = BeautifulSoup(page.text, 'html.parser')
         content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})
 
-        if content is None or content.get('src') is None:
-            raise NotADownloadableLinkError('Could not read the page source')
+        try:
+            out = content['src']
+            if not out:
+                raise KeyError
+        except KeyError:
+            raise SiteDownloaderError('Could not find source link')
 
-        return content.get('src')
+        return out
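
This hunk leans on bs4's Tag indexing: content['src'] raises KeyError when the attribute is absent, and the explicit raise KeyError funnels an empty src string into the same failure path. A self-contained sketch of that funnelling, on stand-in markup rather than a real page:

from bs4 import BeautifulSoup

# Stand-in markup: a <source> tag whose src attribute is present but empty.
soup = BeautifulSoup('<source id="mp4Source" type="video/mp4" src="">', 'html.parser')
content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})

try:
    out = content['src']   # bs4 Tag indexing raises KeyError if 'src' is absent
    if not out:
        raise KeyError     # treat an empty src exactly like a missing one
except KeyError:
    print('Could not find source link')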

View File

@@ -50,17 +50,23 @@ class Imgur(BaseDownloader):
         script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'')
         chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts))
         if len(chosen_script) != 1:
-            raise NotADownloadableLinkError(f'Could not read page source from {link}')
-        else:
-            chosen_script = chosen_script[0]
+            raise SiteDownloaderError(f'Could not read page source from {link}')
+        chosen_script = chosen_script[0]
 
         outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);')
-        image_dict = re.search(outer_regex, chosen_script).group(1)
         inner_regex = re.compile(r'image\s*:(.*),\s*group')
-        image_dict = re.search(inner_regex, image_dict).group(1)
+        try:
+            image_dict = re.search(outer_regex, chosen_script).group(1)
+            image_dict = re.search(inner_regex, image_dict).group(1)
+        except AttributeError:
+            raise SiteDownloaderError(f'Could not find image dictionary in page source')
 
-        image_dict = json.loads(image_dict)
+        try:
+            image_dict = json.loads(image_dict)
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}')
 
         return image_dict
 
     @staticmethod
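
Both re.search calls share one try block because each returns None on a failed match, and None.group(1) raises AttributeError; the commit converts that crash into a SiteDownloaderError with a readable message. A standalone sketch against stand-in text that deliberately lacks the gallery config:

import re

inner_regex = re.compile(r'image\s*:(.*),\s*group')

page_source = 'var config = {};'  # stand-in text with no image dictionary
try:
    # re.search returns None when nothing matches, so .group(1) raises AttributeError
    image_dict = re.search(inner_regex, page_source).group(1)
except AttributeError:
    print('Could not find image dictionary in page source')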

View File

@@ -7,7 +7,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -22,7 +22,11 @@ class Redgifs(GifDeliveryNetwork):
     @staticmethod
     def _get_link(url: str) -> str:
-        redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
+        try:
+            redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
+        except AttributeError:
+            raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}')
+
         url = 'https://redgifs.com/watch/' + redgif_id
 
         headers = {
@@ -36,7 +40,13 @@ class Redgifs(GifDeliveryNetwork):
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
         if content is None:
-            raise NotADownloadableLinkError('Could not read the page source')
+            raise SiteDownloaderError('Could not read the page source')
 
-        out = json.loads(content.contents[0])['video']['contentUrl']
+        try:
+            out = json.loads(content.contents[0])['video']['contentUrl']
+        except (IndexError, KeyError):
+            raise SiteDownloaderError('Failed to find JSON data in page')
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Received data was not valid JSON: {e}')
 
         return out