Add defensive programming to site downloaders
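Wrap the fragile parsing steps in the site downloaders (Erome, Gallery, Gfycat, GifDeliveryNetwork, Imgur, Redgifs) in try/except blocks and raise SiteDownloaderError with a descriptive message instead of letting KeyError, IndexError, AttributeError, or json.JSONDecodeError escape; the NotADownloadableLinkError and ResourceNotFound raises for missing links are switched to SiteDownloaderError as well.

A minimal sketch of the pattern, assuming a stand-in exception class; the parse_video_url helper below is hypothetical and only illustrates the shape of the change, not code from this commit:

    import json

    class SiteDownloaderError(Exception):
        """Stand-in for bulkredditdownloader.exceptions.SiteDownloaderError."""

    def parse_video_url(raw: str) -> str:
        # Convert low-level parsing failures into one expected error type
        # so callers only have to handle SiteDownloaderError.
        try:
            return json.loads(raw)['video']['contentUrl']
        except (KeyError, TypeError) as e:
            raise SiteDownloaderError(f'Failed to find video URL in page data: {e}')
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f'Received data was not valid JSON: {e}')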
@@ -7,7 +7,7 @@ from typing import Optional
 import bs4
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -21,8 +21,9 @@ class Erome(BaseDownloader):
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         links = self._get_links(self.post.url)
+
         if not links:
-            raise NotADownloadableLinkError('Erome parser could not find any links')
+            raise SiteDownloaderError('Erome parser could not find any links')
 
         out = []
         for link in links:
@@ -7,7 +7,7 @@ from typing import Optional
 import bs4
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import ResourceNotFound
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -22,7 +22,7 @@ class Gallery(BaseDownloader):
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         image_urls = self._get_links(self.post.url)
         if not image_urls:
-            raise ResourceNotFound('No images found in Reddit gallery')
+            raise SiteDownloaderError('No images found in Reddit gallery')
         return [Resource(self.post, url) for url in image_urls]
 
     @staticmethod
@@ -7,6 +7,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
+from bulkredditdownloader.exceptions import SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -31,5 +32,10 @@ class Gfycat(GifDeliveryNetwork):
         soup = BeautifulSoup(response.text, 'html.parser')
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
-        out = json.loads(content.contents[0]).get('video').get('contentUrl')
+        try:
+            out = json.loads(content.contents[0])['video']['contentUrl']
+        except (IndexError, KeyError) as e:
+            raise SiteDownloaderError(f'Failed to download Gfycat link {url}: {e}')
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Did not receive valid JSON data: {e}')
         return out
@@ -5,7 +5,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.base_downloader import BaseDownloader
@@ -26,7 +26,11 @@ class GifDeliveryNetwork(BaseDownloader):
         soup = BeautifulSoup(page.text, 'html.parser')
         content = soup.find('source', attrs={'id': 'mp4Source', 'type': 'video/mp4'})
 
-        if content is None or content.get('src') is None:
-            raise NotADownloadableLinkError('Could not read the page source')
+        try:
+            out = content['src']
+            if not out:
+                raise KeyError
+        except KeyError:
+            raise SiteDownloaderError('Could not find source link')
 
-        return content.get('src')
+        return out
@@ -50,17 +50,23 @@ class Imgur(BaseDownloader):
         script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'')
         chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts))
         if len(chosen_script) != 1:
-            raise NotADownloadableLinkError(f'Could not read page source from {link}')
-        else:
-            chosen_script = chosen_script[0]
+            raise SiteDownloaderError(f'Could not read page source from {link}')
+
+        chosen_script = chosen_script[0]
 
         outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);')
-        image_dict = re.search(outer_regex, chosen_script).group(1)
-
         inner_regex = re.compile(r'image\s*:(.*),\s*group')
-        image_dict = re.search(inner_regex, image_dict).group(1)
+        try:
+            image_dict = re.search(outer_regex, chosen_script).group(1)
+            image_dict = re.search(inner_regex, image_dict).group(1)
+        except AttributeError:
+            raise SiteDownloaderError(f'Could not find image dictionary in page source')
+
+        try:
+            image_dict = json.loads(image_dict)
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}')
 
-        image_dict = json.loads(image_dict)
         return image_dict
 
     @staticmethod
@@ -7,7 +7,7 @@ from typing import Optional
 from bs4 import BeautifulSoup
 from praw.models import Submission
 
-from bulkredditdownloader.exceptions import NotADownloadableLinkError
+from bulkredditdownloader.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bulkredditdownloader.resource import Resource
 from bulkredditdownloader.site_authenticator import SiteAuthenticator
 from bulkredditdownloader.site_downloaders.gif_delivery_network import GifDeliveryNetwork
@@ -22,7 +22,11 @@ class Redgifs(GifDeliveryNetwork):
 
     @staticmethod
     def _get_link(url: str) -> str:
-        redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
+        try:
+            redgif_id = re.match(r'.*/(.*?)/?$', url).group(1)
+        except AttributeError:
+            raise SiteDownloaderError(f'Could not extract Redgifs ID from {url}')
+
         url = 'https://redgifs.com/watch/' + redgif_id
 
         headers = {
@@ -36,7 +40,13 @@ class Redgifs(GifDeliveryNetwork):
         content = soup.find('script', attrs={'data-react-helmet': 'true', 'type': 'application/ld+json'})
 
         if content is None:
-            raise NotADownloadableLinkError('Could not read the page source')
+            raise SiteDownloaderError('Could not read the page source')
 
-        out = json.loads(content.contents[0])['video']['contentUrl']
+        try:
+            out = json.loads(content.contents[0])['video']['contentUrl']
+        except (IndexError, KeyError):
+            raise SiteDownloaderError('Failed to find JSON data in page')
+        except json.JSONDecodeError as e:
+            raise SiteDownloaderError(f'Received data was not valid JSON: {e}')
 
         return out