Implement callbacks for downloading
This commit is contained in:
@@ -76,17 +76,17 @@ class Archiver(RedditConnector):
|
||||
logger.info(f'Record for entry item {praw_item.id} written to disk')
|
||||
|
||||
def _write_entry_json(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.json')
|
||||
resource = Resource(entry.source, '', lambda: None, '.json')
|
||||
content = json.dumps(entry.compile())
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
def _write_entry_xml(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.xml')
|
||||
resource = Resource(entry.source, '', lambda: None, '.xml')
|
||||
content = dict2xml.dict2xml(entry.compile(), wrap='root')
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
def _write_entry_yaml(self, entry: BaseArchiveEntry):
|
||||
resource = Resource(entry.source, '', '.yaml')
|
||||
resource = Resource(entry.source, '', lambda: None, '.yaml')
|
||||
content = yaml.dump(entry.compile())
|
||||
self._write_content_to_disk(resource, content)
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
|
||||
logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
|
||||
continue
|
||||
try:
|
||||
res.download(self.args.max_wait_time)
|
||||
res.download()
|
||||
except errors.BulkDownloaderException as e:
|
||||
logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
|
||||
f'with downloader {downloader_class.__name__}: {e}')
|
||||
|
||||
@@ -6,7 +6,7 @@ import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import _hashlib
|
||||
import requests
|
||||
@@ -18,40 +18,44 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Resource:
|
||||
def __init__(self, source_submission: Submission, url: str, extension: str = None):
|
||||
def __init__(
        self,
        source_submission: Submission,
        url: str,
        download_function: Callable,
        extension: str = None,
):
    """Set up a resource belonging to a submission.

    Args:
        source_submission: The submission this resource was found in.
        url: Location of the resource.
        download_function: Zero-argument callable that ``download()`` invokes
            to obtain the content.
        extension: File extension to use; when falsy it is derived via
            ``self._determine_extension()``.
    """
    # Values handed in by the caller.
    self.source_submission = source_submission
    self.url = url
    self.download_function = download_function

    # Not known at construction time; both start out unset.
    self.content: Optional[bytes] = None
    self.hash: Optional[_hashlib.HASH] = None

    # Store the given extension first, then fall back if nothing usable was passed.
    self.extension = extension
    if not self.extension:
        self.extension = self._determine_extension()
|
||||
|
||||
@staticmethod
|
||||
def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
|
||||
try:
|
||||
response = requests.get(url)
|
||||
if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
|
||||
return response.content
|
||||
elif response.status_code in (408, 429):
|
||||
raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
|
||||
else:
|
||||
raise BulkDownloaderException(
|
||||
f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
|
||||
except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
|
||||
logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
|
||||
time.sleep(current_wait_time)
|
||||
if current_wait_time < max_wait_time:
|
||||
current_wait_time += 60
|
||||
return Resource.retry_download(url, max_wait_time, current_wait_time)
|
||||
else:
|
||||
logger.error(f'Max wait time exceeded for resource at url {url}')
|
||||
raise
|
||||
def retry_download(url: str, max_wait_time: int) -> Callable:
    """Build a zero-argument callable that fetches ``url`` over HTTP with retries.

    Args:
        url: Address to request.
        max_wait_time: Upper bound, in seconds, for the growing pause between
            attempts. Once the pause has reached this value the next failure
            is final.

    Returns:
        A callable that returns the response body as bytes.

    The returned callable raises:
        BulkDownloaderException: for any response that is neither a non-empty
            2xx nor a 408/429.
        requests.exceptions.ConnectionError / ChunkedEncodingError: re-raised
            when the retry budget is used up.
    """
    def http_download() -> Optional[bytes]:
        current_wait_time = 60
        while True:
            try:
                response = requests.get(url)
                if 200 <= response.status_code < 300 and response.content:
                    return response.content
                elif response.status_code in (408, 429):
                    # Timeout / rate-limit answers are routed into the retry path below.
                    raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
                else:
                    raise BulkDownloaderException(
                        f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                # Check the budget before sleeping: there is no point pausing
                # when no further attempt will follow.
                if current_wait_time >= max_wait_time:
                    logger.error(f'Max wait time exceeded for resource at url {url}')
                    raise
                logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
                time.sleep(current_wait_time)
                current_wait_time += 60
    return http_download
|
||||
|
||||
def download(self, max_wait_time: int):
|
||||
def download(self):
|
||||
if not self.content:
|
||||
try:
|
||||
content = self.retry_download(self.url, max_wait_time)
|
||||
content = self.download_function()
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
raise BulkDownloaderException(f'Could not download resource: {e}')
|
||||
except BulkDownloaderException:
|
||||
|
||||
@@ -14,4 +14,4 @@ class Direct(BaseDownloader):
|
||||
super().__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
return [Resource(self.post, self.post.url)]
|
||||
return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))]
|
||||
|
||||
@@ -29,7 +29,7 @@ class Erome(BaseDownloader):
|
||||
for link in links:
|
||||
if not re.match(r'https?://.*', link):
|
||||
link = 'https://' + link
|
||||
out.append(Resource(self.post, link))
|
||||
out.append(Resource(self.post, link, Resource.retry_download(link, 300)))
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
@@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube):
|
||||
super(YoutubeDlFallback, self).__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
out = super()._download_video({})
|
||||
out = Resource(
|
||||
self.post,
|
||||
self.post.url,
|
||||
super()._download_video({}),
|
||||
super().get_video_attributes(self.post.url)['ext'],
|
||||
)
|
||||
return [out]
|
||||
|
||||
@staticmethod
|
||||
def can_handle_link(url: str) -> bool:
|
||||
yt_logger = logging.getLogger('youtube-dl')
|
||||
yt_logger.setLevel(logging.CRITICAL)
|
||||
with youtube_dl.YoutubeDL({
|
||||
'logger': yt_logger,
|
||||
}) as ydl:
|
||||
try:
|
||||
result = ydl.extract_info(url, download=False)
|
||||
if result:
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
return False
|
||||
return False
|
||||
attributes = YoutubeDlFallback.get_video_attributes(url)
|
||||
if attributes:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
@@ -31,7 +31,7 @@ class Gallery(BaseDownloader):
|
||||
|
||||
if not image_urls:
|
||||
raise SiteDownloaderError('No images found in Reddit gallery')
|
||||
return [Resource(self.post, url) for url in image_urls]
|
||||
return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls]
|
||||
|
||||
@ staticmethod
|
||||
def _get_links(id_dict: list[dict]) -> list[str]:
|
||||
|
||||
@@ -33,7 +33,7 @@ class Imgur(BaseDownloader):
|
||||
|
||||
def _compute_image_url(self, image: dict) -> Resource:
|
||||
image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
|
||||
return Resource(self.post, image_url)
|
||||
return Resource(self.post, image_url, Resource.retry_download(image_url, 300))
|
||||
|
||||
@staticmethod
|
||||
def _get_data(link: str) -> dict:
|
||||
|
||||
@@ -22,5 +22,10 @@ class PornHub(Youtube):
|
||||
'format': 'best',
|
||||
'nooverwrites': True,
|
||||
}
|
||||
out = self._download_video(ytdl_options)
|
||||
out = Resource(
|
||||
self.post,
|
||||
self.post.url,
|
||||
super()._download_video(ytdl_options),
|
||||
super().get_video_attributes(self.post.url)['ext'],
|
||||
)
|
||||
return [out]
|
||||
|
||||
@@ -18,7 +18,7 @@ class Redgifs(BaseDownloader):
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
media_url = self._get_link(self.post.url)
|
||||
return [Resource(self.post, media_url, '.mp4')]
|
||||
return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')]
|
||||
|
||||
@staticmethod
|
||||
def _get_link(url: str) -> str:
|
||||
|
||||
@@ -17,7 +17,7 @@ class SelfPost(BaseDownloader):
|
||||
super().__init__(post)
|
||||
|
||||
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
|
||||
out = Resource(self.post, self.post.url, '.txt')
|
||||
out = Resource(self.post, self.post.url, lambda: None, '.txt')
|
||||
out.content = self.export_to_string().encode('utf-8')
|
||||
out.create_hash()
|
||||
return [out]
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import youtube_dl
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError)
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
@@ -26,32 +26,47 @@ class Youtube(BaseDownloader):
|
||||
'playlistend': 1,
|
||||
'nooverwrites': True,
|
||||
}
|
||||
out = self._download_video(ytdl_options)
|
||||
return [out]
|
||||
download_function = self._download_video(ytdl_options)
|
||||
try:
|
||||
extension = self.get_video_attributes(self.post.url)['ext']
|
||||
except KeyError:
|
||||
raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}')
|
||||
res = Resource(self.post, self.post.url, download_function, extension)
|
||||
return [res]
|
||||
|
||||
def _download_video(self, ytdl_options: dict) -> Resource:
|
||||
def _download_video(self, ytdl_options: dict) -> Callable:
|
||||
yt_logger = logging.getLogger('youtube-dl')
|
||||
yt_logger.setLevel(logging.CRITICAL)
|
||||
ytdl_options['quiet'] = True
|
||||
ytdl_options['logger'] = yt_logger
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
download_path = Path(temp_dir).resolve()
|
||||
ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ytdl_options) as ydl:
|
||||
ydl.download([self.post.url])
|
||||
except youtube_dl.DownloadError as e:
|
||||
raise SiteDownloaderError(f'Youtube download failed: {e}')
|
||||
|
||||
downloaded_files = list(download_path.iterdir())
|
||||
if len(downloaded_files) > 0:
|
||||
downloaded_file = downloaded_files[0]
|
||||
else:
|
||||
raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
|
||||
extension = downloaded_file.suffix
|
||||
with open(downloaded_file, 'rb') as file:
|
||||
content = file.read()
|
||||
out = Resource(self.post, self.post.url, extension)
|
||||
out.content = content
|
||||
out.create_hash()
|
||||
return out
|
||||
def download() -> bytes:
    """Fetch the post's media with youtube-dl and return the file's bytes.

    The file is written into a temporary directory that is removed when
    this function returns, so the content is read before leaving the
    ``with`` block.

    Raises:
        SiteDownloaderError: if youtube-dl reports a download error.
        NotADownloadableLinkError: if nothing was written to the directory.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        target_dir = Path(scratch_dir).resolve()
        # Point youtube-dl at the scratch directory; it fills in the extension.
        ytdl_options['outtmpl'] = f'{target_dir}/test.%(ext)s'
        try:
            with youtube_dl.YoutubeDL(ytdl_options) as ydl:
                ydl.download([self.post.url])
        except youtube_dl.DownloadError as e:
            raise SiteDownloaderError(f'Youtube download failed: {e}')

        produced = list(target_dir.iterdir())
        if not produced:
            raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
        # Only the first entry in the directory is used.
        return produced[0].read_bytes()
|
||||
return download
|
||||
|
||||
@staticmethod
|
||||
def get_video_attributes(url: str) -> dict:
    """Ask youtube-dl for the metadata of ``url`` without downloading it.

    Args:
        url: Address to inspect.

    Returns:
        Whatever ``extract_info`` yields for the URL, or an empty dict when
        extraction raises. The failure is logged rather than propagated.
    """
    yt_logger = logging.getLogger('youtube-dl')
    # Silence youtube-dl's own output below CRITICAL.
    yt_logger.setLevel(logging.CRITICAL)
    with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
        try:
            result = ydl.extract_info(url, download=False)
            return result
        except Exception as e:
            logger.exception(e)
            # Return an empty dict instead of falling through to None: it is
            # still falsy for truthiness checks, and a key lookup on it raises
            # KeyError rather than TypeError.
            return {}
|
||||
|
||||
Reference in New Issue
Block a user