Implement callbacks for downloading

Author: Serene-Arc
Date: 2021-07-27 13:39:49 +10:00
Parent: 44453b1707
Commit: 3cdae99490
23 changed files with 112 additions and 92 deletions

View File

@@ -76,17 +76,17 @@ class Archiver(RedditConnector):
         logger.info(f'Record for entry item {praw_item.id} written to disk')
 
     def _write_entry_json(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.json')
+        resource = Resource(entry.source, '', lambda: None, '.json')
         content = json.dumps(entry.compile())
         self._write_content_to_disk(resource, content)
 
     def _write_entry_xml(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.xml')
+        resource = Resource(entry.source, '', lambda: None, '.xml')
         content = dict2xml.dict2xml(entry.compile(), wrap='root')
         self._write_content_to_disk(resource, content)
 
     def _write_entry_yaml(self, entry: BaseArchiveEntry):
-        resource = Resource(entry.source, '', '.yaml')
+        resource = Resource(entry.source, '', lambda: None, '.yaml')
         content = yaml.dump(entry.compile())
         self._write_content_to_disk(resource, content)
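
The archiver hunk above passes lambda: None because archive entries are serialised in memory and written straight to disk, so the newly required download callback has nothing to do. A minimal, self-contained sketch of that idea; LocalResource is an illustrative stand-in, not the project's Resource class:

    from typing import Callable, Optional


    class LocalResource:
        # Illustrative stand-in for the project's Resource class, not its real API.
        def __init__(self, url: str, download_function: Callable[[], Optional[bytes]], extension: str):
            self.url = url
            self.download_function = download_function
            self.extension = extension
            self.content: Optional[bytes] = None


    entry = LocalResource('', lambda: None, '.json')
    entry.content = b'{"id": "abc123"}'   # serialised archive entry, produced locally, never downloaded
    print(entry.extension, entry.download_function())   # .json None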

View File

@@ -82,7 +82,7 @@ class RedditDownloader(RedditConnector):
                 logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                 continue
             try:
-                res.download(self.args.max_wait_time)
+                res.download()
             except errors.BulkDownloaderException as e:
                 logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                              f'with downloader {downloader_class.__name__}: {e}')
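
At the call site, download() no longer takes max_wait_time: the retry policy is bound into the callable when the Resource is created, so the downloader loop simply invokes it. A tiny illustration of that binding idea, using functools.partial for brevity (the project itself uses a closure, as the resource.py hunk below shows); the URL and fetch function are hypothetical:

    from functools import partial


    def fetch(url: str, max_wait_time: int) -> bytes:
        # Stand-in for a real HTTP fetch; returns a marker instead of hitting the network.
        return f'GET {url} (retry cap {max_wait_time}s)'.encode()


    download_function = partial(fetch, 'https://example.com/a.jpg', 300)   # configuration bound up front
    print(download_function())   # the download loop no longer needs to know about max_wait_time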

View File

@@ -6,7 +6,7 @@ import logging
 import re
 import time
 import urllib.parse
-from typing import Optional
+from typing import Callable, Optional
 
 import _hashlib
 import requests
@@ -18,40 +18,44 @@ logger = logging.getLogger(__name__)
 
 class Resource:
-    def __init__(self, source_submission: Submission, url: str, extension: str = None):
+    def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
         self.source_submission = source_submission
         self.content: Optional[bytes] = None
         self.url = url
         self.hash: Optional[_hashlib.HASH] = None
         self.extension = extension
+        self.download_function = download_function
         if not self.extension:
             self.extension = self._determine_extension()
 
     @staticmethod
-    def retry_download(url: str, max_wait_time: int, current_wait_time: int = 60) -> Optional[bytes]:
-        try:
-            response = requests.get(url)
-            if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
-                return response.content
-            elif response.status_code in (408, 429):
-                raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
-            else:
-                raise BulkDownloaderException(
-                    f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
-        except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
-            logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
-            time.sleep(current_wait_time)
-            if current_wait_time < max_wait_time:
-                current_wait_time += 60
-                return Resource.retry_download(url, max_wait_time, current_wait_time)
-            else:
-                logger.error(f'Max wait time exceeded for resource at url {url}')
-                raise
+    def retry_download(url: str, max_wait_time: int) -> Callable:
+        def http_download() -> Optional[bytes]:
+            current_wait_time = 60
+            while True:
+                try:
+                    response = requests.get(url)
+                    if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
+                        return response.content
+                    elif response.status_code in (408, 429):
+                        raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
+                    else:
+                        raise BulkDownloaderException(
+                            f'Unrecoverable error requesting resource: HTTP Code {response.status_code}')
+                except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
+                    logger.warning(f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}')
+                    time.sleep(current_wait_time)
+                    if current_wait_time < max_wait_time:
+                        current_wait_time += 60
+                    else:
+                        logger.error(f'Max wait time exceeded for resource at url {url}')
+                        raise
+        return http_download
 
-    def download(self, max_wait_time: int):
+    def download(self):
         if not self.content:
             try:
-                content = self.retry_download(self.url, max_wait_time)
+                content = self.download_function()
             except requests.exceptions.ConnectionError as e:
                 raise BulkDownloaderException(f'Could not download resource: {e}')
             except BulkDownloaderException:
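
This is the core of the change: retry_download is now a factory that captures url and max_wait_time in a closure and hands back a zero-argument callable, and download() just invokes whatever callable the Resource was built with. A self-contained sketch of the same pattern, with simplified error handling and illustrative names (retrying_fetch and SimpleResource are not the project's API):

    import time
    from typing import Callable, Optional

    import requests


    def retrying_fetch(url: str, max_wait_time: int) -> Callable[[], Optional[bytes]]:
        # Factory: capture url and max_wait_time, return a zero-argument download callable.
        def http_download() -> Optional[bytes]:
            current_wait_time = 60
            while True:
                try:
                    response = requests.get(url)
                    if 200 <= response.status_code < 300 and response.content:
                        return response.content
                    if response.status_code in (408, 429):
                        raise requests.exceptions.ConnectionError(f'Response code {response.status_code}')
                    raise RuntimeError(f'Unrecoverable error: HTTP {response.status_code}')
                except requests.exceptions.ConnectionError as e:
                    print(f'Error downloading {url}, waiting {current_wait_time}s: {e}')
                    time.sleep(current_wait_time)
                    if current_wait_time < max_wait_time:
                        current_wait_time += 60
                    else:
                        raise
        return http_download


    class SimpleResource:
        # Illustrative stand-in for the project's Resource: downloading is delegated to a callback.
        def __init__(self, url: str, download_function: Callable[[], Optional[bytes]]):
            self.url = url
            self.download_function = download_function
            self.content: Optional[bytes] = None

        def download(self) -> None:
            if not self.content:
                self.content = self.download_function()


    res = SimpleResource('https://example.com', retrying_fetch('https://example.com', 300))
    # res.download()   # would perform a real GET to example.com via the stored callback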

View File

@@ -14,4 +14,4 @@ class Direct(BaseDownloader):
         super().__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        return [Resource(self.post, self.post.url)]
+        return [Resource(self.post, self.post.url, Resource.retry_download(self.post.url, 300))]
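
Direct, Erome, Gallery, Imgur and Redgifs all follow the same recipe below: build the Resource with Resource.retry_download(url, 300), so each resource carries its own pre-configured retry callback. Note that the 300-second cap is hard-coded at these call sites rather than read from self.args.max_wait_time. A small sketch of the per-URL binding; names and URLs are illustrative only:

    from typing import Callable


    def retrying_fetch(url: str, max_wait_time: int) -> Callable[[], bytes]:
        # Stand-in for Resource.retry_download: each call returns a callable bound to one URL.
        return lambda: f'GET {url} (retry cap {max_wait_time}s)'.encode()


    image_urls = ['https://example.com/1.jpg', 'https://example.com/2.jpg']   # hypothetical gallery URLs
    download_functions = [retrying_fetch(url, 300) for url in image_urls]
    for download in download_functions:
        print(download())   # each closure remembers its own URL and wait cap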

View File

@@ -29,7 +29,7 @@ class Erome(BaseDownloader):
         for link in links:
             if not re.match(r'https?://.*', link):
                 link = 'https://' + link
-            out.append(Resource(self.post, link))
+            out.append(Resource(self.post, link, Resource.retry_download(link, 300)))
         return out
 
     @staticmethod

View File

@@ -4,7 +4,6 @@
 import logging
 from typing import Optional
 
-import youtube_dl
 from praw.models import Submission
 
 from bdfr.resource import Resource
@@ -20,21 +19,18 @@ class YoutubeDlFallback(BaseFallbackDownloader, Youtube):
         super(YoutubeDlFallback, self).__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        out = super()._download_video({})
+        out = Resource(
+            self.post,
+            self.post.url,
+            super()._download_video({}),
+            super().get_video_attributes(self.post.url)['ext'],
+        )
         return [out]
 
     @staticmethod
     def can_handle_link(url: str) -> bool:
-        yt_logger = logging.getLogger('youtube-dl')
-        yt_logger.setLevel(logging.CRITICAL)
-        with youtube_dl.YoutubeDL({
-            'logger': yt_logger,
-        }) as ydl:
-            try:
-                result = ydl.extract_info(url, download=False)
-                if result:
-                    return True
-            except Exception as e:
-                logger.exception(e)
-                return False
-        return False
+        attributes = YoutubeDlFallback.get_video_attributes(url)
+        if attributes:
+            return True
+        else:
+            return False
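
can_handle_link now defers to get_video_attributes, the static helper added to the Youtube base class later in this diff, instead of driving youtube_dl directly. A hedged sketch of that probe: extract_info(url, download=False) returns an info dict whose 'ext' key supplies the extension that find_resources passes to the Resource (the sample URL is illustrative only):

    import logging

    import youtube_dl


    def probe(url: str) -> dict:
        # Mirrors the get_video_attributes helper: ask youtube-dl for metadata without downloading.
        yt_logger = logging.getLogger('youtube-dl')
        yt_logger.setLevel(logging.CRITICAL)
        with youtube_dl.YoutubeDL({'logger': yt_logger}) as ydl:
            try:
                return ydl.extract_info(url, download=False) or {}
            except Exception:
                return {}


    info = probe('https://www.youtube.com/watch?v=dQw4w9WgXcQ')   # illustrative URL
    print(bool(info), info.get('ext'))   # truthy dict => can_handle_link returns True; 'ext' feeds the Resource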

View File

@@ -31,7 +31,7 @@ class Gallery(BaseDownloader):
         if not image_urls:
             raise SiteDownloaderError('No images found in Reddit gallery')
 
-        return [Resource(self.post, url) for url in image_urls]
+        return [Resource(self.post, url, Resource.retry_download(url, 300)) for url in image_urls]
 
     @ staticmethod
     def _get_links(id_dict: list[dict]) -> list[str]:

View File

@@ -33,7 +33,7 @@ class Imgur(BaseDownloader):
 
     def _compute_image_url(self, image: dict) -> Resource:
         image_url = 'https://i.imgur.com/' + image['hash'] + self._validate_extension(image['ext'])
-        return Resource(self.post, image_url)
+        return Resource(self.post, image_url, Resource.retry_download(image_url, 300))
 
     @staticmethod
     def _get_data(link: str) -> dict:

View File

@@ -22,5 +22,10 @@ class PornHub(Youtube):
             'format': 'best',
             'nooverwrites': True,
         }
-        out = self._download_video(ytdl_options)
+        out = Resource(
+            self.post,
+            self.post.url,
+            super()._download_video(ytdl_options),
+            super().get_video_attributes(self.post.url)['ext'],
+        )
         return [out]

View File

@@ -18,7 +18,7 @@ class Redgifs(BaseDownloader):
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
         media_url = self._get_link(self.post.url)
-        return [Resource(self.post, media_url, '.mp4')]
+        return [Resource(self.post, media_url, Resource.retry_download(media_url, 300), '.mp4')]
 
     @staticmethod
     def _get_link(url: str) -> str:

View File

@@ -17,7 +17,7 @@ class SelfPost(BaseDownloader):
         super().__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        out = Resource(self.post, self.post.url, '.txt')
+        out = Resource(self.post, self.post.url, lambda: None, '.txt')
         out.content = self.export_to_string().encode('utf-8')
         out.create_hash()
         return [out]

View File

@@ -3,12 +3,12 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Callable, Optional
 
 import youtube_dl
 from praw.models import Submission
 
-from bdfr.exceptions import (NotADownloadableLinkError, SiteDownloaderError)
+from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
 from bdfr.resource import Resource
 from bdfr.site_authenticator import SiteAuthenticator
 from bdfr.site_downloaders.base_downloader import BaseDownloader
@@ -26,32 +26,47 @@ class Youtube(BaseDownloader):
             'playlistend': 1,
             'nooverwrites': True,
         }
-        out = self._download_video(ytdl_options)
-        return [out]
+        download_function = self._download_video(ytdl_options)
+        try:
+            extension = self.get_video_attributes(self.post.url)['ext']
+        except KeyError:
+            raise NotADownloadableLinkError(f'Youtube-DL cannot download URL {self.post.url}')
+        res = Resource(self.post, self.post.url, download_function, extension)
+        return [res]
 
-    def _download_video(self, ytdl_options: dict) -> Resource:
+    def _download_video(self, ytdl_options: dict) -> Callable:
         yt_logger = logging.getLogger('youtube-dl')
         yt_logger.setLevel(logging.CRITICAL)
         ytdl_options['quiet'] = True
         ytdl_options['logger'] = yt_logger
-        with tempfile.TemporaryDirectory() as temp_dir:
-            download_path = Path(temp_dir).resolve()
-            ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
-            try:
-                with youtube_dl.YoutubeDL(ytdl_options) as ydl:
-                    ydl.download([self.post.url])
-            except youtube_dl.DownloadError as e:
-                raise SiteDownloaderError(f'Youtube download failed: {e}')
-            downloaded_files = list(download_path.iterdir())
-            if len(downloaded_files) > 0:
-                downloaded_file = downloaded_files[0]
-            else:
-                raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
-            extension = downloaded_file.suffix
-            with open(downloaded_file, 'rb') as file:
-                content = file.read()
-        out = Resource(self.post, self.post.url, extension)
-        out.content = content
-        out.create_hash()
-        return out
+
+        def download() -> bytes:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                download_path = Path(temp_dir).resolve()
+                ytdl_options['outtmpl'] = str(download_path) + '/' + 'test.%(ext)s'
+                try:
+                    with youtube_dl.YoutubeDL(ytdl_options) as ydl:
+                        ydl.download([self.post.url])
+                except youtube_dl.DownloadError as e:
+                    raise SiteDownloaderError(f'Youtube download failed: {e}')
+                downloaded_files = list(download_path.iterdir())
+                if len(downloaded_files) > 0:
+                    downloaded_file = downloaded_files[0]
+                else:
+                    raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
+                with open(downloaded_file, 'rb') as file:
+                    content = file.read()
+            return content
+        return download
+
+    @staticmethod
+    def get_video_attributes(url: str) -> dict:
+        yt_logger = logging.getLogger('youtube-dl')
+        yt_logger.setLevel(logging.CRITICAL)
+        with youtube_dl.YoutubeDL({'logger': yt_logger, }) as ydl:
+            try:
+                result = ydl.extract_info(url, download=False)
+                return result
+            except Exception as e:
+                logger.exception(e)
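
The refactored _download_video no longer downloads eagerly and wraps the bytes in a Resource; it returns a closure that, when invoked (typically from Resource.download()), fetches the media into a temporary directory with youtube_dl and returns its bytes. A self-contained sketch of that deferred-download shape, with illustrative names and options (make_ytdl_download is not the project's API):

    import logging
    import tempfile
    from pathlib import Path
    from typing import Callable

    import youtube_dl


    def make_ytdl_download(url: str, ytdl_options: dict) -> Callable[[], bytes]:
        # Factory: configure youtube-dl up front, return a callable that downloads on demand.
        yt_logger = logging.getLogger('youtube-dl')
        yt_logger.setLevel(logging.CRITICAL)
        base_options = {**ytdl_options, 'quiet': True, 'logger': yt_logger}

        def download() -> bytes:
            with tempfile.TemporaryDirectory() as temp_dir:
                download_path = Path(temp_dir).resolve()
                options = {**base_options, 'outtmpl': str(download_path / 'media.%(ext)s')}
                with youtube_dl.YoutubeDL(options) as ydl:
                    ydl.download([url])
                downloaded_files = list(download_path.iterdir())
                if not downloaded_files:
                    raise RuntimeError(f'No media exists in the URL {url}')
                return downloaded_files[0].read_bytes()

        return download


    fetch = make_ytdl_download('https://www.youtube.com/watch?v=dQw4w9WgXcQ', {'format': 'best'})   # illustrative
    # content = fetch()   # nothing is downloaded until the callable actually runs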