# bulkredditdownloader/resource.py
#
# Reconstructed from the "+" side of a newline-collapsed git patch:
#   commit  228cd5f687736112d097880bd741172a46a8ba5e
#   author  Serene-Arc
#   date    Fri, 26 Feb 2021 18:56:05 +1000
#   subject Change Resource model
# NOTE(review): the patch hunk starts at line 3 of the file, so whatever the
# first two lines of the real file contain is not visible here.

import hashlib
import re
import time
import urllib.parse
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Needed for annotations only; ``_hashlib`` is a private C module, so it
    # is deliberately not imported at runtime.
    import _hashlib
    from praw.models import Submission


class Resource:
    """A remote file attached to a submission, fetched on demand.

    Attributes:
        source_submission: The submission this resource was found in.
        url: Location the content is downloaded from.
        content: Raw bytes of the resource; ``None`` until ``download`` succeeds.
        hash: MD5 hash object of ``content``; ``None`` until ``download`` succeeds.
        extension: File extension including the leading dot, or ``None`` when
            none was supplied and none could be derived from ``url``.
    """

    # Back-off schedule used by ``retry_download`` (seconds).
    _MAX_WAIT_TIME = 300
    _WAIT_TIME_STEP = 60

    def __init__(self, source_submission: 'Submission', url: str, extension: Optional[str] = None):
        """Create a resource without downloading anything.

        Args:
            source_submission: The submission the resource belongs to.
            url: URL of the resource.
            extension: Explicit file extension. When falsy, the extension is
                derived from ``url``.
        """
        self.source_submission = source_submission
        self.content: Optional[bytes] = None
        self.url = url
        self.hash: Optional['_hashlib.HASH'] = None
        self.extension = extension or self._determine_extension()

    @staticmethod
    def retry_download(url: str, wait_time: int) -> Optional[bytes]:
        """Fetch ``url``, retrying with a linearly growing delay.

        Any non-200 response is handled exactly like a connection error. After
        a failed attempt the method sleeps ``wait_time`` seconds and tries again
        with the delay increased by 60 seconds, until the delay reaches 300.

        Args:
            url: URL to fetch.
            wait_time: Delay in seconds to apply after the first failed attempt.

        Returns:
            The response body, or ``None`` once the retries are exhausted.
        """
        import requests

        while True:
            try:
                response = requests.get(url)
            except requests.exceptions.ConnectionError:
                response = None
            if response is not None and response.status_code == 200:
                return response.content
            if wait_time >= Resource._MAX_WAIT_TIME:
                # No further attempt follows, so sleeping here would only
                # delay reporting the failure.
                return None
            time.sleep(wait_time)
            wait_time += Resource._WAIT_TIME_STEP

    def download(self) -> None:
        """Download the resource and record its content and MD5 hash.

        Does nothing when content is already present.

        Raises:
            BulkDownloaderException: If the resource could not be retrieved.
        """
        # Compare against None: an empty body (b'') is a valid download.
        if self.content is not None:
            return
        content = self.retry_download(self.url, 0)
        if content is None:
            from bulkredditdownloader.errors import BulkDownloaderException

            raise BulkDownloaderException('Could not download resource')
        self.content = content
        self.hash = hashlib.md5(content)

    def _determine_extension(self) -> Optional[str]:
        """Derive a file extension from the path component of ``self.url``.

        Only the URL path is inspected, so query strings, fragments and the
        host name cannot hide or masquerade as an extension.

        Returns:
            The trailing ``.xxx`` (3 to 5 characters after the dot, none of
            them ``/``) of the path, or ``None`` when the path has no such
            suffix.
        """
        url_path = urllib.parse.urlsplit(self.url).path
        match = re.search(r'.*(\.[^/]{3,5})$', url_path)
        return match.group(1) if match else None