Update gallery code to work with NSFW galleries

This commit is contained in:
Serene-Arc
2021-06-25 14:00:10 +10:00
parent e8998da2f0
commit d53b3b7274
2 changed files with 50 additions and 43 deletions

View File

@@ -5,6 +5,7 @@ import re
 from typing import Optional
 
 import bs4
+import requests
 from praw.models import Submission
 
 from bdfr.exceptions import SiteDownloaderError
@@ -20,21 +21,21 @@ class Gallery(BaseDownloader):
         super().__init__(post)
 
     def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
-        image_urls = self._get_links(self.post.url)
+        image_urls = self._get_links(self.post.gallery_data['items'])
         if not image_urls:
             raise SiteDownloaderError('No images found in Reddit gallery')
         return [Resource(self.post, url) for url in image_urls]
 
-    @staticmethod
-    def _get_links(url: str) -> list[str]:
-        resource_headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
-                          ' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-        }
-        page = Gallery.retrieve_url(url, headers=resource_headers)
-        soup = bs4.BeautifulSoup(page.text, 'html.parser')
-
-        links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
-        links = [link.get('href') for link in links]
-        return links
+    @ staticmethod
+    def _get_links(id_dict: list[dict]) -> list[str]:
+        out = []
+        for item in id_dict:
+            image_id = item['media_id']
+            possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg')
+            for extension in possible_extensions:
+                test_url = f'https://i.redd.it/{image_id}{extension}'
+                response = requests.head(test_url)
+                if response.status_code == 200:
+                    out.append(test_url)
+                    break
+        return out

View File

@@ -8,30 +8,32 @@ from bdfr.site_downloaders.gallery import Gallery
 
 
 @pytest.mark.online
-@pytest.mark.parametrize(('test_url', 'expected'), (
-    ('https://www.reddit.com/gallery/m6lvrh', {
-        'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8',
-        'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805',
-        'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76',
-        'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&'
-        'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444'
+@pytest.mark.parametrize(('test_ids', 'expected'), (
+    ([
+        {'media_id': '18nzv9ch0hn61'},
+        {'media_id': 'jqkizcch0hn61'},
+        {'media_id': 'k0fnqzbh0hn61'},
+        {'media_id': 'm3gamzbh0hn61'},
+    ], {
+        'https://i.redd.it/18nzv9ch0hn61.jpg',
+        'https://i.redd.it/jqkizcch0hn61.jpg',
+        'https://i.redd.it/k0fnqzbh0hn61.jpg',
+        'https://i.redd.it/m3gamzbh0hn61.jpg'
     }),
-    ('https://www.reddit.com/gallery/ljyy27', {
-        'https://preview.redd.it/04vxj25uqih61.png?width=92&'
-        'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4',
-        'https://preview.redd.it/0fnx83kpqih61.png?width=241&'
-        'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a',
-        'https://preview.redd.it/7zkmr1wqqih61.png?width=237&'
-        'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0',
-        'https://preview.redd.it/u37k5gxrqih61.png?width=443&'
-        'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862'
+    ([
+        {'media_id': '04vxj25uqih61'},
+        {'media_id': '0fnx83kpqih61'},
+        {'media_id': '7zkmr1wqqih61'},
+        {'media_id': 'u37k5gxrqih61'},
+    ], {
+        'https://i.redd.it/04vxj25uqih61.png',
+        'https://i.redd.it/0fnx83kpqih61.png',
+        'https://i.redd.it/7zkmr1wqqih61.png',
+        'https://i.redd.it/u37k5gxrqih61.png'
     }),
 ))
-def test_gallery_get_links(test_url: str, expected: set[str]):
-    results = Gallery._get_links(test_url)
+def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
+    results = Gallery._get_links(test_ids)
     assert set(results) == expected
 
 
@@ -39,16 +41,20 @@ def test_gallery_get_links(test_url: str, expected: set[str]):
 @pytest.mark.reddit
 @pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), (
     ('m6lvrh', {
-        '6c8a892ae8066cbe119218bcaac731e1',
-        '93ce177f8cb7994906795f4615114d13',
-        '9a293adf19354f14582608cf22124574',
-        'b73e2c3daee02f99404644ea02f1ae65'
+        '5c42b8341dd56eebef792e86f3981c6a',
+        '8f38d76da46f4057bf2773a778e725ca',
+        'f5776f8f90491c8b770b8e0a6bfa49b3',
+        'fa1a43c94da30026ad19a9813a0ed2c2',
     }),
     ('ljyy27', {
-        '1bc38bed88f9c4770e22a37122d5c941',
-        '2539a92b78f3968a069df2dffe2279f9',
-        '37dea50281c219b905e46edeefc1a18d',
-        'ec4924cf40549728dcf53dd40bc7a73c'
+        '359c203ec81d0bc00e675f1023673238',
+        '79262fd46bce5bfa550d878a3b898be4',
+        '808c35267f44acb523ce03bfa5687404',
+        'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
+    }),
+    ('nxyahw', {
+        'b89a3f41feb73ec1136ec4ffa7353eb1',
+        'cabb76fd6fd11ae6e115a2039eb09f04',
     }),
 ))
 def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):