Update gallery code to work with NSFW galleries

This commit is contained in:
Serene-Arc
2021-06-25 14:00:10 +10:00
parent e8998da2f0
commit d53b3b7274
2 changed files with 50 additions and 43 deletions

View File

@@ -5,6 +5,7 @@ import re
from typing import Optional
import bs4
import requests
from praw.models import Submission
from bdfr.exceptions import SiteDownloaderError
@@ -20,21 +21,21 @@ class Gallery(BaseDownloader):
super().__init__(post)
def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
image_urls = self._get_links(self.post.url)
image_urls = self._get_links(self.post.gallery_data['items'])
if not image_urls:
raise SiteDownloaderError('No images found in Reddit gallery')
return [Resource(self.post, url) for url in image_urls]
@ staticmethod
def _get_links(url: str) -> list[str]:
resource_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/67.0.3396.87 Safari/537.36 OPR/54.0.2952.64',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
page = Gallery.retrieve_url(url, headers=resource_headers)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
links = soup.findAll('a', attrs={'target': '_blank', 'href': re.compile(r'https://preview\.redd\.it.*')})
links = [link.get('href') for link in links]
return links
def _get_links(id_dict: list[dict]) -> list[str]:
out = []
for item in id_dict:
image_id = item['media_id']
possible_extensions = ('.jpg', '.png', '.gif', '.gifv', '.jpeg')
for extension in possible_extensions:
test_url = f'https://i.redd.it/{image_id}{extension}'
response = requests.head(test_url)
if response.status_code == 200:
out.append(test_url)
break
return out

View File

@@ -8,30 +8,32 @@ from bdfr.site_downloaders.gallery import Gallery
@pytest.mark.online
@pytest.mark.parametrize(('test_url', 'expected'), (
('https://www.reddit.com/gallery/m6lvrh', {
'https://preview.redd.it/18nzv9ch0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=470a825b9c364e0eace0036882dcff926f821de8',
'https://preview.redd.it/jqkizcch0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=ae4f552a18066bb6727676b14f2451c5feecf805',
'https://preview.redd.it/k0fnqzbh0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=c6a10fececdc33983487c16ad02219fd3fc6cd76',
'https://preview.redd.it/m3gamzbh0hn61.jpg?width=4160&'
'format=pjpg&auto=webp&s=0dd90f324711851953e24873290b7f29ec73c444'
@pytest.mark.parametrize(('test_ids', 'expected'), (
([
{'media_id': '18nzv9ch0hn61'},
{'media_id': 'jqkizcch0hn61'},
{'media_id': 'k0fnqzbh0hn61'},
{'media_id': 'm3gamzbh0hn61'},
], {
'https://i.redd.it/18nzv9ch0hn61.jpg',
'https://i.redd.it/jqkizcch0hn61.jpg',
'https://i.redd.it/k0fnqzbh0hn61.jpg',
'https://i.redd.it/m3gamzbh0hn61.jpg'
}),
('https://www.reddit.com/gallery/ljyy27', {
'https://preview.redd.it/04vxj25uqih61.png?width=92&'
'format=png&auto=webp&s=6513f3a5c5128ee7680d402cab5ea4fb2bbeead4',
'https://preview.redd.it/0fnx83kpqih61.png?width=241&'
'format=png&auto=webp&s=655e9deb6f499c9ba1476eaff56787a697e6255a',
'https://preview.redd.it/7zkmr1wqqih61.png?width=237&'
'format=png&auto=webp&s=19de214e634cbcad9959f19570c616e29be0c0b0',
'https://preview.redd.it/u37k5gxrqih61.png?width=443&'
'format=png&auto=webp&s=e74dae31841fe4a2545ffd794d3b25b9ff0eb862'
([
{'media_id': '04vxj25uqih61'},
{'media_id': '0fnx83kpqih61'},
{'media_id': '7zkmr1wqqih61'},
{'media_id': 'u37k5gxrqih61'},
], {
'https://i.redd.it/04vxj25uqih61.png',
'https://i.redd.it/0fnx83kpqih61.png',
'https://i.redd.it/7zkmr1wqqih61.png',
'https://i.redd.it/u37k5gxrqih61.png'
}),
))
def test_gallery_get_links(test_url: str, expected: set[str]):
results = Gallery._get_links(test_url)
def test_gallery_get_links(test_ids: list[dict], expected: set[str]):
results = Gallery._get_links(test_ids)
assert set(results) == expected
@@ -39,16 +41,20 @@ def test_gallery_get_links(test_url: str, expected: set[str]):
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'expected_hashes'), (
('m6lvrh', {
'6c8a892ae8066cbe119218bcaac731e1',
'93ce177f8cb7994906795f4615114d13',
'9a293adf19354f14582608cf22124574',
'b73e2c3daee02f99404644ea02f1ae65'
'5c42b8341dd56eebef792e86f3981c6a',
'8f38d76da46f4057bf2773a778e725ca',
'f5776f8f90491c8b770b8e0a6bfa49b3',
'fa1a43c94da30026ad19a9813a0ed2c2',
}),
('ljyy27', {
'1bc38bed88f9c4770e22a37122d5c941',
'2539a92b78f3968a069df2dffe2279f9',
'37dea50281c219b905e46edeefc1a18d',
'ec4924cf40549728dcf53dd40bc7a73c'
'359c203ec81d0bc00e675f1023673238',
'79262fd46bce5bfa550d878a3b898be4',
'808c35267f44acb523ce03bfa5687404',
'ec8b65bdb7f1279c4b3af0ea2bbb30c3',
}),
('nxyahw', {
'b89a3f41feb73ec1136ec4ffa7353eb1',
'cabb76fd6fd11ae6e115a2039eb09f04',
}),
))
def test_gallery_download(test_submission_id: str, expected_hashes: set[str], reddit_instance: praw.Reddit):