From b74e93d2b792b0a97310e2a778f90fa51934c35c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 25 May 2021 18:51:00 +1000 Subject: [PATCH 1/4] Fix typo in test name --- tests/site_downloaders/test_download_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index f02e9f7..f89df8a 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -69,6 +69,6 @@ def test_factory_lever_bad(test_url: str): ('https://youtube.com/watch?v=Gv8Wz74FjVA', 'youtube.com/watch'), ('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'), )) -def test_sanitise_urll(test_url: str, expected: str): +def test_sanitise_url(test_url: str, expected: str): result = DownloadFactory._sanitise_url(test_url) assert result == expected From f47688812d1755b58105cd5400205e3f75410441 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 25 May 2021 18:51:24 +1000 Subject: [PATCH 2/4] Rename function --- bdfr/site_downloaders/download_factory.py | 4 ++-- tests/site_downloaders/test_download_factory.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 7035dc2..8eff2b8 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -21,7 +21,7 @@ from bdfr.site_downloaders.youtube import Youtube class DownloadFactory: @staticmethod def pull_lever(url: str) -> Type[BaseDownloader]: - sanitised_url = DownloadFactory._sanitise_url(url) + sanitised_url = DownloadFactory.sanitise_url(url) if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url): return Imgur elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url): @@ -49,7 +49,7 @@ class DownloadFactory: f'No downloader module exists for url {url}') @staticmethod - def _sanitise_url(url: str) -> str: + def sanitise_url(url: str) -> str: beginning_regex = re.compile(r'\s*(www\.?)?') split_url = urllib.parse.urlsplit(url) split_url = split_url.netloc + split_url.path diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index f89df8a..d5e84d8 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -70,5 +70,5 @@ def test_factory_lever_bad(test_url: str): ('https://i.imgur.com/BuzvZwb.gifv', 'i.imgur.com/BuzvZwb.gifv'), )) def test_sanitise_url(test_url: str, expected: str): - result = DownloadFactory._sanitise_url(test_url) + result = DownloadFactory.sanitise_url(test_url) assert result == expected From 87959028e5a0ffffe545b821660fa53e6f50dfe0 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 25 May 2021 18:59:32 +1000 Subject: [PATCH 3/4] Add blacklist for web filetypes --- bdfr/site_downloaders/download_factory.py | 20 ++++++++++++++++++- .../site_downloaders/test_download_factory.py | 13 ++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 8eff2b8..cbfee2d 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -24,7 +24,8 @@ class DownloadFactory: sanitised_url = DownloadFactory.sanitise_url(url) if re.match(r'(i\.)?imgur.*\.gifv$', sanitised_url): return Imgur - elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url): + elif re.match(r'.*/.*\.\w{3,4}(\?[\w;&=]*)?$', sanitised_url) and \ + not DownloadFactory.is_web_resource(sanitised_url): return Direct elif re.match(r'erome\.com.*', sanitised_url): return Erome @@ -55,3 +56,20 @@ class DownloadFactory: split_url = split_url.netloc + split_url.path split_url = re.sub(beginning_regex, '', split_url) return split_url + + @staticmethod + def is_web_resource(url: str) -> bool: + web_extensions = ( + 'asp', + 'cfm', + 'cfml', + 'css', + 'html', + 'js', + 'php', + 'xhtml', + ) + if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url): + return True + else: + return False diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index d5e84d8..4b5356c 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -72,3 +72,16 @@ def test_factory_lever_bad(test_url: str): def test_sanitise_url(test_url: str, expected: str): result = DownloadFactory.sanitise_url(test_url) assert result == expected + + +@pytest.mark.parametrize(('test_url', 'expected'), ( + ('www.example.com/test.asp', True), + ('www.example.com/test.html', True), + ('www.example.com/test.js', True), + ('www.example.com/test.xhtml', True), + ('www.example.com/test.mp4', False), + ('www.example.com/test.png', False), +)) +def test_is_web_resource(test_url: str, expected: bool): + result = DownloadFactory.is_web_resource(test_url) + assert result == expected From fef2fc864bb75f601253a590d8728cce89cd89db Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 25 May 2021 19:33:32 +1000 Subject: [PATCH 4/4] Update blacklist --- bdfr/site_downloaders/download_factory.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index cbfee2d..41813f9 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -61,12 +61,15 @@ class DownloadFactory: def is_web_resource(url: str) -> bool: web_extensions = ( 'asp', + 'aspx', 'cfm', 'cfml', 'css', + 'htm', 'html', 'js', 'php', + 'php3', 'xhtml', ) if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url):