From e35a663f6f36949dc8b8063c878e1ca6fa9ce2b0 Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Sun, 17 Nov 2024 03:44:43 +0800 Subject: [PATCH 01/11] enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index c2e17d3247..cb9f885352 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -327,7 +327,18 @@ def download_and_extract( be False. progress: whether to display progress bar. """ - with tempfile.TemporaryDirectory() as tmp_dir: - filename = filepath or Path(tmp_dir, _basename(url)).resolve() - download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) - extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) + urlFilenameExtension = ''.join(Path(".", _basename(url)).resolve().suffixes) + if filepath: + FilepathExtenstion = ''.join(Path(".", _basename(filepath)).resolve().suffixes) + if urlFilenameExtension != FilepathExtenstion: + raise NotImplementedError( + f'The file types do not match: url={urlFilenameExtension}, but filepath={FilepathExtenstion}' + ) + else: + with tempfile.TemporaryDirectory() as tmp_dir: + if filepath: + filename = filepath + else: + filename = Path(tmp_dir, _basename(url)).resolve() + download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) + extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) From 9d3c395837becda6eea347dbda500d276ea2903c Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Sun, 17 Nov 2024 04:38:07 +0800 Subject: [PATCH 02/11] fix commit Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index cb9f885352..742898f53f 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -334,11 +334,11 @@ def download_and_extract( raise NotImplementedError( f'The file types do not match: url={urlFilenameExtension}, but filepath={FilepathExtenstion}' ) - else: - with tempfile.TemporaryDirectory() as tmp_dir: - if filepath: - filename = filepath - else: - filename = Path(tmp_dir, _basename(url)).resolve() - download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) - extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) + with tempfile.TemporaryDirectory() as tmp_dir: + if filepath: + filename = filepath + else: + filename = Path(tmp_dir, _basename(url)).resolve() + filename = filepath if filepath else Path(tmp_dir, _basename(url)).resolve() + download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) + extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) \ No newline at end of file From 0441871a23900400d56aa403bcd5fa3fe24c07b0 Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Sun, 17 Nov 2024 04:40:58 +0800 Subject: [PATCH 03/11] fix some problem Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 742898f53f..f2d91de399 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -339,6 +339,5 @@ def download_and_extract( filename = filepath else: filename = Path(tmp_dir, _basename(url)).resolve() - filename = filepath if filepath else Path(tmp_dir, _basename(url)).resolve() download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) - extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) \ No newline at end of file + extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) From a9a01715480caa6d45a2275e328b0f11ed5e17dd Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 5 Dec 2024 00:14:27 +0800 Subject: [PATCH 04/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index f2d91de399..660c34699e 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -327,17 +327,18 @@ def download_and_extract( be False. progress: whether to display progress bar. """ - urlFilenameExtension = ''.join(Path(".", _basename(url)).resolve().suffixes) - if filepath: - FilepathExtenstion = ''.join(Path(".", _basename(filepath)).resolve().suffixes) - if urlFilenameExtension != FilepathExtenstion: - raise NotImplementedError( - f'The file types do not match: url={urlFilenameExtension}, but filepath={FilepathExtenstion}' + url_filename_ext = "".join(Path(".", _basename(url)).resolve().suffixes) + filepath_ext = "".join(Path(".", _basename(filepath)).resolve().suffixes) + if filepath not in ["", "."]: + if filepath_ext == "": + new_filepath = filepath + url_filename_ext + logger.warning( + f"filepath={filepath}, which missing file extension. Auto-appending extension to: {new_filepath}" ) + filepath = new_filepath + if filepath_ext and filepath_ext != url_filename_ext: + logger.warning(f"Expected extension {url_filename_ext}, but get {filepath_ext}, may cause unexpected errors!") with tempfile.TemporaryDirectory() as tmp_dir: - if filepath: - filename = filepath - else: - filename = Path(tmp_dir, _basename(url)).resolve() + filename = filepath or Path(tmp_dir, _basename(url)).resolve() download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) From e70e59c4d52a6b6aca808e81bc28c2988a080751 Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Sun, 15 Dec 2024 19:04:38 +0800 Subject: [PATCH 05/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 50 +++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 660c34699e..f79362afee 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -15,6 +15,7 @@ import json import logging import os +import re import shutil import sys import tarfile @@ -24,9 +25,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.error import ContentTooShortError, HTTPError, URLError -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from urllib.request import urlopen, urlretrieve +import requests + from monai.config.type_definitions import PathLike from monai.utils import look_up_option, min_version, optional_import @@ -298,6 +301,20 @@ def extractall( ) +def get_filename_from_url(data_url: str): + try: + response = requests.head(data_url, allow_redirects=True) + content_disposition = response.headers.get("Content-Disposition") + if content_disposition: + filename = re.findall("filename=(.+)", content_disposition) + return filename[0].strip('"').strip("'") + else: + filename = _basename(data_url) + return filename + except Exception as e: + raise Exception(f"Error processing URL: {e}") + + def download_and_extract( url: str, filepath: PathLike = "", @@ -327,18 +344,21 @@ def download_and_extract( be False. progress: whether to display progress bar. """ - url_filename_ext = "".join(Path(".", _basename(url)).resolve().suffixes) - filepath_ext = "".join(Path(".", _basename(filepath)).resolve().suffixes) - if filepath not in ["", "."]: - if filepath_ext == "": - new_filepath = filepath + url_filename_ext - logger.warning( - f"filepath={filepath}, which missing file extension. Auto-appending extension to: {new_filepath}" - ) - filepath = new_filepath - if filepath_ext and filepath_ext != url_filename_ext: - logger.warning(f"Expected extension {url_filename_ext}, but get {filepath_ext}, may cause unexpected errors!") with tempfile.TemporaryDirectory() as tmp_dir: - filename = filepath or Path(tmp_dir, _basename(url)).resolve() - download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) - extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) + if not filepath: + filename = get_filename_from_url(url) + full_path = Path(tmp_dir, filename) + elif os.path.isdir(filepath) or not os.path.splitext(filepath)[1]: + filename = get_filename_from_url(url) + full_path = Path(os.path.join(filepath, filename)) + logger.warning(f"No compress file extension provided, downloading as: '{full_path}'") + else: + url_filename_ext = "".join(Path(".", _basename(url)).resolve().suffixes) + filepath_ext = "".join(Path(".", _basename(filepath)).resolve().suffixes) + if filepath_ext != url_filename_ext: + raise ValueError( + f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}" + ) + full_path = Path(filepath) + download_url(url=url, filepath=full_path, hash_val=hash_val, hash_type=hash_type, progress=progress) + extractall(filepath=full_path, output_dir=output_dir, file_type=file_type, has_base=has_base) From 7a26dcd3fab835faa0a9c7803c43bb762b6af71d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 15 Dec 2024 11:05:49 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/apps/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index f79362afee..d946301390 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -25,7 +25,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.error import ContentTooShortError, HTTPError, URLError -from urllib.parse import unquote, urlparse +from urllib.parse import urlparse from urllib.request import urlopen, urlretrieve import requests From 8143ac3df102118b105059dd7cc47a12d54ea448 Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 19 Dec 2024 00:01:10 +0800 Subject: [PATCH 07/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 64 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index d946301390..3e5f29ba56 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -28,12 +28,12 @@ from urllib.parse import urlparse from urllib.request import urlopen, urlretrieve -import requests - from monai.config.type_definitions import PathLike from monai.utils import look_up_option, min_version, optional_import +requests, has_requests = optional_import("requests") gdown, has_gdown = optional_import("gdown", "4.7.3") +BeautifulSoup, has_bs4 = optional_import("bs4", name="BeautifulSoup") if TYPE_CHECKING: from tqdm import tqdm @@ -303,14 +303,29 @@ def extractall( def get_filename_from_url(data_url: str): try: - response = requests.head(data_url, allow_redirects=True) - content_disposition = response.headers.get("Content-Disposition") - if content_disposition: - filename = re.findall("filename=(.+)", content_disposition) - return filename[0].strip('"').strip("'") + if "drive.google.com" in data_url: + response = requests.head(data_url, allow_redirects=True) + cd = response.headers.get("Content-Disposition") # Normal size file case + if cd: + filename = cd.split('filename="')[1].split('"')[0] + return filename + response = requests.get(data_url) + if "text/html" in response.headers.get("Content-Type", ""): # Big size file case + soup = BeautifulSoup(response.text, "html.parser") + filename_div = soup.find("span", {"class": "uc-name-size"}) + if filename_div: + filename = filename_div.find("a").text + return filename + return None else: - filename = _basename(data_url) - return filename + response = requests.head(data_url, allow_redirects=True) + content_disposition = response.headers.get("Content-Disposition") + if content_disposition: + filename = re.findall("filename=(.+)", content_disposition) + return filename[0].strip('"').strip("'") + else: + filename = _basename(data_url) + return filename except Exception as e: raise Exception(f"Error processing URL: {e}") @@ -344,21 +359,18 @@ def download_and_extract( be False. progress: whether to display progress bar. """ + url_filename_ext = "".join(Path(get_filename_from_url(url)).suffixes) + filepath_ext = "".join(Path(_basename(filepath)).suffixes) + if filepath not in ["", "."]: + if filepath_ext == "": + new_filepath = Path(filepath).with_suffix(url_filename_ext) + logger.warning( + f"filepath={filepath}, which missing file extension. Auto-appending extension to: {new_filepath}" + ) + filepath = new_filepath + if filepath_ext and filepath_ext != url_filename_ext: + raise ValueError(f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}") with tempfile.TemporaryDirectory() as tmp_dir: - if not filepath: - filename = get_filename_from_url(url) - full_path = Path(tmp_dir, filename) - elif os.path.isdir(filepath) or not os.path.splitext(filepath)[1]: - filename = get_filename_from_url(url) - full_path = Path(os.path.join(filepath, filename)) - logger.warning(f"No compress file extension provided, downloading as: '{full_path}'") - else: - url_filename_ext = "".join(Path(".", _basename(url)).resolve().suffixes) - filepath_ext = "".join(Path(".", _basename(filepath)).resolve().suffixes) - if filepath_ext != url_filename_ext: - raise ValueError( - f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}" - ) - full_path = Path(filepath) - download_url(url=url, filepath=full_path, hash_val=hash_val, hash_type=hash_type, progress=progress) - extractall(filepath=full_path, output_dir=output_dir, file_type=file_type, has_base=has_base) + filename = filepath or Path(tmp_dir, get_filename_from_url(url)).resolve() + download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) + extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) From fcc269e9f5c06a1458b7696f2e37e47f696def56 Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 19 Dec 2024 00:42:30 +0800 Subject: [PATCH 08/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 3e5f29ba56..9b19e6d147 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -301,7 +301,7 @@ def extractall( ) -def get_filename_from_url(data_url: str): +def _get_filename_from_url(data_url: str): try: if "drive.google.com" in data_url: response = requests.head(data_url, allow_redirects=True) @@ -327,7 +327,7 @@ def get_filename_from_url(data_url: str): filename = _basename(data_url) return filename except Exception as e: - raise Exception(f"Error processing URL: {e}") + raise Exception(f"Error processing URL: {e}") from e def download_and_extract( @@ -359,7 +359,7 @@ def download_and_extract( be False. progress: whether to display progress bar. """ - url_filename_ext = "".join(Path(get_filename_from_url(url)).suffixes) + url_filename_ext = "".join(Path(_get_filename_from_url(url)).suffixes) filepath_ext = "".join(Path(_basename(filepath)).suffixes) if filepath not in ["", "."]: if filepath_ext == "": @@ -371,6 +371,6 @@ def download_and_extract( if filepath_ext and filepath_ext != url_filename_ext: raise ValueError(f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}") with tempfile.TemporaryDirectory() as tmp_dir: - filename = filepath or Path(tmp_dir, get_filename_from_url(url)).resolve() + filename = filepath or Path(tmp_dir, _get_filename_from_url(url)).resolve() download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) From 109a1aa815a033ce97ebd79e27afd2289302660b Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 19 Dec 2024 01:00:54 +0800 Subject: [PATCH 09/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 6 +++--- tests/test_download_and_extract.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 9b19e6d147..50624188c6 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -301,7 +301,7 @@ def extractall( ) -def _get_filename_from_url(data_url: str): +def get_filename_from_url(data_url: str): try: if "drive.google.com" in data_url: response = requests.head(data_url, allow_redirects=True) @@ -359,7 +359,7 @@ def download_and_extract( be False. progress: whether to display progress bar. """ - url_filename_ext = "".join(Path(_get_filename_from_url(url)).suffixes) + url_filename_ext = "".join(Path(get_filename_from_url(url)).suffixes) filepath_ext = "".join(Path(_basename(filepath)).suffixes) if filepath not in ["", "."]: if filepath_ext == "": @@ -371,6 +371,6 @@ def download_and_extract( if filepath_ext and filepath_ext != url_filename_ext: raise ValueError(f"File extension mismatch: expected extension {url_filename_ext}, but get {filepath_ext}") with tempfile.TemporaryDirectory() as tmp_dir: - filename = filepath or Path(tmp_dir, _get_filename_from_url(url)).resolve() + filename = filepath or Path(tmp_dir, get_filename_from_url(url)).resolve() download_url(url=url, filepath=filename, hash_val=hash_val, hash_type=hash_type, progress=progress) extractall(filepath=filename, output_dir=output_dir, file_type=file_type, has_base=has_base) diff --git a/tests/test_download_and_extract.py b/tests/test_download_and_extract.py index 555f7dc250..439a11bbc1 100644 --- a/tests/test_download_and_extract.py +++ b/tests/test_download_and_extract.py @@ -20,9 +20,10 @@ from parameterized import parameterized from monai.apps import download_and_extract, download_url, extractall -from tests.utils import skip_if_downloading_fails, skip_if_quick, testing_data_config +from tests.utils import SkipIfNoModule, skip_if_downloading_fails, skip_if_quick, testing_data_config +@SkipIfNoModule("requests") class TestDownloadAndExtract(unittest.TestCase): @skip_if_quick From bc449083e646eec03bc7f009f2f75ced224279af Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 19 Dec 2024 01:19:46 +0800 Subject: [PATCH 10/11] fix mypy fail Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 50624188c6..5b51ae1ab0 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -301,7 +301,7 @@ def extractall( ) -def get_filename_from_url(data_url: str): +def get_filename_from_url(data_url: str) -> str: try: if "drive.google.com" in data_url: response = requests.head(data_url, allow_redirects=True) From b5033edda05eb7d93e6535210fbdaf7345f569fa Mon Sep 17 00:00:00 2001 From: jerome_Hsieh Date: Thu, 19 Dec 2024 15:40:10 +0800 Subject: [PATCH 11/11] Enhance download_and_extract Signed-off-by: jerome_Hsieh --- monai/apps/utils.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/monai/apps/utils.py b/monai/apps/utils.py index 5b51ae1ab0..95c1450f2a 100644 --- a/monai/apps/utils.py +++ b/monai/apps/utils.py @@ -302,30 +302,24 @@ def extractall( def get_filename_from_url(data_url: str) -> str: + """ + Get the filename from the URL link. + """ try: + response = requests.head(data_url, allow_redirects=True) + content_disposition = response.headers.get("Content-Disposition") + if content_disposition: + filename = re.findall('filename="?([^";]+)"?', content_disposition) + if filename: + return str(filename[0]) if "drive.google.com" in data_url: - response = requests.head(data_url, allow_redirects=True) - cd = response.headers.get("Content-Disposition") # Normal size file case - if cd: - filename = cd.split('filename="')[1].split('"')[0] - return filename response = requests.get(data_url) - if "text/html" in response.headers.get("Content-Type", ""): # Big size file case + if "text/html" in response.headers.get("Content-Type", ""): soup = BeautifulSoup(response.text, "html.parser") filename_div = soup.find("span", {"class": "uc-name-size"}) if filename_div: - filename = filename_div.find("a").text - return filename - return None - else: - response = requests.head(data_url, allow_redirects=True) - content_disposition = response.headers.get("Content-Disposition") - if content_disposition: - filename = re.findall("filename=(.+)", content_disposition) - return filename[0].strip('"').strip("'") - else: - filename = _basename(data_url) - return filename + return str(filename_div.find("a").text) + return _basename(data_url) except Exception as e: raise Exception(f"Error processing URL: {e}") from e