diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ada31eb9..7d3e9b14e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - More permissive collection extent deserialization ([#1559](https://github.com/stac-utils/pystac/pull/1559)) - Type of `proj:code` setter ([#1560](https://github.com/stac-utils/pystac/pull/1560)) +- Use `urllib3` to fix parsing non-ASCII characters in URLs ([#1566](https://github.com/stac-utils/pystac/pull/1566)) ## [v1.13.0] - 2025-04-15 diff --git a/README.md b/README.md index 2be4b38eb..6e106290a 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ python -m pip install 'pystac[orjson]' ``` If you would like to use a custom `RetryStacIO` class for automatically retrying -network requests when reading with PySTAC, you'll need +network requests when reading with PySTAC, or if you have non-ASCII characters in +your URLs, you'll need [`urllib3`](https://urllib3.readthedocs.io/en/stable/): ```shell diff --git a/pyproject.toml b/pyproject.toml index 74bf4d847..51f7bcbc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dev = [ "types-orjson>=3.6.2", "types-python-dateutil>=2.9.0.20241003", "types-urllib3>=1.26.25.14", + "urllib3>=2.3.0", "virtualenv>=20.26.6", ] docs = [ diff --git a/pystac/stac_io.py b/pystac/stac_io.py index fa71dd7c0..4da8aec2e 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -286,8 +286,9 @@ def read_text_from_href(self, href: str) -> str: """Reads file as a UTF-8 string. If ``href`` has a "scheme" (e.g. if it starts with "https://") then this will - use :func:`urllib.request.urlopen` to open the file and read the contents; - otherwise, :func:`open` will be used to open a local file. + use :func:`urllib.request.urlopen` (or :func:`urllib3.request` if available) + to open the file and read the contents; otherwise, :func:`open` will be used + to open a local file. 
Args: @@ -297,9 +298,19 @@ def read_text_from_href(self, href: str) -> str: if _is_url(href): try: logger.debug(f"GET {href} Headers: {self.headers}") - req = Request(href, headers=self.headers) - with urlopen(req) as f: - href_contents = f.read().decode("utf-8") + if HAS_URLLIB3: + with urllib3.request( + "GET", + href, + headers=self.headers, + preload_content=False, # type: ignore + ) as f: + href_contents = f.read().decode("utf-8") + else: + req = Request(href, headers=self.headers) + with urlopen(req) as f: + href_contents = f.read().decode("utf-8") + except HTTPError as e: raise Exception(f"Could not read uri {href}") from e else: diff --git a/tests/cassettes/test_stac_io/test_urls_with_non_ascii_characters.yaml b/tests/cassettes/test_stac_io/test_urls_with_non_ascii_characters.yaml new file mode 100644 index 000000000..3a31e7b55 --- /dev/null +++ b/tests/cassettes/test_stac_io/test_urls_with_non_ascii_characters.yaml @@ -0,0 +1,85 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: https://capella-open-data.s3.us-west-2.amazonaws.com/stac/capella-open-data-by-capital/capella-open-data-mal%C3%A9/collection.json + response: + body: + string: "{\n \"type\": \"Collection\",\n \"id\": \"capella-open-data-mal\\u00e9\",\n + \ \"stac_version\": \"1.0.0\",\n \"description\": \"Capella Open Data Mal\\u00e9\",\n + \ \"links\": [\n {\n \"rel\": \"root\",\n \"href\": \"../../catalog.json\",\n + \ \"type\": \"application/json\",\n \"title\": \"Capella Open Data\"\n + \ },\n {\n \"rel\": \"license\",\n \"href\": \"https://creativecommons.org/licenses/by/4.0/\",\n + \ \"title\": \"CC BY 4.0\"\n },\n {\n \"rel\": \"item\",\n + \ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_GEO_HH_20241130164247_20241130164315/CAPELLA_C09_SP_GEO_HH_20241130164247_20241130164315.json\",\n + \ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n + \ \"href\": 
\"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_GEC_HH_20241130164247_20241130164315/CAPELLA_C09_SP_GEC_HH_20241130164247_20241130164315.json\",\n + \ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n + \ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_SICD_HH_20241130164247_20241130164315/CAPELLA_C09_SP_SICD_HH_20241130164247_20241130164315.json\",\n + \ \"type\": \"application/json\"\n },\n {\n \"rel\": \"item\",\n + \ \"href\": \"../../capella-open-data-by-datetime/capella-open-data-2024/capella-open-data-2024-11/capella-open-data-2024-11-30/CAPELLA_C09_SP_SLC_HH_20241130164247_20241130164315/CAPELLA_C09_SP_SLC_HH_20241130164247_20241130164315.json\",\n + \ \"type\": \"application/json\"\n },\n {\n \"rel\": \"parent\",\n + \ \"href\": \"../catalog.json\",\n \"type\": \"application/json\",\n + \ \"title\": \"By Capital\"\n }\n ],\n \"stac_extensions\": [\n \"https://stac-extensions.github.io/sat/v1.0.0/schema.json\",\n + \ \"https://stac-extensions.github.io/view/v1.0.0/schema.json\",\n \"https://stac-extensions.github.io/processing/v1.1.0/schema.json\",\n + \ \"https://stac-extensions.github.io/projection/v1.1.0/schema.json\"\n + \ ],\n \"item_assets\": {\n \"HH\": {\n \"title\": \"SAR file\",\n + \ \"type\": \"image/tiff; application=geotiff\",\n \"roles\": [\n + \ \"data\"\n ],\n \"sar:polarizations\": [\n \"HH\"\n + \ ]\n },\n \"VV\": {\n \"title\": \"SAR file\",\n \"type\": + \"image/tiff; application=geotiff\",\n \"roles\": [\n \"data\"\n + \ ],\n \"sar:polarizations\": [\n \"VV\"\n ]\n },\n + \ \"thumbnail\": {\n \"title\": \"Thumbnail\",\n \"type\": \"image/png\",\n + \ \"roles\": [\n \"thumbnail\"\n ]\n },\n \"preview\": + {\n \"title\": \"Preview image\",\n \"type\": \"image/tiff; application=geotiff; + profile=cloud-optimized\",\n \"roles\": [\n \"overview\"\n ]\n + 
\ },\n \"metadata\": {\n \"title\": \"Extended metadata\",\n \"type\": + \"application/json\",\n \"roles\": [\n \"metadata\"\n ]\n + \ }\n },\n \"title\": \"Mal\\u00e9\",\n \"extent\": {\n \"spatial\": + {\n \"bbox\": [\n [\n -180,\n -90,\n 180,\n + \ 90\n ]\n ]\n },\n \"temporal\": {\n \"interval\": + [\n [\n \"2020-03-30T00:00:00Z\",\n null\n ]\n + \ ]\n }\n },\n \"license\": \"proprietary\",\n \"keywords\": [\n + \ \"sar\"\n ],\n \"providers\": [\n {\n \"name\": \"Capella Space\",\n + \ \"roles\": [\n \"licensor\",\n \"producer\",\n \"processor\"\n + \ ],\n \"url\": \"https://www.capellaspace.com\"\n },\n {\n + \ \"name\": \"AWS\",\n \"roles\": [\n \"host\"\n ],\n + \ \"url\": \"http://www.amazonaws.com/\"\n }\n ],\n \"summaries\": + {\n \"constellation\": [\n \"capella\"\n ],\n \"instruments\": + [\n \"capella-radar-2\",\n \"capella-radar-3\",\n \"capella-radar-4\",\n + \ \"capella-radar-5\",\n \"capella-radar-6\",\n \"capella-radar-7\",\n + \ \"capella-radar-8\",\n \"capella-radar-9\",\n \"capella-radar-10\",\n + \ \"capella-radar-11\",\n \"capella-radar-13\",\n \"capella-radar-14\",\n + \ \"capella-radar-15\"\n ],\n \"sar:frequency_band\": [\n \"X\"\n + \ ],\n \"sar:product_type\": [\n \"SLC\",\n \"GEO\",\n \"GEC\",\n + \ \"SICD\",\n \"SIDD\",\n \"CPHD\"\n ],\n \"sar:instrument_mode\": + [\n \"stripmap\",\n \"spotlight\",\n \"sliding_spotlight\"\n + \ ],\n \"sar:observation_direction\": [\n \"left\",\n \"right\"\n + \ ],\n \"sar:polarizations\": [\n \"HH\",\n \"VV\"\n ]\n + \ }\n}" + headers: + Accept-Ranges: + - bytes + Content-Length: + - '4384' + Content-Type: + - application/json + Date: + - Wed, 16 Jul 2025 14:27:08 GMT + ETag: + - '"3a7d18f018a5dc5e29af78ba91fb3a67"' + Last-Modified: + - Wed, 16 Jul 2025 07:07:12 GMT + Server: + - AmazonS3 + x-amz-id-2: + - mS8cX+L1/uoMVhaC4ZTcrNLZb47iqTYwEXT52pFgbN9gFq3JRX17cTr8i0bTFOAaWWZ1DwnpogFTTAOAyjGj3Q== + x-amz-request-id: + - PWBXR28EN2M7DX2W + x-amz-server-side-encryption: + - AES256 + status: + code: 200 + 
message: OK +version: 1 diff --git a/tests/test_stac_io.py b/tests/test_stac_io.py index 2488f1482..5ca8e8161 100644 --- a/tests/test_stac_io.py +++ b/tests/test_stac_io.py @@ -7,6 +7,7 @@ import pytest import pystac +import pystac.errors from pystac.stac_io import DefaultStacIO, DuplicateKeyReportingMixin, StacIO from tests.utils import TestCases @@ -116,20 +117,20 @@ class ReportingStacIO(DefaultStacIO, DuplicateKeyReportingMixin): assert str(excinfo.value), f'Found duplicate object name "key" in {src_href}' -@unittest.mock.patch("pystac.stac_io.urlopen") -def test_headers_stac_io(urlopen_mock: unittest.mock.MagicMock) -> None: +@unittest.mock.patch("pystac.stac_io.urllib3.request") +def test_headers_stac_io(request_mock: unittest.mock.MagicMock) -> None: stac_io = DefaultStacIO(headers={"Authorization": "api-key fake-api-key-value"}) catalog = pystac.Catalog("an-id", "a description").to_dict() # required until https://github.com/stac-utils/pystac/pull/896 is merged catalog["links"] = [] - urlopen_mock.return_value.__enter__.return_value.read.return_value = json.dumps( + request_mock.return_value.__enter__.return_value.read.return_value = json.dumps( catalog ).encode("utf-8") pystac.Catalog.from_file("https://example.com/catalog.json", stac_io=stac_io) - request_obj = urlopen_mock.call_args[0][0] - assert request_obj.headers == stac_io.headers + headers = request_mock.call_args[1]["headers"] + assert headers == stac_io.headers @pytest.mark.vcr() @@ -163,3 +164,16 @@ def test_save_http_href_errors(tmp_path: Path) -> None: catalog.set_self_href("http://pystac.test/catalog.json") with pytest.raises(NotImplementedError): catalog.save_object() + + +@pytest.mark.vcr() +def test_urls_with_non_ascii_characters() -> None: + from pystac.stac_io import HAS_URLLIB3 + + url = "https://capella-open-data.s3.us-west-2.amazonaws.com/stac/capella-open-data-by-capital/capella-open-data-malé/collection.json" + + if HAS_URLLIB3: + pystac.Collection.from_file(url) + else: + with 
pytest.raises(pystac.STACError): + pystac.Collection.from_file(url) diff --git a/uv.lock b/uv.lock index a7d4cb4a8..d6491fbb4 100644 --- a/uv.lock +++ b/uv.lock @@ -1990,6 +1990,7 @@ dev = [ { name = "types-orjson" }, { name = "types-python-dateutil" }, { name = "types-urllib3" }, + { name = "urllib3" }, { name = "virtualenv" }, ] docs = [ @@ -2042,6 +2043,7 @@ dev = [ { name = "types-orjson", specifier = ">=3.6.2" }, { name = "types-python-dateutil", specifier = ">=2.9.0.20241003" }, { name = "types-urllib3", specifier = ">=1.26.25.14" }, + { name = "urllib3", specifier = ">=2.3.0" }, { name = "virtualenv", specifier = ">=20.26.6" }, ] docs = [