diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 1d51828fa4dd..619d1bba21b1 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -514,9 +514,20 @@ std::pair resolveObjectStorageForPath( normalized_path = "gs://" + target_decomposed.authority + "/" + target_decomposed.key; } S3::URI s3_uri(normalized_path); - - std::string key_to_use = s3_uri.key; - + + // Use key (parsed without URI decoding) so that percent-encoded + // characters in object keys (e.g. %2F in Iceberg partition paths) are preserved. + std::string key_to_use = target_decomposed.key; + + // For path-style HTTP(S) URLs, SchemeAuthorityKey puts / in .key + // because the bucket lives in the URL path, not the hostname. + // Strip the bucket prefix so the key is relative to the bucket. + if (!s3_uri.is_virtual_hosted_style + && key_to_use.starts_with(s3_uri.bucket + "/")) + { + key_to_use = key_to_use.substr(s3_uri.bucket.size() + 1); + } + bool use_base_storage = false; if (base_storage->getType() == ObjectStorageType::S3) { diff --git a/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp new file mode 100644 index 000000000000..f0a7e7613b25 --- /dev/null +++ b/src/Storages/ObjectStorage/tests/gtest_scheme_authority_key.cpp @@ -0,0 +1,149 @@ +#include + +#include +#include +#include "config.h" + +#if USE_AWS_S3 && USE_AVRO + +#include +#include +#include +#include +#include + +using namespace DB; + +namespace +{ +struct ClientFake : DB::S3::Client +{ + explicit ClientFake() + : DB::S3::Client( + 1, + DB::S3::ServerSideEncryptionKMSConfig(), + std::make_shared("test_access_key", "test_secret"), + DB::S3::ClientFactory::instance().createClientConfiguration( + "test_region", + DB::RemoteHostFilter(), + 1, + DB::S3::PocoHTTPClientConfiguration::RetryStrategy{.max_retries = 0}, + true, + true, + true, + false, + {}, + /* request_throttler = */ {}, + "http"), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + DB::S3::ClientSettings()) + { + } + + Aws::S3::Model::GetObjectOutcome GetObject([[maybe_unused]] const Aws::S3::Model::GetObjectRequest &) const override + { + UNREACHABLE(); + } +}; + +std::shared_ptr makeBaseStorage(S3::URI uri) +{ + S3Capabilities cap; + ObjectStorageKeyGeneratorPtr gen; + const String disk_name = "s3"; + return std::make_shared( + std::make_unique(), std::make_unique(), std::move(uri), cap, gen, disk_name); +} + +void assertResolveReturnsBaseKey(const std::string & table_location, const std::string & path, S3::URI base_uri, const std::string & expected_key) +{ + auto base = makeBaseStorage(std::move(base_uri)); + SecondaryStorages secondary_storages; + auto [storage, key] = resolveObjectStorageForPath(table_location, path, base, secondary_storages, nullptr); + ASSERT_EQ(storage.get(), base.get()); + ASSERT_EQ(key, expected_key); +} +} + +TEST(ResolveObjectStorageForPathKey, S3SchemeKeyNoBucketInPath) +{ + const char * path = "s3://my-bucket/dir/file.parquet"; + S3::URI target(path); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyNoBucket) +{ + const char * path = "https://my-bucket.s3.amazonaws.com/dir/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, PathStyleHttpsBucketPrefixStripped) +{ + const char * path = "https://s3.amazonaws.com/my-bucket/dir/file.parquet"; + S3::URI target(path); + ASSERT_FALSE(target.is_virtual_hosted_style); + ASSERT_EQ(target.bucket, "my-bucket"); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, PathStyleMinioPreservesPercentEncodedSlash) +{ + const char * path = "http://minio:9000/bucket/partition=us%2Fwest/data.parquet"; + S3::URI target(path); + ASSERT_FALSE(target.is_virtual_hosted_style); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("http://minio:9000/bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/data.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, S3SchemePreservesPercentEncodedSlash) +{ + const char * path = "s3://bucket/partition=us%2Fwest/file.parquet"; + S3::URI target(path); + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyMayStartWithBucketNamePrefix) +{ + const char * path = "https://bucket.s3.amazonaws.com/bucket/nested/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style); + + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "bucket/nested/file.parquet"); +} + +TEST(ResolveObjectStorageForPathKey, S3SchemeKeyMayStartWithBucketNamePrefix) +{ + const char * path = "s3://bucket/bucket/nested/file.parquet"; + S3::URI target(path); + ASSERT_TRUE(target.is_virtual_hosted_style) + << "Requires s3:// -> virtual-hosted mapping (global context); use HTTPS test if this fails"; + + S3::URI base_uri; + base_uri.bucket = target.bucket; + base_uri.endpoint = target.endpoint; + + assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "bucket/nested/file.parquet"); +} + +#endif diff --git a/tests/integration/test_database_iceberg/test.py b/tests/integration/test_database_iceberg/test.py index b0272161c8ac..283f4dcf7d3a 100644 --- a/tests/integration/test_database_iceberg/test.py +++ b/tests/integration/test_database_iceberg/test.py @@ -680,6 +680,52 @@ def test_table_with_slash(started_cluster): assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n" +def test_partition_value_with_slash(started_cluster): + """Partition value containing '/' produces object keys with %2F; reading must preserve encoding.""" + node = started_cluster.instances["node1"] + + test_ref = f"test_partition_slash_{uuid.uuid4()}" + table_name = f"{test_ref}_table" + root_namespace = f"{test_ref}_namespace" + + # Partition by symbol (string) so partition value "us/west" becomes path segment symbol=us%2Fwest + partition_spec = PartitionSpec( + PartitionField( + source_id=2, field_id=1000, transform=IdentityTransform(), name="symbol" + ) + ) + schema = DEFAULT_SCHEMA + + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(root_namespace) + + table = create_table( + catalog, + root_namespace, + table_name, + schema, + partition_spec=partition_spec, + sort_order=DEFAULT_SORT_ORDER, + ) + + # Write a row with partition value containing slash (path will have %2F in S3 key) + data = [ + { + "datetime": datetime.now(), + "symbol": "us/west", + "bid": 100.0, + "ask": 101.0, + "details": {"created_by": "test"}, + } + ] + df = pa.Table.from_pylist(data) + table.append(df) + + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + assert 1 == int(node.query(f"SELECT count() FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`")) + assert "us/west" in node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`") + + def test_cluster_select(started_cluster): node1 = started_cluster.instances["node1"] node2 = started_cluster.instances["node2"] @@ -720,7 +766,7 @@ def test_cluster_select(started_cluster): assert len(cluster_secondary_queries) == 1 assert node2.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`", settings={"parallel_replicas_for_cluster_engines":1, 'enable_parallel_replicas': 2, 'cluster_for_parallel_replicas': 'cluster_simple', 'parallel_replicas_for_cluster_engines' : 1}) == 'pablo\n' - + def test_not_specified_catalog_type(started_cluster): node = started_cluster.instances["node1"] settings = {