-
Notifications
You must be signed in to change notification settings - Fork 1.5k
JAVA-5949 preserve connection pool on backpressure errors when establishing connections #1900
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: backpressure
Are you sure you want to change the base?
Changes from all commits
c1d40fa
a0a7dbf
fc3603e
e78c931
4d836b1
61c53a8
37c930b
4f5ae84
0be0250
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
|
|
||
| package com.mongodb.internal.connection; | ||
|
|
||
| import com.mongodb.MongoException; | ||
| import com.mongodb.annotations.ThreadSafe; | ||
| import com.mongodb.connection.ClusterConnectionMode; | ||
| import com.mongodb.connection.ServerDescription; | ||
|
|
@@ -137,9 +138,27 @@ private void handleException(final SdamIssue sdamIssue, final boolean beforeHand | |
| serverMonitor.connect(); | ||
| } else if (sdamIssue.relatedToNetworkNotTimeout() | ||
| || (beforeHandshake && (sdamIssue.relatedToNetworkTimeout() || sdamIssue.relatedToAuth()))) { | ||
| updateDescription(sdamIssue.serverDescription()); | ||
| connectionPool.invalidate(sdamIssue.exception().orElse(null)); | ||
| serverMonitor.cancelCurrentCheck(); | ||
| // Backpressure spec: Don't clear pool or mark server unknown for connection establishment failures | ||
| // (network errors or timeouts during handshake). Authentication errors after handshake should still | ||
| // clear the pool as they're not related to overload. | ||
| // TLS configuration errors (certificate validation, protocol mismatches) should also clear the pool | ||
| // as they indicate configuration issues, not server overload. | ||
| if (beforeHandshake && !sdamIssue.relatedToAuth() && !sdamIssue.relatedToTlsConfigurationError()) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently we don’t distinguish DNS lookup failures. The CMAP spec excludes DNS failures from backpressure labeling: “For errors that the driver can distinguish as never occurring due to server overload, such as DNS lookup failures […] the driver MUST NOT add backpressure error labels for these error types.” Proposed change: detect a DNS failure by walking the exception cause chain for a DNS-related exception (e.g. `UnknownHostException`) and skip labeling in that case. We should also add test coverage asserting both the labeling and the pool-clearing behaviour, so that if the driver ever changes the wrapper exception type the tests would catch the regression. |
||
| // Don't update server description to Unknown | ||
| // Don't invalidate the connection pool | ||
| // Apply error labels for backpressure | ||
| sdamIssue.exception().ifPresent(exception -> { | ||
| if (exception instanceof MongoException) { | ||
| MongoException mongoException = (MongoException) exception; | ||
| mongoException.addLabel(MongoException.SYSTEM_OVERLOADED_ERROR_LABEL); | ||
| mongoException.addLabel(MongoException.RETRYABLE_ERROR_LABEL); | ||
| } | ||
| }); | ||
| } else { | ||
| updateDescription(sdamIssue.serverDescription()); | ||
| connectionPool.invalidate(sdamIssue.exception().orElse(null)); | ||
| serverMonitor.cancelCurrentCheck(); | ||
| } | ||
| } else if (sdamIssue.relatedToWriteConcern() || sdamIssue.relatedToStalePrimary()) { | ||
| updateDescription(sdamIssue.serverDescription()); | ||
| serverMonitor.connect(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
|
|
||
| import com.mongodb.ClusterFixture; | ||
| import com.mongodb.MongoClientSettings; | ||
| import com.mongodb.event.ConnectionCheckOutFailedEvent; | ||
| import com.mongodb.event.ConnectionPoolClearedEvent; | ||
| import com.mongodb.event.ConnectionPoolListener; | ||
| import com.mongodb.event.ConnectionPoolReadyEvent; | ||
|
|
@@ -47,7 +48,10 @@ | |
| import java.util.Set; | ||
| import java.util.concurrent.BlockingQueue; | ||
| import java.util.concurrent.CountDownLatch; | ||
| import java.util.concurrent.ExecutorService; | ||
| import java.util.concurrent.Executors; | ||
| import java.util.concurrent.LinkedBlockingQueue; | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
|
|
||
| import static com.mongodb.ClusterFixture.configureFailPoint; | ||
| import static com.mongodb.ClusterFixture.disableFailPoint; | ||
|
|
@@ -268,6 +272,79 @@ public void shouldEmitHeartbeatStartedBeforeSocketIsConnected() { | |
| // As it requires mocking and package access to `com.mongodb.internal.connection` | ||
| } | ||
|
|
||
| /** | ||
| * See | ||
| * <a href="https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring-tests.md#connection-pool-backpressure">Connection Pool Backpressure</a>. | ||
| */ | ||
| @Test | ||
| public void testConnectionPoolBackpressure() throws InterruptedException { | ||
| assumeTrue(serverVersionAtLeast(7, 0)); | ||
|
|
||
| AtomicInteger connectionCheckOutFailedEventCount = new AtomicInteger(0); | ||
| AtomicInteger poolClearedEventCount = new AtomicInteger(0); | ||
|
|
||
| ConnectionPoolListener connectionPoolListener = new ConnectionPoolListener() { | ||
| @Override | ||
| public void connectionCheckOutFailed(final ConnectionCheckOutFailedEvent event) { | ||
| connectionCheckOutFailedEventCount.incrementAndGet(); | ||
| } | ||
|
|
||
| @Override | ||
| public void connectionPoolCleared(final ConnectionPoolClearedEvent event) { | ||
| poolClearedEventCount.incrementAndGet(); | ||
| } | ||
| }; | ||
|
Comment on lines
+283
to
+296
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of introducing a new anonymous listener with counters, we can reuse the existing test listener: It already provides await helpers that double as assertions and helpers to assert that zero PoolClearedEvents happened, e.g.:
This keeps the test more concise and reuses established utilities for clarity/consistency. |
||
|
|
||
| MongoClientSettings clientSettings = getMongoClientSettingsBuilder() | ||
| .applyToConnectionPoolSettings(builder -> builder | ||
| .maxConnecting(100) | ||
| .addConnectionPoolListener(connectionPoolListener)) | ||
| .build(); | ||
|
|
||
| try (MongoClient adminClient = MongoClients.create(getMongoClientSettingsBuilder().build()); | ||
| MongoClient client = MongoClients.create(clientSettings)) { | ||
|
|
||
| MongoDatabase adminDatabase = adminClient.getDatabase("admin"); | ||
| MongoDatabase database = client.getDatabase(getDefaultDatabaseName()); | ||
| MongoCollection<Document> collection = database.getCollection("testCollection"); | ||
|
|
||
| // Configure rate limiter using admin commands | ||
| adminDatabase.runCommand(new Document("setParameter", 1) | ||
| .append("ingressConnectionEstablishmentRateLimiterEnabled", true)); | ||
| adminDatabase.runCommand(new Document("setParameter", 1) | ||
| .append("ingressConnectionEstablishmentRatePerSec", 20)); | ||
| adminDatabase.runCommand(new Document("setParameter", 1) | ||
| .append("ingressConnectionEstablishmentBurstCapacitySecs", 1)); | ||
| adminDatabase.runCommand(new Document("setParameter", 1) | ||
| .append("ingressConnectionEstablishmentMaxQueueDepth", 1)); | ||
|
|
||
| collection.insertOne(Document.parse("{}")); | ||
|
|
||
| // Run 100 parallel find operations with 2-seconds sleep | ||
| ExecutorService executor = Executors.newFixedThreadPool(100); | ||
| for (int i = 0; i < 100; i++) { | ||
| executor.submit(() -> collection.find(new Document("$where", "function() { sleep(2000); return true; }")).first()); | ||
| } | ||
|
|
||
| // Wait for all operations to complete | ||
| executor.shutdown(); | ||
| boolean terminated = executor.awaitTermination(20, SECONDS); | ||
| assertTrue("Executor did not terminate within timeout", terminated); | ||
|
|
||
| // Assert at least 10 ConnectionCheckOutFailedEvents occurred | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this comment? The assertion message already explains the intent (e.g., “Expected at least 10 ConnectionCheckOutFailedEvents”). |
||
| assertTrue("Expected at least 10 ConnectionCheckOutFailedEvents, but got " + connectionCheckOutFailedEventCount.get(), | ||
| connectionCheckOutFailedEventCount.get() >= 10); | ||
|
|
||
| // Assert 0 PoolClearedEvents occurred | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| assertEquals("Expected 0 PoolClearedEvents", 0, poolClearedEventCount.get()); | ||
|
|
||
| // Teardown: sleep 1 second and reset rate limiter | ||
| Thread.sleep(1000); | ||
| adminDatabase.runCommand(new Document("setParameter", 1) | ||
| .append("ingressConnectionEstablishmentRateLimiterEnabled", false)); | ||
|
Comment on lines
+341
to
+344
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This cleanup is currently conditional on the code above completing successfully. If an assertion or exception happens earlier, this teardown won’t run, which can leak state into subsequent tests. We should move this cleanup into a `finally` block or an `@After` teardown method so it always runs. |
||
| } | ||
| } | ||
|
|
||
| private static void assertPoll(final BlockingQueue<?> queue, @Nullable final Class<?> allowed, final Set<Class<?>> required) | ||
| throws InterruptedException { | ||
| assertPoll(queue, allowed, required, Timeout.expiresIn(TEST_WAIT_TIMEOUT_MILLIS, MILLISECONDS, ZERO_DURATION_MEANS_EXPIRED)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently we attach
`SystemOverloadedError` and `RetryableError` labels in the SDAM error-handling path (effectively only for `DefaultServer`). In load-balanced mode, SDAM isn’t involved: the LB code path invalidates the pool directly (e.g., `connectionPool.invalidate(serviceId, generation)`), so the labeling logic is bypassed. This means users running the driver in LB mode (behind an NLB) can still hit network errors, TLS handshake failures, timeouts during connection establishment or hello, but won’t get the labels.
However, these labels are a CMAP requirement, not SDAM. The CMAP spec states:
“The pool MUST add the error labels SystemOverloadedError and RetryableError to network errors or network timeouts it encounters during the connection establishment or the hello message.” Since this is defined as a pool behavior (topology-agnostic), it seems we should implement the labeling in the connection pool layer so it applies consistently in both default and load-balanced modes.