HBASE-29823 Control WAL flush and offset persistence from ReplicationSourceShipper #7617

base: master (changes from all commits)
```diff
@@ -21,6 +21,7 @@
 import static org.apache.hadoop.hbase.replication.ReplicationUtils.sleepForRetries;

 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
```
```diff
@@ -74,6 +75,12 @@ public enum WorkerState {
   private final int DEFAULT_TIMEOUT = 20000;
   private final int getEntriesTimeout;
   private final int shipEditsTimeout;
+  private long accumulatedSizeSinceLastUpdate = 0L;
+  private long lastOffsetUpdateTime = EnvironmentEdgeManager.currentTime();
+  private long offsetUpdateIntervalMs;
+  private long offsetUpdateSizeThresholdBytes;
+  private WALEntryBatch lastShippedBatch;
+  private final List<Entry> entriesForCleanUpHFileRefs = new ArrayList<>();

   public ReplicationSourceShipper(Configuration conf, String walGroupId, ReplicationSource source,
     ReplicationSourceWALReader walReader) {
```
```diff
@@ -90,6 +97,10 @@ public ReplicationSourceShipper(Configuration conf, String walGroupId, Replicati
       this.conf.getInt("replication.source.getEntries.timeout", DEFAULT_TIMEOUT);
     this.shipEditsTimeout = this.conf.getInt(HConstants.REPLICATION_SOURCE_SHIPEDITS_TIMEOUT,
       HConstants.REPLICATION_SOURCE_SHIPEDITS_TIMEOUT_DFAULT);
+    this.offsetUpdateIntervalMs =
+      conf.getLong("hbase.replication.shipper.offset.update.interval.ms", Long.MAX_VALUE);
+    this.offsetUpdateSizeThresholdBytes =
+      conf.getLong("hbase.replication.shipper.offset.update.size.threshold", -1L);
   }

   @Override
```
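For context, a minimal sketch of how these two knobs could be set programmatically. The property names come from the patch above; the values are purely illustrative, not recommendations:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

// Illustrative tuning only; the patch defaults (Long.MAX_VALUE and -1) preserve
// the current behaviour of persisting the offset after every shipped batch.
public class ShipperOffsetTuningExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Persist the offset at least once a minute, even if little data was shipped.
    conf.setLong("hbase.replication.shipper.offset.update.interval.ms", 60_000L);
    // ...or as soon as roughly 64 MB of edits have been shipped, whichever comes first.
    conf.setLong("hbase.replication.shipper.offset.update.size.threshold", 64L * 1024 * 1024);
  }
}
```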
```diff
@@ -106,9 +117,25 @@ public final void run() {
         continue;
       }
       try {
-        WALEntryBatch entryBatch = entryReader.poll(getEntriesTimeout);
+        // check time-based offset persistence
+        if (shouldPersistLogPosition()) {
+          // Trigger offset persistence via existing retry/backoff mechanism in shipEdits()
+          WALEntryBatch emptyBatch = createEmptyBatchForTimeBasedFlush();
+          if (emptyBatch != null) shipEdits(emptyBatch);
+        }
+
+        long pollTimeout = getEntriesTimeout;
+        if (offsetUpdateIntervalMs != Long.MAX_VALUE) {
+          long elapsed = EnvironmentEdgeManager.currentTime() - lastOffsetUpdateTime;
+          long remaining = offsetUpdateIntervalMs - elapsed;
+          if (remaining > 0) {
+            pollTimeout = Math.min(getEntriesTimeout, remaining);
+          }
+        }
+        WALEntryBatch entryBatch = entryReader.poll(pollTimeout);
         LOG.debug("Shipper from source {} got entry batch from reader: {}", source.getQueueId(),
           entryBatch);

         if (entryBatch == null) {
           continue;
         }
```
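To make the capping logic concrete, here is a standalone walk-through with assumed numbers (none of them come from the patch): the shipper shortens its poll so it wakes up exactly when the time-based flush falls due.

```java
// Standalone illustration of the capped poll timeout; all values are assumed.
public class PollTimeoutExample {
  public static void main(String[] args) {
    long getEntriesTimeout = 20_000L;      // reader poll timeout (ms)
    long offsetUpdateIntervalMs = 60_000L; // hypothetical persistence interval
    long elapsed = 45_000L;                // time since the last offset persist

    long pollTimeout = getEntriesTimeout;
    long remaining = offsetUpdateIntervalMs - elapsed; // 15_000 ms until the flush is due
    if (remaining > 0) {
      pollTimeout = Math.min(getEntriesTimeout, remaining);
    }
    // pollTimeout == 15_000: poll() returns (possibly null) right at the deadline,
    // the loop re-enters, and shouldPersistLogPosition() can trigger the flush.
    System.out.println("pollTimeout = " + pollTimeout);
  }
}
```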
```diff
@@ -133,6 +160,16 @@ public final void run() {
     }
   }

+  private WALEntryBatch createEmptyBatchForTimeBasedFlush() {
+    // Reuse last shipped WAL position with 0 entries
+    if (lastShippedBatch == null) {
+      return null;
+    }
+    WALEntryBatch batch = new WALEntryBatch(0, lastShippedBatch.getLastWalPath());
+    batch.setLastWalPosition(lastShippedBatch.getLastWalPosition());
+    return batch;
+  }
+
   private void noMoreData() {
     if (source.isRecovered()) {
       LOG.debug("Finished recovering queue for group {} of peer {}", walGroupId,
```
```diff
@@ -154,15 +191,16 @@ protected void postFinish() {
   private void shipEdits(WALEntryBatch entryBatch) {
     List<Entry> entries = entryBatch.getWalEntries();
     int sleepMultiplier = 0;
-    if (entries.isEmpty()) {
-      updateLogPosition(entryBatch);
-      return;
-    }
     int currentSize = (int) entryBatch.getHeapSize();
     source.getSourceMetrics()
       .setTimeStampNextToReplicate(entries.get(entries.size() - 1).getKey().getWriteTime());
     while (isActive()) {
       try {
+        if (entries.isEmpty()) {
+          lastShippedBatch = entryBatch;
+          persistLogPosition();
+          return;
+        }
         try {
           source.tryThrottle(currentSize);
         } catch (InterruptedException e) {
```
```diff
@@ -190,13 +228,13 @@ private void shipEdits(WALEntryBatch entryBatch) {
         } else {
           sleepMultiplier = Math.max(sleepMultiplier - 1, 0);
         }
-        // Clean up hfile references
-        for (Entry entry : entries) {
-          cleanUpHFileRefs(entry.getEdit());
-          LOG.trace("shipped entry {}: ", entry);
-        }
+        accumulatedSizeSinceLastUpdate += currentSize;
+        entriesForCleanUpHFileRefs.addAll(entries);
+        lastShippedBatch = entryBatch;
+        if (shouldPersistLogPosition()) {
```
Contributor: Rather than having these …

Contributor: We want to determine whether we need to persist the log position in the shipper based on configuration, not triggered by the replication endpoint. Users can choose different configuration values for different replication endpoint implementations.

Contributor: IMHO, it doesn't look very cohesive. The shipper seems to be making decisions based on specific endpoint implementations. If new endpoint implementations with different logic for updating the log position appear in the future, we would need to revisit the shipper again.

Contributor: I think time-based and size-based persistence is enough for most cases. If in the future we have some special endpoint that needs a new kind of decision, we can add a new mechanism, no problem. The reason we do not want to trigger persistence only from the endpoint is that there are other considerations about when to persist the log position, such as the trade-off between failover time and pressure on the replication storage. So I suggest we introduce general mechanisms to control when the log position is persisted, which users can tune for different approaches.

Contributor: Oh, so your idea is to let the shipper decide when to persist the log position based on time and/or storage used by WALs, regardless of the endpoint implementation? That would be fine with me, but then we would need to adjust the shouldPersistLogPosition method accordingly.

Contributor: Are you referring to your original comment about passing the entire …

Contributor: No, I meant the check that was being done inside shouldPersistLogPosition, which would cause the buffer size to be considered only for specific endpoint types. @ankitsol has already addressed it in a recent commit.
```diff
+          persistLogPosition();
+        }
-        // Log and clean up WAL logs
-        updateLogPosition(entryBatch);

         // offsets totalBufferUsed by deducting shipped batchSize (excludes bulk load size)
         // this sizeExcludeBulkLoad has to use same calculation that when calling
```
```diff
@@ -215,6 +253,13 @@ private void shipEdits(WALEntryBatch entryBatch) {
             entryBatch.getNbOperations(), (endTimeNs - startTimeNs) / 1000000);
         }
         break;
+      } catch (IOException ioe) {
+        // Offset-persist failure is treated as fatal to this worker since it might come from
+        // beforePersistingReplicationOffset.
+        // ReplicationSource will restart the shipper, and WAL reading
+        // will resume from the last successfully persisted offset.
+        throw new ReplicationRuntimeException(
+          "Failed to persist replication offset; restarting shipper for WAL replay", ioe);
       } catch (Exception ex) {
         source.getSourceMetrics().incrementFailedBatches();
         LOG.warn("{} threw unknown exception:",
```
```diff
@@ -229,6 +274,41 @@ private void shipEdits(WALEntryBatch entryBatch) {
     }
   }

+  private boolean shouldPersistLogPosition() {
+    if (accumulatedSizeSinceLastUpdate == 0 || lastShippedBatch == null) {
+      return false;
+    }
+
+    // Default behaviour to update offset immediately after replicate()
+    if (offsetUpdateSizeThresholdBytes == -1 && offsetUpdateIntervalMs == Long.MAX_VALUE) {
```
Contributor: This is a bit strange... If offsetUpdateSizeThresholdBytes is -1, then the accumulatedSizeSinceLastUpdate >= offsetUpdateSizeThresholdBytes check below will always return true, so we do not need this check here?

Author: Yes, it is not needed; I just added it here to make the default behaviour explicit.
```diff
+      return true;
+    }
+
+    return (accumulatedSizeSinceLastUpdate >= offsetUpdateSizeThresholdBytes)
+      || (EnvironmentEdgeManager.currentTime() - lastOffsetUpdateTime >= offsetUpdateIntervalMs);
+  }
```
Comment on lines +277 to +289:

Contributor: This should be in the endpoint, as the decision varies according to the type of endpoint. Today we have basically two types, buffered and non-buffered. If we have new endpoint types in the future, we might again need to come back here and add the related logic.

Contributor: Please see my comment above for more context.
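As a quick sanity check of the predicate, here is a standalone walk-through with invented values: a 70 MB backlog against a hypothetical 64 MB threshold persists via the size branch even though the interval has not yet elapsed.

```java
// Walk-through of the shouldPersistLogPosition() decision with assumed values.
public class PersistDecisionExample {
  public static void main(String[] args) {
    long accumulatedSizeSinceLastUpdate = 70L * 1024 * 1024; // 70 MB shipped since last persist
    long offsetUpdateSizeThresholdBytes = 64L * 1024 * 1024; // hypothetical 64 MB threshold
    long offsetUpdateIntervalMs = 60_000L;                   // hypothetical 60 s interval
    long sinceLastUpdate = 10_000L;                          // only 10 s elapsed

    boolean persist = accumulatedSizeSinceLastUpdate >= offsetUpdateSizeThresholdBytes // true
      || sinceLastUpdate >= offsetUpdateIntervalMs;                                    // false
    // persist == true via the size branch. With the defaults (-1 and Long.MAX_VALUE)
    // the size comparison is always true, which is why the explicit default check in
    // the patch is redundant, as the reviewer points out.
    System.out.println("persist = " + persist);
  }
}
```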
```diff
+  private void persistLogPosition() throws IOException {
+    if (lastShippedBatch == null) {
```
Contributor: Since we can accumulate different batches in the above loop, a null batch does not mean we haven't shipped anything out? Why do we just return here if lastShippedBatch is null?

Author: As far as I understand, lastShippedBatch being null means no batch has been replicated yet, so we don't need to update the offset. Please correct me if I am wrong: lastShippedBatch is null by default when ReplicationSourceShipper is initialised, and it is updated as soon as a batch is replicated.

Contributor: Ah, OK, you do not reset lastShippedBatch when reading a new batch. But it still makes me a bit nervous that we could get here when lastShippedBatch is null...

Contributor: We cannot. This method is not called when lastShippedBatch is null in the current implementation. We can still keep the check here for safety reasons.
```diff
+      return;
+    }
+
+    ReplicationEndpoint endpoint = source.getReplicationEndpoint();
+    endpoint.beforePersistingReplicationOffset();
+
+    // Clean up hfile references
+    for (Entry entry : entriesForCleanUpHFileRefs) {
+      cleanUpHFileRefs(entry.getEdit());
+    }
+    entriesForCleanUpHFileRefs.clear();
+
+    accumulatedSizeSinceLastUpdate = 0;
+    lastOffsetUpdateTime = EnvironmentEdgeManager.currentTime();
+
+    // Log and clean up WAL logs
+    updateLogPosition(lastShippedBatch);
+  }

   private void cleanUpHFileRefs(WALEdit edit) throws IOException {
     String peerId = source.getPeerId();
     if (peerId.contains("-")) {
```
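For illustration only, a self-contained sketch of the contract persistLogPosition() relies on: a buffered endpoint flushes its external sink in beforePersistingReplicationOffset(), and a failed flush surfaces as the IOException that the shipper turns into a restart. Every type and method name below except beforePersistingReplicationOffset is invented for the example.

```java
import java.io.IOException;

// Invented stand-in for the endpoint contract exercised by persistLogPosition().
interface OffsetPersistenceCallback {
  void beforePersistingReplicationOffset() throws IOException;
}

// Hypothetical buffered (e.g. S3-style) endpoint: commit buffered edits before the
// shipper persists the offset, so a persisted offset never points past data that was
// not durably committed on the sink side.
class BufferedSinkEndpoint implements OffsetPersistenceCallback {
  @Override
  public void beforePersistingReplicationOffset() throws IOException {
    // A real endpoint would commit its buffered external file here. If that commit
    // fails, the IOException propagates, the shipper throws ReplicationRuntimeException,
    // and replication replays from the last persisted offset.
  }
}
```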
Contributor: Why move this here?

Contributor: OK, I guess I know why you moved this here: in the past updateLogPosition could not fail (it leads to an abort, so we can assume it never fails), but now that we have introduced a callback method, it can fail. Then I do not think this is the correct way to deal with it. Consider the S3-based solution: if you fail to commit the file on S3, the correct way is to send the data again? But here we just retry committing... I think we should restart from the previously persisted offset and send the data again.

Contributor: Can we just abort this shipper? Is ReplicationSource going to create a new one if it's aborted? In that case it will pick up at the last persisted position and retry correctly, won't it?

Author: Currently in shipEdits(), the only place where an IOException can be raised is from persistLogPosition() (via beforePersistingReplicationOffset() or cleanUpHFileRefs()). Based on your earlier comment about restarting from the last persisted offset and resending WAL entries, I am thinking of splitting the existing catch block into two parts. The intention is that propagating ReplicationRuntimeException will cause the shipper worker to exit, and ReplicationSource will recreate it, so WAL reading resumes from the last persisted offset and all batches since then are resent. Does this approach match what you had in mind? @Apache9

Author: The run() method has this catch block to handle ReplicationRuntimeException.

Author: Hi @Apache9, just a gentle ping on my previous comment to check if I'm heading in the right direction. Thank you!

Contributor: I'm not sure the current logic in ReplicationSource can handle a shipper abort, but I think this is a possible way to deal with the problem. When restarting, we read from the external replication offset storage and resume from that offset. I think this could also be used to deal with the updateLogPosition exception, so we do not need to abort the whole region server.