@@ -283,4 +283,11 @@ public int getTimeout() {
* @throws IllegalStateException if this service's state isn't FAILED.
*/
Throwable failureCause();

/**
* Hook invoked before the replication offset is persisted. E.g., buffered endpoints can flush or
* close their WALs here.
*/
default void beforePersistingReplicationOffset() throws IOException {
}
}
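
As an illustration of the hook's contract, here is a minimal, self-contained sketch of how a buffered endpoint might use it; `BufferedWalEndpoint`, `stage()`, and `flushToRemoteStorage()` are hypothetical names for illustration only, and only the `beforePersistingReplicationOffset()` signature comes from the patch:

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Hypothetical buffered endpoint: edits are staged in memory during
 * replicate() and must be durably flushed before the shipper persists the
 * replication offset, otherwise the offset could point past unflushed data.
 */
public class BufferedWalEndpoint {

  private final List<byte[]> stagedEdits = new ArrayList<>();

  /** Stages an edit instead of sending it immediately (illustrative). */
  public void stage(byte[] edit) {
    stagedEdits.add(edit);
  }

  /**
   * Mirrors the new hook: flush everything the offset is about to "commit",
   * so a restart from that offset never skips staged data.
   */
  public void beforePersistingReplicationOffset() throws IOException {
    flushToRemoteStorage(stagedEdits); // e.g. close/upload a WAL segment
    stagedEdits.clear();
  }

  private void flushToRemoteStorage(List<byte[]> edits) throws IOException {
    // Placeholder for an S3 upload / WAL close; may throw IOException.
  }
}
```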
@@ -31,7 +31,9 @@
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
@@ -72,6 +74,7 @@

import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
* Class that handles the source of a replication stream. Currently does not handle more than 1
@@ -148,6 +151,12 @@ public class ReplicationSource implements ReplicationSourceInterface {
public static final int DEFAULT_WAIT_ON_ENDPOINT_SECONDS = 30;
private int waitOnEndpointSeconds = -1;

public static final String SHIPPER_MONITOR_INTERVAL =
"hbase.replication.source.shipper.monitor.interval.ms";
private static final long DEFAULT_SHIPPER_MONITOR_INTERVAL = 1 * 60 * 1000L; // 1 minute
private ScheduledExecutorService shipperMonitorExecutor;
private long monitorIntervalMs;

private Thread initThread;

/**
@@ -230,6 +239,16 @@ public void init(Configuration conf, FileSystem fs, ReplicationSourceManager man

LOG.info("queueId={}, ReplicationSource: {}, currentBandwidth={}", queueId,
replicationPeer.getId(), this.currentBandwidth);

this.monitorIntervalMs =
conf.getLong(SHIPPER_MONITOR_INTERVAL, DEFAULT_SHIPPER_MONITOR_INTERVAL);

this.shipperMonitorExecutor =
Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
.setNameFormat("ShipperMonitor-" + queueId).setDaemon(true).build());

this.shipperMonitorExecutor.scheduleAtFixedRate(this::restartDeadWorkersIfNeeded,
monitorIntervalMs, monitorIntervalMs, TimeUnit.MILLISECONDS);
}

private void decorateConf() {
@@ -758,6 +777,10 @@ private void terminate(String reason, Exception cause, boolean clearMetrics, boo
this.metrics.terminate();
}
}

if (shipperMonitorExecutor != null) {
shipperMonitorExecutor.shutdownNow();
}
}

@Override
@@ -866,4 +889,30 @@ public long getTotalReplicatedEdits() {
long getSleepForRetries() {
return sleepForRetries;
}

private void restartDeadWorkersIfNeeded() {
for (String walGroupId : workerThreads.keySet()) {
workerThreads.compute(walGroupId, (key, worker) -> {
if (worker == null) {
return null;
}

if (!worker.isAlive() && !worker.isFinished()) {
LOG.warn("Detected dead shipper for walGroupId={}. Restarting.", walGroupId);

try {
ReplicationSourceShipper newWorker = createNewShipper(walGroupId);
startShipper(newWorker);
return newWorker;
} catch (Exception e) {
LOG.error("Failed to restart shipper for walGroupId={}", walGroupId, e);
return worker; // keep old entry to retry later
}
}

return worker;
});
}
}

}
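
As a usage sketch for the monitor added above (assuming only the `hbase.replication.source.shipper.monitor.interval.ms` property introduced in this patch; the shorter value is illustrative, not a recommendation), an operator or test could tighten the dead-shipper check like this:

```java
import org.apache.hadoop.conf.Configuration;

public class ShipperMonitorTuning {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Probe for dead shippers every 10 s instead of the 1-minute default;
    // a dead-but-unfinished shipper is then recreated via createNewShipper()
    // and resumes from the last persisted offset.
    conf.setLong("hbase.replication.source.shipper.monitor.interval.ms", 10_000L);

    long intervalMs =
      conf.getLong("hbase.replication.source.shipper.monitor.interval.ms", 60_000L);
    System.out.println("shipper monitor interval = " + intervalMs + " ms");
  }
}
```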
@@ -21,6 +21,7 @@
import static org.apache.hadoop.hbase.replication.ReplicationUtils.sleepForRetries;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -74,6 +75,12 @@ public enum WorkerState {
private final int DEFAULT_TIMEOUT = 20000;
private final int getEntriesTimeout;
private final int shipEditsTimeout;
private long accumulatedSizeSinceLastUpdate = 0L;
private long lastOffsetUpdateTime = EnvironmentEdgeManager.currentTime();
private long offsetUpdateIntervalMs;
private long offsetUpdateSizeThresholdBytes;
private WALEntryBatch lastShippedBatch;
private final List<Entry> entriesForCleanUpHFileRefs = new ArrayList<>();

public ReplicationSourceShipper(Configuration conf, String walGroupId, ReplicationSource source,
ReplicationSourceWALReader walReader) {
Expand All @@ -90,6 +97,10 @@ public ReplicationSourceShipper(Configuration conf, String walGroupId, Replicati
this.conf.getInt("replication.source.getEntries.timeout", DEFAULT_TIMEOUT);
this.shipEditsTimeout = this.conf.getInt(HConstants.REPLICATION_SOURCE_SHIPEDITS_TIMEOUT,
HConstants.REPLICATION_SOURCE_SHIPEDITS_TIMEOUT_DFAULT);
this.offsetUpdateIntervalMs =
conf.getLong("hbase.replication.shipper.offset.update.interval.ms", Long.MAX_VALUE);
this.offsetUpdateSizeThresholdBytes =
conf.getLong("hbase.replication.shipper.offset.update.size.threshold", -1L);
}

@Override
@@ -106,9 +117,25 @@ public final void run() {
continue;
}
try {
WALEntryBatch entryBatch = entryReader.poll(getEntriesTimeout);
// check time-based offset persistence
if (shouldPersistLogPosition()) {
// Trigger offset persistence via existing retry/backoff mechanism in shipEdits()
WALEntryBatch emptyBatch = createEmptyBatchForTimeBasedFlush();
if (emptyBatch != null) {
shipEdits(emptyBatch);
}
}

long pollTimeout = getEntriesTimeout;
if (offsetUpdateIntervalMs != Long.MAX_VALUE) {
long elapsed = EnvironmentEdgeManager.currentTime() - lastOffsetUpdateTime;
long remaining = offsetUpdateIntervalMs - elapsed;
if (remaining > 0) {
pollTimeout = Math.min(getEntriesTimeout, remaining);
}
}
WALEntryBatch entryBatch = entryReader.poll(pollTimeout);
LOG.debug("Shipper from source {} got entry batch from reader: {}", source.getQueueId(),
entryBatch);

if (entryBatch == null) {
continue;
}
@@ -133,6 +160,16 @@
}
}

private WALEntryBatch createEmptyBatchForTimeBasedFlush() {
// Reuse last shipped WAL position with 0 entries
if (lastShippedBatch == null) {
return null;
}
WALEntryBatch batch = new WALEntryBatch(0, lastShippedBatch.getLastWalPath());
batch.setLastWalPosition(lastShippedBatch.getLastWalPosition());
return batch;
}

private void noMoreData() {
if (source.isRecovered()) {
LOG.debug("Finished recovering queue for group {} of peer {}", walGroupId,
@@ -154,15 +191,16 @@ protected void postFinish() {
private void shipEdits(WALEntryBatch entryBatch) {
List<Entry> entries = entryBatch.getWalEntries();
int sleepMultiplier = 0;
if (entries.isEmpty()) {
updateLogPosition(entryBatch);
return;
}
int currentSize = (int) entryBatch.getHeapSize();
source.getSourceMetrics()
.setTimeStampNextToReplicate(entries.get(entries.size() - 1).getKey().getWriteTime());
while (isActive()) {
try {
if (entries.isEmpty()) {
Contributor:
Why move this here?

Contributor:
OK, I guess I know why you moved this here: in the past updateLogPosition could not fail (it would lead to an abort, so we could assume it never fails), but now, since we have introduced a callback method, it can fail.

Then I do not think this is the correct way to deal with it. Consider the S3-based solution: if you fail to commit the file on S3, the correct way is to send the data again, but here we just retry committing... I think we should restart from the previously persisted offset and send the data again.

Contributor (@anmolnar, Feb 17, 2026):
Can we just abort this shipper?
Is ReplicationSource going to create a new one if it's aborted?
In that case it will pick up at the last persisted position and retry correctly, won't it?

Author:
Currently in shipEdits(), the only place where an IOException can be raised is from persistLogPosition() (via beforePersistingReplicationOffset() or cleanUpHFileRefs()).

Based on your earlier comment about restarting from the last persisted offset and resending WAL entries, I am thinking of splitting the existing catch block into two parts:

catch (IOException ioe) {
  // Persist failure → fatal → restart worker → WAL replay from last persisted offset
  throw new ReplicationRuntimeException(
      "Failed to persist replication offset; restarting worker to replay WAL", ioe);

} catch (Exception ex) {
  // Replication/transient failure → retry current batch (existing behaviour)
  source.getSourceMetrics().incrementFailedBatches();
  LOG.warn("{} threw unknown exception:",
    source.getReplicationEndpoint().getClass().getName(), ex);

  if (sleepForRetries("ReplicationEndpoint threw exception",
      sleepForRetries, sleepMultiplier, maxRetriesMultiplier)) {
    sleepMultiplier++;
  }
}

The intention is that propagating ReplicationRuntimeException will cause the shipper worker to exit, and ReplicationSource will recreate it, so WAL reading resumes from the last persisted offset and all batches since then are resent.

Does this approach match what you had in mind? @Apache9

Author:
The run() method has this catch block to handle ReplicationRuntimeException:

catch (InterruptedException | ReplicationRuntimeException e) {
        // It is interrupted and needs to quit.
        LOG.warn("Interrupted while waiting for next replication entry batch", e);
        Thread.currentThread().interrupt();
}

Author:
Hi @Apache9, just a gentle ping on my previous comment to check if I'm heading in the right direction. Thank you!

Contributor:
I'm not sure if the current logic in ReplicationSource can handle shipper abort, but I think this is a possible way to deal with the problem.
When restarting, we read from the external replication offset storage and restart from the offset.
I think this could also be used to deal with the updateLogPosition exception, so we do not need to abort the whole region server.

lastShippedBatch = entryBatch;
persistLogPosition();
return;
}
try {
source.tryThrottle(currentSize);
} catch (InterruptedException e) {
@@ -190,13 +228,13 @@ private void shipEdits(WALEntryBatch entryBatch) {
} else {
sleepMultiplier = Math.max(sleepMultiplier - 1, 0);
}
// Clean up hfile references
for (Entry entry : entries) {
cleanUpHFileRefs(entry.getEdit());
LOG.trace("shipped entry {}: ", entry);

accumulatedSizeSinceLastUpdate += currentSize;
entriesForCleanUpHFileRefs.addAll(entries);
lastShippedBatch = entryBatch;
if (shouldPersistLogPosition()) {
Contributor:
Rather than having these stagedWalSize and lastShippedBatch as global variables, we should just pass the entryBatch along to shouldPersistLogPosition() (which should be defined/implemented in the endpoints, by the way; see my other related comment) and persistLogPosition().

Contributor:
We want to determine whether we need to persist the log position in the shipper based on configuration, not triggered by the replication endpoint. Users can choose different configuration values for different replication endpoint implementations.

Contributor:
IMHO, it doesn't look very cohesive. The shipper seems to be making decisions based on specific endpoint implementations. If new endpoint implementations with different logic for updating the log position are conceived in the future, we would need to revisit the shipper again.

Contributor:
I think time-based and size-based persistence is enough for most cases. If in the future we have some special endpoint which needs a new type of decision, we can add a new mechanism, no problem.

The reason we do not want to trigger persistence only from the endpoint is that there are other considerations about when to persist the log position, like the trade-off between failover and pressure on the replication storage. So I suggest we introduce general mechanisms to control the persistence of the log position, which users can tune for their setup.

Contributor:
Oh, so your idea is to allow the shipper to decide when to persist the log position based on time and/or storage usage by WALs, regardless of the endpoint implementation? That would be fine for me, but then we would need to adjust the shouldPersistLogPosition method accordingly.

Contributor:
> That would be fine for me, but then we would need to adjust the shouldPersistLogPosition method accordingly.

Are you referring to your original comment about passing the entire entryBatch object?

Contributor:
> Are you referring to your original comment about passing the entire entryBatch object?

No, I meant this check that was being done inside shouldPersistLogPosition, which would cause the buffer size to be considered only for specific endpoint types.

@ankitsol has already addressed it in a recent commit.

persistLogPosition();
}
// Log and clean up WAL logs
updateLogPosition(entryBatch);

// offsets totalBufferUsed by deducting shipped batchSize (excludes bulk load size)
// this sizeExcludeBulkLoad has to use same calculation that when calling
@@ -215,6 +253,13 @@ private void shipEdits(WALEntryBatch entryBatch) {
entryBatch.getNbOperations(), (endTimeNs - startTimeNs) / 1000000);
}
break;
} catch (IOException ioe) {
// A persist failure is treated as fatal to this worker since it may come from
// beforePersistingReplicationOffset. ReplicationSource will restart the shipper,
// and WAL reading will resume from the last successfully persisted offset.
throw new ReplicationRuntimeException(
"Failed to persist replication offset; restarting shipper for WAL replay", ioe);
} catch (Exception ex) {
source.getSourceMetrics().incrementFailedBatches();
LOG.warn("{} threw unknown exception:",
@@ -229,6 +274,41 @@ private void shipEdits(WALEntryBatch entryBatch) {
}
}

private boolean shouldPersistLogPosition() {
if (accumulatedSizeSinceLastUpdate == 0 || lastShippedBatch == null) {
return false;
}

// Default behaviour to update offset immediately after replicate()
if (offsetUpdateSizeThresholdBytes == -1 && offsetUpdateIntervalMs == Long.MAX_VALUE) {
Contributor:
This is a bit strange... If offsetUpdateSizeThresholdBytes is -1, then the accumulatedSizeSinceLastUpdate >= offsetUpdateSizeThresholdBytes check below will always return true, so we do not need this check here?

Author:
Yes, this check is not needed; I just added it here to make the default behaviour explicit.

return true;
}

return (accumulatedSizeSinceLastUpdate >= offsetUpdateSizeThresholdBytes)
|| (EnvironmentEdgeManager.currentTime() - lastOffsetUpdateTime >= offsetUpdateIntervalMs);
}
Comment on lines +277 to +289

Contributor:
This should be in the Endpoint, as the decision varies according to the type of endpoint. Today we have basically two types, buffered and non-buffered. If we have new endpoint types in the future, we might again need to come here and add the related logic.

Contributor:
Please see my comment above for more context.

For me, I do not think we need to expose this information to the shipper. The design here is that, when using a different ReplicationEndpoint, you need to tune the shipper configuration yourself, as the parameters are not only affected by the ReplicationEndpoint; they also depend on the shipper side.

For example, if you want to reduce the pressure of recording the offset, you should increase the recording interval, i.e., increase the batch size, increase the number of ship operations between offset recordings, etc. If you want to reduce the pressure on memory and on the target receiver, you should decrease the batch size. For an S3-based replication endpoint there is also a trade-off: if you increase the flush interval you get better performance and fewer files on S3, but failover becomes more complicated, as you need to restart from much earlier.

So this belongs in the documentation; merely exposing some configuration from ReplicationEndpoint cannot handle all of the above situations.
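
To make the shipper-side decision concrete, here is a standalone sketch mirroring the patch's shouldPersistLogPosition logic; the 64 MB / 5 minute values are illustrative, not recommendations, and the patch's defaults of -1 and Long.MAX_VALUE make it persist after every shipped batch:

```java
import java.util.concurrent.TimeUnit;

/** Standalone mirror of the shipper's time/size offset-persistence decision. */
public class OffsetPersistDecision {

  // Illustrative tunables: persist after 64 MB shipped or 5 minutes elapsed,
  // whichever comes first.
  private final long sizeThresholdBytes = 64L * 1024 * 1024;
  private final long intervalMs = TimeUnit.MINUTES.toMillis(5);

  private long accumulatedBytes = 0;
  private long lastPersistTime = System.currentTimeMillis();

  void onBatchShipped(long batchBytes) {
    accumulatedBytes += batchBytes;
  }

  boolean shouldPersist() {
    if (accumulatedBytes == 0) {
      return false; // nothing shipped since the last persisted offset
    }
    return accumulatedBytes >= sizeThresholdBytes
      || System.currentTimeMillis() - lastPersistTime >= intervalMs;
  }

  void onPersisted() {
    accumulatedBytes = 0;
    lastPersistTime = System.currentTimeMillis();
  }
}
```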


private void persistLogPosition() throws IOException {
if (lastShippedBatch == null) {
Contributor:
Since we could accumulate different batches in the above loop, a null batch does not mean we haven't shipped anything out. Why do we just return here if lastShippedBatch is null?

Author:
As far as I understand, lastShippedBatch == null means no batch has been replicated yet, so we don't need to update the offset. Please correct me if I am wrong here.

lastShippedBatch is null by default during ReplicationSourceShipper initialisation, and it is updated as soon as a batch is replicated.

Contributor:
Ah, OK, you do not reset the lastShippedBatch when reading a new batch. But it still makes me a bit nervous: how can we get here when lastShippedBatch is null...

Contributor:
We cannot. This method is not called when lastShippedBatch is null in the current implementation, but we can still keep the check here for safety.

return;
}

ReplicationEndpoint endpoint = source.getReplicationEndpoint();
endpoint.beforePersistingReplicationOffset();

// Clean up hfile references
for (Entry entry : entriesForCleanUpHFileRefs) {
cleanUpHFileRefs(entry.getEdit());
}
entriesForCleanUpHFileRefs.clear();

accumulatedSizeSinceLastUpdate = 0;
lastOffsetUpdateTime = EnvironmentEdgeManager.currentTime();

// Log and clean up WAL logs
updateLogPosition(lastShippedBatch);
}

private void cleanUpHFileRefs(WALEdit edit) throws IOException {
String peerId = source.getPeerId();
if (peerId.contains("-")) {