failedTests = new ArrayList<>();
+
+ public static void main(String[] args) throws Exception {
+ System.out.println("=== Consensus-Based Subscription Table Model Test Suite ===\n");
+
+ String targetTest = args.length > 0 ? args[0] : null;
+
+ if (targetTest == null || "testBasicFlow".equals(targetTest)) {
+ runTest("testBasicFlow", ConsensusSubscriptionTableTest::testBasicFlow);
+ }
+ if (targetTest == null || "testDataTypes".equals(targetTest)) {
+ runTest("testDataTypes", ConsensusSubscriptionTableTest::testDataTypes);
+ }
+ if (targetTest == null || "testPathFiltering".equals(targetTest)) {
+ runTest("testPathFiltering", ConsensusSubscriptionTableTest::testPathFiltering);
+ }
+ if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
+ runTest(
+ "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion);
+ }
+ if (targetTest == null || "testRedelivery".equals(targetTest)) {
+ runTest("testRedelivery", ConsensusSubscriptionTableTest::testRedelivery);
+ }
+ if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) {
+ runTest("testMultiEntityIsolation", ConsensusSubscriptionTableTest::testMultiEntityIsolation);
+ }
+ if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) {
+ runTest(
+ "testBurstWriteGapRecovery", ConsensusSubscriptionTableTest::testBurstWriteGapRecovery);
+ }
+ if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) {
+ runTest(
+ "testCommitAfterUnsubscribe", ConsensusSubscriptionTableTest::testCommitAfterUnsubscribe);
+ }
+
+ // Summary
+ System.out.println("\n=== Test Suite Summary ===");
+ System.out.println("Passed: " + passed);
+ System.out.println("Failed: " + failed);
+ if (!failedTests.isEmpty()) {
+ System.out.println("Failed tests: " + failedTests);
+ }
+ System.out.println("=== Done ===");
+ }
+
+ // ============================
+ // Test Infrastructure
+ // ============================
+
/** A single test case body; throwing AssertionError marks FAILED, any Exception marks ERROR. */
@FunctionalInterface
interface TestMethod {
  void run() throws Exception;
}
+
+ private static void runTest(String name, TestMethod test) {
+ System.out.println("\n" + "=================================================================");
+ System.out.println("Running: " + name);
+ System.out.println("=================================================================");
+ try {
+ test.run();
+ passed++;
+ System.out.println(">>> PASSED: " + name);
+ } catch (AssertionError e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> FAILED: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ } catch (Exception e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> ERROR: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ }
+ }
+
+ private static String nextDatabase() {
+ testCounter++;
+ return "csub_tbl_" + testCounter;
+ }
+
+ private static String nextTopic() {
+ return "topic_tbl_" + testCounter;
+ }
+
+ private static String nextConsumerGroup() {
+ return "cg_tbl_" + testCounter;
+ }
+
+ private static String nextConsumerId() {
+ return "consumer_tbl_" + testCounter;
+ }
+
+ private static ITableSession openTableSession() throws Exception {
+ return new TableSessionBuilder()
+ .nodeUrls(Collections.singletonList(HOST + ":" + PORT))
+ .username(USER)
+ .password(PASSWORD)
+ .build();
+ }
+
+ private static void createDatabaseAndTable(
+ ITableSession session, String database, String tableName, String tableSchema)
+ throws Exception {
+ session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database);
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement(String.format("CREATE TABLE %s (%s)", tableName, tableSchema));
+ }
+
+ private static void deleteDatabase(String database) {
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("DROP DATABASE IF EXISTS " + database);
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+
+ private static void dropTopicTable(String topicName) {
+ try (ISubscriptionTableSession subSession =
+ new SubscriptionTableSessionBuilder()
+ .host(HOST)
+ .port(PORT)
+ .username(USER)
+ .password(PASSWORD)
+ .build()) {
+ subSession.dropTopicIfExists(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+
+ private static void createTopicTable(String topicName, String dbKey, String tableKey)
+ throws Exception {
+ try (ISubscriptionTableSession subSession =
+ new SubscriptionTableSessionBuilder()
+ .host(HOST)
+ .port(PORT)
+ .username(USER)
+ .password(PASSWORD)
+ .build()) {
+ try {
+ subSession.dropTopicIfExists(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+
+ Properties topicConfig = new Properties();
+ topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE);
+ topicConfig.put(
+ TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE);
+ topicConfig.put(TopicConstant.DATABASE_KEY, dbKey);
+ topicConfig.put(TopicConstant.TABLE_KEY, tableKey);
+ subSession.createTopic(topicName, topicConfig);
+ System.out.println(
+ " Created topic: " + topicName + " (database=" + dbKey + ", table=" + tableKey + ")");
+ }
+ }
+
+ private static ISubscriptionTablePullConsumer createConsumer(
+ String consumerId, String consumerGroupId) throws Exception {
+ ISubscriptionTablePullConsumer consumer =
+ new SubscriptionTablePullConsumerBuilder()
+ .host(HOST)
+ .port(PORT)
+ .consumerId(consumerId)
+ .consumerGroupId(consumerGroupId)
+ .autoCommit(false)
+ .build();
+ consumer.open();
+ return consumer;
+ }
+
+ // ============================
+ // Polling & Verification
+ // ============================
+
/**
 * Convenience overload: poll with a 1s timeout and commit every message. After reaching
 * expectedRows, continues polling until 3 consecutive empty rounds verify that no extra
 * data arrives. (The javadoc previously said 5; the implementation completes at 3.)
 */
private static PollResult pollUntilComplete(
    ISubscriptionTablePullConsumer consumer, int expectedRows, int maxPollAttempts) {
  return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true);
}
+
+ /**
+ * Poll until we accumulate the expected number of rows, then verify no extra data arrives.
+ *
+ * After reaching expectedRows, continues polling until 5 consecutive empty polls confirm
+ * quiescence. Any extra rows polled are included in the count (will break assertEquals).
+ *
+ * @param commitMessages if false, messages are NOT committed
+ */
+ private static PollResult pollUntilComplete(
+ ISubscriptionTablePullConsumer consumer,
+ int expectedRows,
+ int maxPollAttempts,
+ long pollTimeoutMs,
+ boolean commitMessages) {
+ PollResult result = new PollResult();
+ int consecutiveEmpty = 0;
+
+ for (int attempt = 1; attempt <= maxPollAttempts; attempt++) {
+ List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs));
+
+ if (messages.isEmpty()) {
+ consecutiveEmpty++;
+ // Normal completion: reached expected rows and verified quiescence
+ if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) {
+ System.out.println(
+ " Verified: "
+ + consecutiveEmpty
+ + " consecutive empty polls after "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Stuck: have data but cannot reach expected count
+ if (consecutiveEmpty >= 5 && result.totalRows > 0) {
+ System.out.println(
+ " Stuck: "
+ + consecutiveEmpty
+ + " consecutive empty polls at "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Never received anything
+ if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) {
+ System.out.println(" No data received after " + consecutiveEmpty + " polls");
+ break;
+ }
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException ignored) {
+ }
+ continue;
+ }
+
+ consecutiveEmpty = 0;
+
+ for (SubscriptionMessage message : messages) {
+ for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) {
+ String tableName = dataSet.getTableName();
+ String databaseName = dataSet.getDatabaseName();
+ List columnNames = dataSet.getColumnNames();
+
+ while (dataSet.hasNext()) {
+ org.apache.tsfile.read.common.RowRecord record = dataSet.next();
+ result.totalRows++;
+ if (tableName != null) {
+ result.rowsPerTable.merge(tableName, 1, Integer::sum);
+ }
+ if (databaseName != null) {
+ result.rowsPerDatabase.merge(databaseName, 1, Integer::sum);
+ }
+ for (int i = 0; i < columnNames.size(); i++) {
+ result.seenColumns.add(columnNames.get(i));
+ }
+ if (result.totalRows <= 5) {
+ System.out.println(
+ " Row: time="
+ + record.getTimestamp()
+ + ", values="
+ + record.getFields()
+ + ", table="
+ + tableName
+ + ", database="
+ + databaseName);
+ }
+ }
+ }
+ if (commitMessages) {
+ consumer.commitSync(message);
+ }
+ }
+
+ System.out.println(
+ " Poll attempt "
+ + attempt
+ + ": totalRows="
+ + result.totalRows
+ + " / expected="
+ + expectedRows);
+
+ // Stop immediately if we exceeded the expected row count
+ if (expectedRows > 0 && result.totalRows > expectedRows) {
+ System.out.println(
+ " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows);
+ break;
+ }
+ }
+
+ return result;
+ }
+
+ // ============================
+ // Cleanup
+ // ============================
+
+ /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */
+ private static void cleanup(
+ ISubscriptionTablePullConsumer consumer, String topicName, String database) {
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName);
+ deleteDatabase(database);
+ }
+
+ /** Clean up with multiple databases. */
+ private static void cleanup(
+ ISubscriptionTablePullConsumer consumer, String topicName, String... databases) {
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName);
+ for (String db : databases) {
+ deleteDatabase(db);
+ }
+ }
+
+ // ============================
+ // Result & Assertions
+ // ============================
+
+ static class PollResult {
+ int totalRows = 0;
+ Map rowsPerTable = new HashMap<>();
+ Map rowsPerDatabase = new HashMap<>();
+ Set seenColumns = new HashSet<>();
+
+ @Override
+ public String toString() {
+ return "PollResult{totalRows="
+ + totalRows
+ + ", rowsPerTable="
+ + rowsPerTable
+ + ", rowsPerDatabase="
+ + rowsPerDatabase
+ + ", seenColumns="
+ + seenColumns
+ + "}";
+ }
+ }
+
+ private static void assertEquals(String msg, int expected, int actual) {
+ if (expected != actual) {
+ throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual);
+ }
+ }
+
+ private static void assertTrue(String msg, boolean condition) {
+ if (!condition) {
+ throw new AssertionError(msg);
+ }
+ }
+
+ private static void assertAtLeast(String msg, int min, int actual) {
+ if (actual < min) {
+ throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual);
+ }
+ }
+
+ // ======================================================================
+ // Test 1: Basic Flow (merged: BasicDataDelivery + MultiTables + Flush)
+ // ======================================================================
/**
 * Verifies:
 *
 * <ul>
 *   <li>Data written BEFORE subscribe is NOT received
 *   <li>Multiple tables (t1, t2, t3) written AFTER subscribe are all received
 *   <li>Flush does not cause data loss (WAL pinning keeps entries available)
 *   <li>Exact row count matches expectation
 * </ul>
 */
private static void testBasicFlow() throws Exception {
  String database = nextDatabase();
  String topicName = nextTopic();
  String consumerGroupId = nextConsumerGroup();
  String consumerId = nextConsumerId();
  ISubscriptionTablePullConsumer consumer = null;

  try {
    // Step 1: Write initial data to create DataRegion (should NOT be received)
    System.out.println(" Step 1: Writing initial data (should NOT be received)");
    try (ITableSession session = openTableSession()) {
      createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
      session.executeNonQueryStatement("USE " + database);
      session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
      session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)");
      for (int i = 0; i < 50; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
      }
      session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("flush");
    }
    // Sleep gives the flush time to settle before the topic is created.
    Thread.sleep(2000);

    // Step 2: Create topic and subscribe
    System.out.println(" Step 2: Creating topic and subscribing");
    createTopicTable(topicName, database, ".*");
    Thread.sleep(1000);

    consumer = createConsumer(consumerId, consumerGroupId);
    consumer.subscribe(topicName);
    Thread.sleep(3000);

    // Step 3: Write to 3 tables (30 rows each = 90 total), then flush
    System.out.println(" Step 3: Writing 30 rows x 3 tables AFTER subscribe, then flush");
    try (ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("USE " + database);
      for (int i = 100; i < 130; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
        session.executeNonQueryStatement(
            String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
        session.executeNonQueryStatement(
            String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
      }
      System.out.println(" Flushing...");
      session.executeNonQueryStatement("flush");
    }
    Thread.sleep(2000);

    // Step 4: Poll and verify
    System.out.println(" Step 4: Polling...");
    PollResult result = pollUntilComplete(consumer, 90, 100);
    System.out.println(" Result: " + result);

    // Exactly 90: pre-subscribe rows must not leak into the count.
    assertEquals("Expected exactly 90 rows (30 per table)", 90, result.totalRows);
    if (!result.rowsPerTable.isEmpty()) {
      System.out.println(" Rows per table: " + result.rowsPerTable);
      for (String tbl : new String[] {"t1", "t2", "t3"}) {
        Integer tblRows = result.rowsPerTable.get(tbl);
        assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0);
      }
    }
  } finally {
    cleanup(consumer, topicName, database);
  }
}
+
+ // ======================================================================
+ // Test 2: Data Types (merged: MultipleDataTypes + MultiColumnTypes + CrossPartition)
+ // ======================================================================
+ /**
+ * Verifies:
+ *
+ *
+ * - Non-aligned: 6 data types via separate INSERTs
+ *
- All-column: 6 fields in a single INSERT
+ *
- Cross-partition: timestamps >1 week apart via SQL, Tablet methods
+ *
+ */
+ private static void testDataTypes() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+ final long GAP = 604_800_001L; // slightly over 1 week
+
+ try {
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(
+ session,
+ database,
+ "t1",
+ "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
+ + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
+ + "s_text TEXT FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ // Init row to force DataRegion creation
+ session.executeNonQueryStatement(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ int totalExpected = 0;
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+
+ // --- Part A: 6 data types x 20 rows, separate INSERTs ---
+ System.out.println(" Part A: 6 data types x 20 rows (separate INSERTs)");
+ for (int i = 1; i <= 20; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int64, time) VALUES ('d1', %d, %d)",
+ (long) i * 100000L, i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_float, time) VALUES ('d1', %f, %d)", i * 1.1f, i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_double, time) VALUES ('d1', %f, %d)", i * 2.2, i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_bool, time) VALUES ('d1', %s, %d)",
+ i % 2 == 0 ? "true" : "false", i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i));
+ }
+ totalExpected += 120; // 6 types x 20 rows
+
+ // --- Part B: All-column rows (50 rows) ---
+ System.out.println(" Part B: 50 all-column rows");
+ for (int i = 21; i <= 70; i++) {
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)"
+ + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)",
+ i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i));
+ }
+ totalExpected += 50;
+
+ // --- Part C: Cross-partition writes ---
+ System.out.println(" Part C: Cross-partition (SQL single, multi, Tablet)");
+ long baseTs = 1_000_000_000L;
+
+ // SQL single-row x2
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'xp_single_1', %d)",
+ baseTs));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'xp_single_2', %d)",
+ baseTs + GAP));
+ totalExpected += 2;
+
+ // SQL multi-row x3
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'xp_multi_1', %d), "
+ + "('d1', 4, 400, 4.4, 4.44, false, 'xp_multi_2', %d), "
+ + "('d1', 5, 500, 5.5, 5.55, true, 'xp_multi_3', %d)",
+ baseTs + GAP * 2, baseTs + GAP * 3, baseTs + GAP * 4));
+ totalExpected += 3;
+
+ // Tablet x4
+ List schemaList = new ArrayList<>();
+ schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING));
+ schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING));
+
+ List categories =
+ java.util.Arrays.asList(
+ ColumnCategory.TAG,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD);
+
+ Tablet tablet =
+ new Tablet(
+ "t1",
+ IMeasurementSchema.getMeasurementNameList(schemaList),
+ IMeasurementSchema.getDataTypeList(schemaList),
+ categories,
+ 10);
+ for (int i = 0; i < 4; i++) {
+ int row = tablet.getRowSize();
+ long ts = baseTs + GAP * (5 + i);
+ tablet.addTimestamp(row, ts);
+ tablet.addValue("tag1", row, "d1");
+ tablet.addValue("s_int32", row, 6 + i);
+ tablet.addValue("s_int64", row, (long) (600 + i * 100));
+ tablet.addValue("s_float", row, (6 + i) * 1.1f);
+ tablet.addValue("s_double", row, (6 + i) * 2.22);
+ tablet.addValue("s_bool", row, i % 2 == 0);
+ tablet.addValue("s_text", row, "xp_tablet_" + (i + 1));
+ }
+ session.insert(tablet);
+ totalExpected += 4;
+ }
+
+ System.out.println(" Total expected rows: " + totalExpected);
+ Thread.sleep(2000);
+
+ PollResult result = pollUntilComplete(consumer, totalExpected, 200);
+ System.out.println(" Result: " + result);
+
+ assertAtLeast(
+ "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows);
+ assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size());
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 3: Path Filtering (merged: TableLevel + DatabaseLevel)
+ // ======================================================================
/**
 * Verifies:
 *
 * <ul>
 *   <li>Table-level: topic on table=t1 does NOT deliver t2 data
 *   <li>Database-level: topic on db1 does NOT deliver db2 data
 * </ul>
 */
private static void testPathFiltering() throws Exception {
  String database1 = nextDatabase();
  String database2 = database1 + "_other";
  String topicName = nextTopic();
  String consumerGroupId = nextConsumerGroup();
  String consumerId = nextConsumerId();
  ISubscriptionTablePullConsumer consumer = null;

  try {
    try (ITableSession session = openTableSession()) {
      // db1 with t1 and t2
      createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
      session.executeNonQueryStatement("USE " + database1);
      session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
      session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
      // db2 with t1
      createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
      session.executeNonQueryStatement("USE " + database2);
      session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("flush");
    }
    Thread.sleep(2000);

    // Topic: only db1, only table t1
    createTopicTable(topicName, database1, "t1");
    Thread.sleep(1000);

    consumer = createConsumer(consumerId, consumerGroupId);
    consumer.subscribe(topicName);
    Thread.sleep(3000);

    System.out.println(" Writing to db1.t1, db1.t2, db2.t1 (topic filter: db1.t1 only)");
    try (ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("USE " + database1);
      for (int i = 100; i < 150; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
        session.executeNonQueryStatement(
            String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
      }
      session.executeNonQueryStatement("USE " + database2);
      for (int i = 100; i < 150; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
      }
    }
    Thread.sleep(2000);

    System.out.println(" Polling (expecting only db1.t1 data = 50 rows)...");
    PollResult result = pollUntilComplete(consumer, 50, 60);
    System.out.println(" Result: " + result);

    // Exactly 50: any rows from db1.t2 or db2.t1 would inflate the count.
    assertEquals("Expected exactly 50 rows from db1.t1 only", 50, result.totalRows);
    if (!result.rowsPerTable.isEmpty()) {
      Integer t2Rows = result.rowsPerTable.get("t2");
      assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0);
      System.out.println(" Table filtering verified: t1 only");
    }
    if (!result.rowsPerDatabase.isEmpty()) {
      Integer db2Rows = result.rowsPerDatabase.get(database2);
      assertTrue("Expected NO rows from " + database2, db2Rows == null || db2Rows == 0);
      System.out.println(" Database filtering verified: " + database1 + " only");
    }
  } finally {
    cleanup(consumer, topicName, database1, database2);
  }
}
+
+ // ======================================================================
+ // Test 4: Subscribe Before Region Creation (kept as-is)
+ // ======================================================================
/**
 * Subscribe BEFORE the database/region exists, then create database and write. Tests the
 * IoTConsensus.onNewPeerCreated auto-binding path with table model.
 */
private static void testSubscribeBeforeRegion() throws Exception {
  String database = nextDatabase();
  String topicName = nextTopic();
  String consumerGroupId = nextConsumerGroup();
  String consumerId = nextConsumerId();
  ISubscriptionTablePullConsumer consumer = null;

  try {
    System.out.println(" Step 1: Creating topic BEFORE database exists");
    createTopicTable(topicName, database, ".*");
    Thread.sleep(1000);

    System.out.println(" Step 2: Subscribing (no DataRegion exists yet)");
    consumer = createConsumer(consumerId, consumerGroupId);
    consumer.subscribe(topicName);
    Thread.sleep(3000);

    System.out.println(" Step 3: Creating database, table and writing data (100 rows)");
    try (ITableSession session = openTableSession()) {
      createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
      session.executeNonQueryStatement("USE " + database);
      for (int i = 0; i < 100; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
      }
    }
    // Longer sleep here: region creation plus auto-binding must happen before polling.
    Thread.sleep(5000);

    System.out.println(" Step 4: Polling...");
    PollResult result = pollUntilComplete(consumer, 100, 100);
    System.out.println(" Result: " + result);

    if (result.totalRows >= 100) {
      System.out.println(" Auto-binding works! All " + result.totalRows + " rows received.");
    } else if (result.totalRows > 0) {
      System.out.println(
          " Partial: " + result.totalRows + "/100 rows. First writes may precede binding.");
    } else {
      System.out.println(" No data received. Check logs for auto-binding messages.");
    }
    // At minimum some rows must arrive, otherwise auto-binding never happened.
    assertAtLeast(
        "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows);
  } finally {
    cleanup(consumer, topicName, database);
  }
}
+
+ // ======================================================================
+ // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit)
+ // ======================================================================
+ /** Tests at-least-once delivery with a mixed commit/no-commit pattern. */
+ private static void testRedelivery() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+
+ try {
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ final int totalRows = 50;
+ System.out.println(" Writing " + totalRows + " rows");
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 1; i <= totalRows; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
+ }
+ Thread.sleep(3000);
+
+ int totalRowsCommitted = 0;
+ int roundNumber = 0;
+ boolean hasPending = false;
+ List pendingTimestamps = new ArrayList<>();
+ Set allCommittedTimestamps = new HashSet<>();
+ int redeliveryCount = 0;
+
+ for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+ List msgs = consumer.poll(Duration.ofMillis(5000));
+ if (msgs.isEmpty()) {
+ Thread.sleep(1000);
+ continue;
+ }
+
+ for (SubscriptionMessage msg : msgs) {
+ List currentTimestamps = new ArrayList<>();
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ currentTimestamps.add(ds.next().getTimestamp());
+ }
+ }
+ assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+
+ if (hasPending) {
+ assertTrue(
+ "Re-delivery timestamp list mismatch: expected="
+ + pendingTimestamps
+ + ", actual="
+ + currentTimestamps,
+ currentTimestamps.equals(pendingTimestamps));
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ hasPending = false;
+ redeliveryCount++;
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] Re-delivered & committed: timestamps="
+ + currentTimestamps);
+ } else {
+ if (totalRowsCommitted > 0) {
+ boolean overlap = false;
+ for (Long ts : currentTimestamps) {
+ if (allCommittedTimestamps.contains(ts)) {
+ overlap = true;
+ break;
+ }
+ }
+ assertTrue(
+ "After commit, should receive different data (overlap detected)", !overlap);
+ }
+
+ if (roundNumber % 2 == 0) {
+ pendingTimestamps = new ArrayList<>(currentTimestamps);
+ hasPending = true;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (NOT committed): timestamps="
+ + currentTimestamps);
+ } else {
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (committed directly): timestamps="
+ + currentTimestamps);
+ }
+ }
+ }
+ }
+
+ assertEquals("Should have committed all rows", totalRows, totalRowsCommitted);
+ assertTrue(
+ "Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
+ redeliveryCount > 0);
+
+ System.out.println(" Final poll: expecting no data");
+ int extraRows = 0;
+ for (int i = 0; i < 3; i++) {
+ List msgs = consumer.poll(Duration.ofMillis(2000));
+ for (SubscriptionMessage msg : msgs) {
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ extraRows++;
+ }
+ }
+ }
+ }
+ assertEquals("After all committed, should receive no more data", 0, extraRows);
+ System.out.println(
+ " At-least-once re-delivery verified: "
+ + totalRows
+ + " rows committed with "
+ + redeliveryCount
+ + " re-delivery rounds");
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+  // ======================================================================
+  // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic)
+  // ======================================================================
+  /**
+   * Verifies:
+   *
+   * <ul>
+   *   <li>Two consumer groups on same topic: each group gets ALL data independently
+   *   <li>One consumer subscribes to two topics with different TABLE_KEY filters: each topic
+   *       delivers only matching data
+   * </ul>
+   */
+  private static void testMultiEntityIsolation() throws Exception {
+    String database = nextDatabase();
+    String topicName1 = "topic_tbl_multi_" + testCounter + "_a";
+    String topicName2 = "topic_tbl_multi_" + testCounter + "_b";
+    String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a";
+    String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a";
+    String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b";
+    String consumerId2 = "consumer_tbl_multi_" + testCounter + "_b";
+    ISubscriptionTablePullConsumer consumer1 = null;
+    ISubscriptionTablePullConsumer consumer2 = null;
+
+    try {
+      // Setup: database with t1 and t2
+      try (ITableSession session = openTableSession()) {
+        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+        session.executeNonQueryStatement("USE " + database);
+        session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
+        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+        session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
+        session.executeNonQueryStatement("flush");
+      }
+      Thread.sleep(2000);
+
+      // Topic 1: covers t1 only, Topic 2: covers t2 only
+      createTopicTable(topicName1, database, "t1");
+      createTopicTable(topicName2, database, "t2");
+      Thread.sleep(1000);
+
+      // Consumer 1 (group A): subscribes to BOTH topics
+      consumer1 = createConsumer(consumerId1, consumerGroupId1);
+      consumer1.subscribe(topicName1, topicName2);
+      // Consumer 2 (group B): subscribes to BOTH topics
+      consumer2 = createConsumer(consumerId2, consumerGroupId2);
+      consumer2.subscribe(topicName1, topicName2);
+      Thread.sleep(3000);
+
+      // Write 30 rows to t1, 40 rows to t2 (70 rows total per topic pair)
+      System.out.println(" Writing 30 rows to t1, 40 rows to t2");
+      try (ITableSession session = openTableSession()) {
+        session.executeNonQueryStatement("USE " + database);
+        for (int i = 1; i <= 40; i++) {
+          if (i <= 30) {
+            session.executeNonQueryStatement(
+                String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+          }
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
+        }
+      }
+      Thread.sleep(2000);
+
+      // Part A: Both groups should get 70 rows independently
+      System.out.println(" Part A: Multi-group isolation");
+      System.out.println(" Polling from group 1...");
+      PollResult result1 = pollUntilComplete(consumer1, 70, 80);
+      System.out.println(" Group 1 result: " + result1);
+
+      System.out.println(" Polling from group 2...");
+      PollResult result2 = pollUntilComplete(consumer2, 70, 80);
+      System.out.println(" Group 2 result: " + result2);
+
+      assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows);
+      assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows);
+
+      // Part B: Verify per-topic table isolation (only when per-table stats were collected)
+      if (!result1.rowsPerTable.isEmpty()) {
+        Integer t1Rows = result1.rowsPerTable.get("t1");
+        Integer t2Rows = result1.rowsPerTable.get("t2");
+        assertEquals("Expected 30 rows from t1 (topic1)", 30, t1Rows != null ? t1Rows : 0);
+        assertEquals("Expected 40 rows from t2 (topic2)", 40, t2Rows != null ? t2Rows : 0);
+        System.out.println(" Multi-topic isolation verified: t1=" + t1Rows + ", t2=" + t2Rows);
+      }
+      System.out.println(
+          " Multi-group isolation verified: group1="
+              + result1.totalRows
+              + ", group2="
+              + result2.totalRows);
+    } finally {
+      // Best-effort teardown for both consumers, then topics and database.
+      if (consumer1 != null) {
+        try {
+          consumer1.unsubscribe(topicName1, topicName2);
+        } catch (Exception e) {
+          /* ignore */
+        }
+        try {
+          consumer1.close();
+        } catch (Exception e) {
+          /* ignore */
+        }
+      }
+      if (consumer2 != null) {
+        try {
+          consumer2.unsubscribe(topicName1, topicName2);
+        } catch (Exception e) {
+          /* ignore */
+        }
+        try {
+          consumer2.close();
+        } catch (Exception e) {
+          /* ignore */
+        }
+      }
+      dropTopicTable(topicName1);
+      dropTopicTable(topicName2);
+      deleteDatabase(database);
+    }
+  }
+
+  // ======================================================================
+  // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix)
+  // ======================================================================
+  /**
+   * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The
+   * pending queue overflow triggers gaps, which should be recovered from WAL.
+   *
+   * <p>Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one
+   * {@code pendingEntries.offer()}. A single {@code session.insert(tablet)} with N rows in one time
+   * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually
+   * overflow, we need 4096+ individual write() calls arriving faster than the prefetch
+   * thread can drain. We achieve this with multiple concurrent writer threads, each performing
+   * individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate.
+   *
+   * <p>Note: Gap occurrence is inherently timing-dependent (race between writers and the
+   * prefetch drain loop). This test maximizes the probability by using concurrent threads, but
+   * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling
+   * from WAL" messages to confirm the gap path was exercised.
+   *
+   * <p>Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to
+   * the next prefetch iteration.
+   */
+  private static void testBurstWriteGapRecovery() throws Exception {
+    String database = nextDatabase();
+    String topicName = nextTopic();
+    String consumerGroupId = nextConsumerGroup();
+    String consumerId = nextConsumerId();
+    ISubscriptionTablePullConsumer consumer = null;
+
+    try {
+      // Seed one row so the DataRegion exists before the topic is created.
+      try (ITableSession session = openTableSession()) {
+        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+        session.executeNonQueryStatement("USE " + database);
+        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+        session.executeNonQueryStatement("flush");
+      }
+      Thread.sleep(2000);
+
+      createTopicTable(topicName, database, ".*");
+      Thread.sleep(1000);
+
+      consumer = createConsumer(consumerId, consumerGroupId);
+      consumer.subscribe(topicName);
+      Thread.sleep(3000);
+
+      // Use multiple concurrent writer threads with individual SQL INSERTs.
+      // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer().
+      // With N threads writing concurrently, aggregate rate should exceed drain rate
+      // and overflow the 4096-capacity queue, creating gaps.
+      final int writerThreads = 4;
+      final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096
+      final int totalRows = writerThreads * rowsPerThread;
+      final AtomicInteger errorCount = new AtomicInteger(0);
+      final CountDownLatch startLatch = new CountDownLatch(1);
+      final CountDownLatch doneLatch = new CountDownLatch(writerThreads);
+
+      System.out.println(
+          " Burst writing "
+              + totalRows
+              + " rows via "
+              + writerThreads
+              + " concurrent threads ("
+              + rowsPerThread
+              + " individual SQL INSERTs each)");
+      System.out.println(
+          " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)");
+
+      ExecutorService executor = Executors.newFixedThreadPool(writerThreads);
+      for (int t = 0; t < writerThreads; t++) {
+        final int threadId = t;
+        // Disjoint timestamp range per thread so rows never collide.
+        final int startTs = threadId * rowsPerThread + 1;
+        executor.submit(
+            () -> {
+              try {
+                startLatch.await(); // all threads start at the same time
+                try (ITableSession session = openTableSession()) {
+                  session.executeNonQueryStatement("USE " + database);
+                  for (int i = 0; i < rowsPerThread; i++) {
+                    int ts = startTs + i;
+                    session.executeNonQueryStatement(
+                        String.format(
+                            "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)",
+                            (long) ts * 10, ts));
+                  }
+                }
+              } catch (Exception e) {
+                System.out.println(" Writer thread " + threadId + " error: " + e.getMessage());
+                errorCount.incrementAndGet();
+              } finally {
+                doneLatch.countDown();
+              }
+            });
+      }
+
+      // Fire all threads simultaneously
+      startLatch.countDown();
+      doneLatch.await();
+      executor.shutdown();
+
+      if (errorCount.get() > 0) {
+        System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors");
+      }
+
+      // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes
+      System.out.println(
+          " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)...");
+      System.out.println(
+          " (Check server logs for 'gap detected' to confirm gap recovery was triggered)");
+      PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true);
+      System.out.println(" Result: " + result);
+
+      assertEquals(
+          "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)",
+          totalRows,
+          result.totalRows);
+    } finally {
+      cleanup(consumer, topicName, database);
+    }
+  }
+
+  // ======================================================================
+  // Test 8: Commit After Unsubscribe (NEW — tests H7 fix)
+  // ======================================================================
+  /**
+   * Tests that commit still works correctly after the consumer has unsubscribed (queue has been
+   * torn down). The commit routing should use metadata-based topic config check instead of runtime
+   * queue state.
+   *
+   * <p>Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue().
+   */
+  private static void testCommitAfterUnsubscribe() throws Exception {
+    String database = nextDatabase();
+    String topicName = nextTopic();
+    String consumerGroupId = nextConsumerGroup();
+    String consumerId = nextConsumerId();
+    ISubscriptionTablePullConsumer consumer = null;
+
+    try {
+      // Seed one row so the DataRegion exists before the topic is created.
+      try (ITableSession session = openTableSession()) {
+        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+        session.executeNonQueryStatement("USE " + database);
+        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+        session.executeNonQueryStatement("flush");
+      }
+      Thread.sleep(2000);
+
+      createTopicTable(topicName, database, ".*");
+      Thread.sleep(1000);
+
+      consumer = createConsumer(consumerId, consumerGroupId);
+      consumer.subscribe(topicName);
+      Thread.sleep(3000);
+
+      // Write data
+      System.out.println(" Writing 50 rows");
+      try (ITableSession session = openTableSession()) {
+        session.executeNonQueryStatement("USE " + database);
+        for (int i = 1; i <= 50; i++) {
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+        }
+      }
+      Thread.sleep(2000);
+
+      // Poll WITHOUT commit, holding the messages so they can be committed later.
+      System.out.println(" Polling WITHOUT commit...");
+      List<SubscriptionMessage> uncommittedMessages = new ArrayList<>();
+      int polledRows = 0;
+      for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) {
+        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+        if (msgs.isEmpty()) {
+          // One empty poll after data arrived means delivery has drained.
+          if (polledRows > 0) break;
+          Thread.sleep(500);
+          continue;
+        }
+        for (SubscriptionMessage msg : msgs) {
+          uncommittedMessages.add(msg);
+          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+            while (ds.hasNext()) {
+              ds.next();
+              polledRows++;
+            }
+          }
+        }
+      }
+      System.out.println(
+          " Polled "
+              + polledRows
+              + " rows, holding "
+              + uncommittedMessages.size()
+              + " uncommitted messages");
+      assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows);
+
+      // Unsubscribe (tears down the consensus queue)
+      System.out.println(" Unsubscribing (queue teardown)...");
+      consumer.unsubscribe(topicName);
+      Thread.sleep(2000);
+
+      // Now commit the previously polled messages — should NOT throw
+      System.out.println(
+          " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe...");
+      boolean commitSucceeded = true;
+      for (SubscriptionMessage msg : uncommittedMessages) {
+        try {
+          consumer.commitSync(msg);
+        } catch (Exception e) {
+          System.out.println(" Commit threw exception: " + e.getMessage());
+          commitSucceeded = false;
+        }
+      }
+
+      System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded);
+      System.out.println(" (Key: no exception crash, routing handled gracefully)");
+    } finally {
+      // Already unsubscribed above; only close the consumer and drop artifacts.
+      if (consumer != null) {
+        try {
+          consumer.close();
+        } catch (Exception e) {
+          /* ignore */
+        }
+      }
+      dropTopicTable(topicName);
+      deleteDatabase(database);
+    }
+  }
+}
diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
new file mode 100644
index 0000000000000..501b789edd738
--- /dev/null
+++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
@@ -0,0 +1,1314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb;
+
+import org.apache.iotdb.isession.ISession;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
+import org.apache.iotdb.session.Session;
+import org.apache.iotdb.session.subscription.SubscriptionTreeSession;
+import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;
+import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;
+import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet;
+
+import org.apache.tsfile.common.conf.TSFileConfig;
+import org.apache.tsfile.enums.TSDataType;
+import org.apache.tsfile.utils.Binary;
+import org.apache.tsfile.write.record.Tablet;
+import org.apache.tsfile.write.schema.IMeasurementSchema;
+import org.apache.tsfile.write.schema.MeasurementSchema;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/** TODO: move these manual tests into ITs */
+public class ConsensusSubscriptionTest {
+
+  private static final String HOST = "127.0.0.1";
+  private static final int PORT = 6667;
+  private static final String USER = "root";
+  private static final String PASSWORD = "root";
+
+  // Monotonic counter used to derive unique database/topic/consumer names per test.
+  private static int testCounter = 0;
+  private static int passed = 0;
+  private static int failed = 0;
+  private static final List<String> failedTests = new ArrayList<>();
+
+  /**
+   * Entry point. With no CLI args every test runs; with one arg only the test whose name equals
+   * {@code args[0]} runs. A pass/fail summary is printed at the end.
+   */
+  public static void main(String[] args) throws Exception {
+    System.out.println("=== Consensus-Based Subscription Test Suite ===\n");
+
+    String targetTest = args.length > 0 ? args[0] : null;
+
+    if (targetTest == null || "testBasicFlow".equals(targetTest)) {
+      runTest("testBasicFlow", ConsensusSubscriptionTest::testBasicFlow);
+    }
+    if (targetTest == null || "testDataTypes".equals(targetTest)) {
+      runTest("testDataTypes", ConsensusSubscriptionTest::testDataTypes);
+    }
+    if (targetTest == null || "testPathFiltering".equals(targetTest)) {
+      runTest("testPathFiltering", ConsensusSubscriptionTest::testPathFiltering);
+    }
+    if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
+      runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion);
+    }
+    if (targetTest == null || "testRedelivery".equals(targetTest)) {
+      runTest("testRedelivery", ConsensusSubscriptionTest::testRedelivery);
+    }
+    if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) {
+      runTest("testMultiEntityIsolation", ConsensusSubscriptionTest::testMultiEntityIsolation);
+    }
+    if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) {
+      runTest("testBurstWriteGapRecovery", ConsensusSubscriptionTest::testBurstWriteGapRecovery);
+    }
+    if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) {
+      runTest("testCommitAfterUnsubscribe", ConsensusSubscriptionTest::testCommitAfterUnsubscribe);
+    }
+
+    // Summary
+    System.out.println("\n=== Test Suite Summary ===");
+    System.out.println("Passed: " + passed);
+    System.out.println("Failed: " + failed);
+    if (!failedTests.isEmpty()) {
+      System.out.println("Failed tests: " + failedTests);
+    }
+    System.out.println("=== Done ===");
+  }
+
+ // ============================
+ // Test Infrastructure
+ // ============================
+
+  /** A single runnable test case; throws (AssertionError or Exception) to signal failure. */
+  @FunctionalInterface
+  interface TestMethod {
+    void run() throws Exception;
+  }
+
+  /** Runs a single named test, updating pass/fail counters and printing any failure's trace. */
+  private static void runTest(String name, TestMethod test) {
+    System.out.println("\n" + "=================================================================");
+    System.out.println("Running: " + name);
+    System.out.println("=================================================================");
+    try {
+      test.run();
+      passed++;
+      System.out.println(">>> PASSED: " + name);
+    } catch (AssertionError e) {
+      // Assertion helpers throw AssertionError: an expectation mismatch.
+      failed++;
+      failedTests.add(name);
+      System.out.println(">>> FAILED: " + name + " - " + e.getMessage());
+      e.printStackTrace(System.out);
+    } catch (Exception e) {
+      // Any other exception: infrastructure/setup error.
+      failed++;
+      failedTests.add(name);
+      System.out.println(">>> ERROR: " + name + " - " + e.getMessage());
+      e.printStackTrace(System.out);
+    }
+  }
+
+  /** Advances the per-test counter and returns a fresh, unique database name. */
+  private static String nextDatabase() {
+    testCounter++;
+    return "root.csub_test_" + testCounter;
+  }
+
+  /** Topic name tied to the current test counter (call after {@link #nextDatabase()}). */
+  private static String nextTopic() {
+    return "topic_csub_" + testCounter;
+  }
+
+  /** Consumer group id tied to the current test counter. */
+  private static String nextConsumerGroup() {
+    return "cg_csub_" + testCounter;
+  }
+
+  /** Consumer id tied to the current test counter. */
+  private static String nextConsumerId() {
+    return "consumer_csub_" + testCounter;
+  }
+
+  /** Opens a tree-model session against the test server; caller is responsible for closing it. */
+  private static ISession openSession() throws Exception {
+    ISession session =
+        new Session.Builder().host(HOST).port(PORT).username(USER).password(PASSWORD).build();
+    session.open();
+    return session;
+  }
+
+  /** Creates the database, tolerating an "already exists" failure. */
+  private static void createDatabase(ISession session, String database) throws Exception {
+    try {
+      session.executeNonQueryStatement("CREATE DATABASE " + database);
+    } catch (Exception e) {
+      // ignore if already exists
+    }
+  }
+
+  /** Best-effort database deletion used during cleanup; failures are swallowed. */
+  private static void deleteDatabase(String database) {
+    try (ISession session = openSession()) {
+      session.executeNonQueryStatement("DELETE DATABASE " + database);
+    } catch (Exception e) {
+      // ignore
+    }
+  }
+
+  /** Best-effort topic drop used during cleanup; failures are swallowed. */
+  private static void dropTopic(String topicName) {
+    try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) {
+      subSession.open();
+      subSession.dropTopic(topicName);
+    } catch (Exception e) {
+      // ignore
+    }
+  }
+
+  /** (Re)creates a live-mode topic over {@code path}, dropping any stale topic of the same name. */
+  private static void createTopic(String topicName, String path) throws Exception {
+    try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) {
+      subSession.open();
+      // Drop first so a leftover topic from a previous run cannot change this test's config.
+      try {
+        subSession.dropTopic(topicName);
+      } catch (Exception e) {
+        // ignore
+      }
+
+      Properties topicConfig = new Properties();
+      topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE);
+      topicConfig.put(
+          TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE);
+      topicConfig.put(TopicConstant.PATH_KEY, path);
+      subSession.createTopic(topicName, topicConfig);
+      System.out.println(" Created topic: " + topicName + " (path=" + path + ")");
+    }
+  }
+
+  /** Builds and opens a manual-commit pull consumer for the given consumer/group ids. */
+  private static SubscriptionTreePullConsumer createConsumer(
+      String consumerId, String consumerGroupId) throws Exception {
+    SubscriptionTreePullConsumer consumer =
+        new SubscriptionTreePullConsumer.Builder()
+            .host(HOST)
+            .port(PORT)
+            .consumerId(consumerId)
+            .consumerGroupId(consumerGroupId)
+            .autoCommit(false) // tests control commit timing explicitly
+            .buildPullConsumer();
+    consumer.open();
+    return consumer;
+  }
+
+ // ============================
+ // Polling & Verification
+ // ============================
+
+  /**
+   * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive
+   * empty rounds to verify no extra data arrives.
+   */
+  private static PollResult pollUntilComplete(
+      SubscriptionTreePullConsumer consumer, int expectedRows, int maxPollAttempts) {
+    return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true);
+  }
+
+  /**
+   * Polls until the expected row count is reached and confirmed by quiescent empty polls, the
+   * consumer appears stuck, or {@code maxPollAttempts} is exhausted.
+   *
+   * @param pollTimeoutMs timeout passed to each {@code poll} call
+   * @param commitMessages whether to {@code commitSync} each received message
+   * @return accumulated row counts, per-device distribution, and column names seen
+   */
+  private static PollResult pollUntilComplete(
+      SubscriptionTreePullConsumer consumer,
+      int expectedRows,
+      int maxPollAttempts,
+      long pollTimeoutMs,
+      boolean commitMessages) {
+    PollResult result = new PollResult();
+    int consecutiveEmpty = 0;
+
+    for (int attempt = 1; attempt <= maxPollAttempts; attempt++) {
+      List<SubscriptionMessage> messages = consumer.poll(Duration.ofMillis(pollTimeoutMs));
+
+      if (messages.isEmpty()) {
+        consecutiveEmpty++;
+        // Normal completion: reached expected rows and verified quiescence
+        if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) {
+          System.out.println(
+              " Verified: "
+                  + consecutiveEmpty
+                  + " consecutive empty polls after "
+                  + result.totalRows
+                  + " rows (expected "
+                  + expectedRows
+                  + ")");
+          break;
+        }
+        // Stuck: have data but cannot reach expected count
+        if (consecutiveEmpty >= 5 && result.totalRows > 0) {
+          System.out.println(
+              " Stuck: "
+                  + consecutiveEmpty
+                  + " consecutive empty polls at "
+                  + result.totalRows
+                  + " rows (expected "
+                  + expectedRows
+                  + ")");
+          break;
+        }
+        // Never received anything
+        if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) {
+          System.out.println(" No data received after " + consecutiveEmpty + " polls");
+          break;
+        }
+        try {
+          Thread.sleep(1000);
+        } catch (InterruptedException ignored) {
+        }
+        continue;
+      }
+
+      consecutiveEmpty = 0;
+
+      for (SubscriptionMessage message : messages) {
+        for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) {
+          // Column 0 is "Time"; derive the device path from the first measurement column.
+          String device = null;
+          List<String> columnNames = dataSet.getColumnNames();
+          if (columnNames.size() > 1) {
+            String fullPath = columnNames.get(1);
+            int lastDot = fullPath.lastIndexOf('.');
+            device = lastDot > 0 ? fullPath.substring(0, lastDot) : fullPath;
+          }
+
+          while (dataSet.hasNext()) {
+            org.apache.tsfile.read.common.RowRecord record = dataSet.next();
+            result.totalRows++;
+            if (device != null) {
+              result.rowsPerDevice.merge(device, 1, Integer::sum);
+            }
+            for (int i = 1; i < columnNames.size(); i++) {
+              result.seenColumns.add(columnNames.get(i));
+            }
+            // Print only the first few rows to keep logs readable.
+            if (result.totalRows <= 5) {
+              System.out.println(
+                  " Row: time="
+                      + record.getTimestamp()
+                      + ", values="
+                      + record.getFields()
+                      + ", device="
+                      + device);
+            }
+          }
+        }
+        if (commitMessages) {
+          consumer.commitSync(message);
+        }
+      }
+
+      System.out.println(
+          " Poll attempt "
+              + attempt
+              + ": totalRows="
+              + result.totalRows
+              + " / expected="
+              + expectedRows);
+
+      // Stop immediately if we exceeded the expected row count
+      if (expectedRows > 0 && result.totalRows > expectedRows) {
+        System.out.println(
+            " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows);
+        break;
+      }
+    }
+
+    return result;
+  }
+
+ // ============================
+ // Cleanup
+ // ============================
+
+  /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */
+  private static void cleanup(
+      SubscriptionTreePullConsumer consumer, String topicName, String database) {
+    // Each step is best-effort so a failure in one does not skip the rest.
+    if (consumer != null) {
+      try {
+        consumer.unsubscribe(topicName);
+      } catch (Exception e) {
+        // ignore
+      }
+      try {
+        consumer.close();
+      } catch (Exception e) {
+        // ignore
+      }
+    }
+    dropTopic(topicName);
+    deleteDatabase(database);
+  }
+
+ // ============================
+ // Result & Assertions
+ // ============================
+
+  /** Aggregated outcome of one polling session: total rows, per-device counts, columns seen. */
+  static class PollResult {
+    int totalRows = 0;
+    Map<String, Integer> rowsPerDevice = new HashMap<>();
+    Set<String> seenColumns = new HashSet<>();
+
+    @Override
+    public String toString() {
+      return "PollResult{totalRows="
+          + totalRows
+          + ", rowsPerDevice="
+          + rowsPerDevice
+          + ", seenColumns="
+          + seenColumns
+          + "}";
+    }
+  }
+
+  /** Fails with an AssertionError when {@code expected != actual}. */
+  private static void assertEquals(String msg, int expected, int actual) {
+    if (expected != actual) {
+      throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual);
+    }
+  }
+
+  /** Fails with an AssertionError when the condition is false. */
+  private static void assertTrue(String msg, boolean condition) {
+    if (!condition) {
+      throw new AssertionError(msg);
+    }
+  }
+
+  /** Fails with an AssertionError when {@code actual < min}. */
+  private static void assertAtLeast(String msg, int min, int actual) {
+    if (actual < min) {
+      throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual);
+    }
+  }
+
+  // ======================================================================
+  // Test 1: Basic Flow (merged: BasicDataDelivery + MultiDevices + Flush)
+  // ======================================================================
+  /**
+   * Verifies:
+   *
+   * <ul>
+   *   <li>Data written BEFORE subscribe is NOT received
+   *   <li>Multiple devices (d1, d2, d3) written AFTER subscribe are all received
+   *   <li>Flush does not cause data loss (WAL pinning keeps entries available)
+   *   <li>Exact row count matches expectation
+   * </ul>
+   */
+  private static void testBasicFlow() throws Exception {
+    String database = nextDatabase();
+    String topicName = nextTopic();
+    String consumerGroupId = nextConsumerGroup();
+    String consumerId = nextConsumerId();
+    SubscriptionTreePullConsumer consumer = null;
+
+    try {
+      // Step 1: Write initial data to create DataRegion (should NOT be received)
+      System.out.println(" Step 1: Writing initial data (should NOT be received)");
+      try (ISession session = openSession()) {
+        createDatabase(session, database);
+        for (int i = 0; i < 50; i++) {
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+        }
+        // Also write to d2, d3 for multi-device readiness
+        session.executeNonQueryStatement(
+            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
+        session.executeNonQueryStatement(
+            String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database));
+        session.executeNonQueryStatement("flush");
+      }
+      Thread.sleep(2000);
+
+      // Step 2: Create topic and subscribe
+      System.out.println(" Step 2: Creating topic and subscribing");
+      createTopic(topicName, database + ".**");
+      Thread.sleep(1000);
+
+      consumer = createConsumer(consumerId, consumerGroupId);
+      consumer.subscribe(topicName);
+      Thread.sleep(3000);
+
+      // Step 3: Write to 3 devices (30 rows each = 90 total), then flush
+      System.out.println(" Step 3: Writing 30 rows x 3 devices AFTER subscribe, then flush");
+      try (ISession session = openSession()) {
+        for (int i = 100; i < 130; i++) {
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
+          session.executeNonQueryStatement(
+              String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30));
+        }
+        System.out.println(" Flushing...");
+        session.executeNonQueryStatement("flush");
+      }
+      Thread.sleep(2000);
+
+      // Step 4: Poll and verify
+      System.out.println(" Step 4: Polling...");
+      PollResult result = pollUntilComplete(consumer, 90, 100);
+      System.out.println(" Result: " + result);
+
+      assertEquals("Expected exactly 90 rows (30 per device)", 90, result.totalRows);
+      if (!result.rowsPerDevice.isEmpty()) {
+        System.out.println(" Rows per device: " + result.rowsPerDevice);
+        for (String dev : new String[] {"d1", "d2", "d3"}) {
+          Integer devRows = result.rowsPerDevice.get(database + "." + dev);
+          assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0);
+        }
+      }
+    } finally {
+      cleanup(consumer, topicName, database);
+    }
+  }
+
+ // ======================================================================
+ // Test 2: Data Types (merged: MultipleDataTypes + Aligned + CrossPartition)
+ // ======================================================================
+ /**
+ * Verifies:
+ *
+ *
+ * - Non-aligned: 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT)
+ *
- Aligned: 6 data types, cross-partition timestamps (>1 week apart)
+ *
- 6 write methods: SQL single/multi-row, insertAlignedRecord/Records/Tablet/Tablets
+ *
+ */
+ private static void testDataTypes() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+ final long GAP = 604_800_001L; // slightly over 1 week
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ // Create aligned timeseries
+ session.executeNonQueryStatement(
+ String.format(
+ "CREATE ALIGNED TIMESERIES %s.d_aligned"
+ + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
+ + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
+ database));
+ // Init rows to force DataRegion creation
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
+ database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ int totalExpected = 0;
+ final String device = database + ".d_aligned";
+ List measurements =
+ Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text");
+ List types =
+ Arrays.asList(
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.BOOLEAN,
+ TSDataType.TEXT);
+ List schemas = new ArrayList<>();
+ schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT));
+
+ try (ISession session = openSession()) {
+ // --- Part A: Non-aligned, 6 types x 20 rows ---
+ System.out.println(" Part A: Non-aligned 6 data types x 20 rows");
+ for (int i = 1; i <= 20; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s_int64) VALUES (%d, %d)",
+ database, i, (long) i * 100000L));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s_float) VALUES (%d, %f)", database, i, i * 1.1f));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s_double) VALUES (%d, %f)", database, i, i * 2.2));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s_bool) VALUES (%d, %s)",
+ database, i, i % 2 == 0 ? "true" : "false"));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i));
+ }
+ totalExpected += 120; // 6 types x 20 rows
+
+ // --- Part B: Aligned cross-partition, 6 write methods ---
+ System.out.println(" Part B: Aligned cross-partition, 6 write methods");
+
+ // Method 1: SQL single row
+ long t1 = 1;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')",
+ database, t1));
+ totalExpected += 1;
+
+ // Method 2: SQL multi-row (cross-partition)
+ long t2a = 1 + GAP;
+ long t2b = 1 + 2 * GAP;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a'),"
+ + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')",
+ database, t2a, t2b));
+ totalExpected += 2;
+
+ // Method 3: insertAlignedRecord
+ long t3 = 1 + 3 * GAP;
+ session.insertAlignedRecord(
+ device,
+ t3,
+ measurements,
+ types,
+ Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single"));
+ totalExpected += 1;
+
+ // Method 4: insertAlignedRecordsOfOneDevice (cross-partition)
+ long t4a = 1 + 4 * GAP;
+ long t4b = 1 + 5 * GAP;
+ session.insertAlignedRecordsOfOneDevice(
+ device,
+ Arrays.asList(t4a, t4b),
+ Arrays.asList(measurements, measurements),
+ Arrays.asList(types, types),
+ Arrays.asList(
+ Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"),
+ Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b")));
+ totalExpected += 2;
+
+ // Method 5: insertAlignedTablet (cross-partition)
+ long t5a = 1 + 6 * GAP;
+ long t5b = 1 + 7 * GAP;
+ Tablet tablet5 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a");
+ addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b");
+ session.insertAlignedTablet(tablet5);
+ totalExpected += 2;
+
+ // Method 6: insertAlignedTablets (cross-partition)
+ long t6a = 1 + 8 * GAP;
+ long t6b = 1 + 9 * GAP;
+ Tablet tablet6 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a");
+ addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b");
+ Map<String, Tablet> tabletMap = new HashMap<>();
+ tabletMap.put(device, tablet6);
+ session.insertAlignedTablets(tabletMap);
+ totalExpected += 2;
+ }
+
+ System.out.println(" Total expected rows: " + totalExpected);
+ Thread.sleep(2000);
+
+ PollResult result = pollUntilComplete(consumer, totalExpected, 150);
+ System.out.println(" Result: " + result);
+
+ assertAtLeast(
+ "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows);
+ assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size());
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 3: Path Filtering (merged: DeviceLevel + TimeseriesLevel)
+ // ======================================================================
+ /**
+ * Verifies:
+ *
+ *
+ * - Device-level: topic on d1.** does NOT deliver d2 data
+ *
+ * - Timeseries-level: topic on d1.s1 — lenient check for s2 filtering
+ *
+ */
+ private static void testPathFiltering() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ // Topic filters d1.s1 only (timeseries-level)
+ String filterPath = database + ".d1.s1";
+ createTopic(topicName, filterPath);
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ System.out.println(" Writing to d1 (s1 + s2) and d2 (s1)");
+ try (ISession session = openSession()) {
+ for (int i = 100; i < 150; i++) {
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)",
+ database, i, i * 10, i * 20));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 30));
+ }
+ }
+ Thread.sleep(2000);
+
+ System.out.println(" Polling (expecting d1 data only, ideally s1 only)...");
+ PollResult result = pollUntilComplete(consumer, 50, 60);
+ System.out.println(" Result: " + result);
+
+ // Device-level: d2 must NOT appear
+ if (!result.rowsPerDevice.isEmpty()) {
+ Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
+ assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0);
+ Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
+ assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0);
+ System.out.println(" Device filtering verified: d1=" + d1Rows + ", d2=" + d2Rows);
+ }
+
+ // Timeseries-level: lenient check
+ boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2"));
+ if (hasS2) {
+ System.out.println(
+ " INFO: Both s1 and s2 received — converter uses device-level filtering only.");
+ assertAtLeast("Should have received d1 rows", 50, result.totalRows);
+ } else {
+ System.out.println(" Timeseries-level filtering verified: only s1 data received");
+ assertEquals("Expected exactly 50 rows from d1.s1 only", 50, result.totalRows);
+ }
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 4: Subscribe Before Region Creation (kept as-is)
+ // ======================================================================
+ /**
+ * Subscribe BEFORE the database/region exists, then create database and write. Tests the
+ * IoTConsensus.onNewPeerCreated auto-binding path.
+ */
+ private static void testSubscribeBeforeRegion() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ System.out.println(" Step 1: Creating topic BEFORE database exists");
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ System.out.println(" Step 2: Subscribing (no DataRegion exists yet)");
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ System.out.println(" Step 3: Creating database and writing data (100 rows)");
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ for (int i = 0; i < 100; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
+ }
+ Thread.sleep(5000);
+
+ System.out.println(" Step 4: Polling...");
+ PollResult result = pollUntilComplete(consumer, 100, 100);
+ System.out.println(" Result: " + result);
+
+ if (result.totalRows >= 100) {
+ System.out.println(" Auto-binding works! All " + result.totalRows + " rows received.");
+ } else if (result.totalRows > 0) {
+ System.out.println(
+ " Partial: " + result.totalRows + "/100 rows. First writes may precede binding.");
+ } else {
+ System.out.println(" No data received. Check logs for auto-binding messages.");
+ }
+ assertAtLeast(
+ "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows);
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit)
+ // ======================================================================
+ /**
+ * Tests at-least-once delivery with a mixed commit/no-commit pattern.
+ *
+ * Writes 50 rows. Alternates between:
+ *
+ *
+ * - Even rounds: poll WITHOUT commit → next poll verifies same timestamps → commit
+ *
+ * - Odd rounds: poll and commit directly → next poll should deliver DIFFERENT data
+ *
+ */
+ private static void testRedelivery() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ final int totalRows = 50;
+ System.out.println(" Writing " + totalRows + " rows");
+ try (ISession session = openSession()) {
+ for (int i = 1; i <= totalRows; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
+ }
+ Thread.sleep(3000);
+
+ int totalRowsCommitted = 0;
+ int roundNumber = 0;
+ boolean hasPending = false;
+ List<Long> pendingTimestamps = new ArrayList<>();
+ Set<Long> allCommittedTimestamps = new HashSet<>();
+ int redeliveryCount = 0;
+
+ for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(5000));
+ if (msgs.isEmpty()) {
+ Thread.sleep(1000);
+ continue;
+ }
+
+ for (SubscriptionMessage msg : msgs) {
+ List<Long> currentTimestamps = new ArrayList<>();
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ currentTimestamps.add(ds.next().getTimestamp());
+ }
+ }
+ assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+
+ if (hasPending) {
+ // Re-delivery round: verify EXACT same timestamps
+ assertTrue(
+ "Re-delivery timestamp list mismatch: expected="
+ + pendingTimestamps
+ + ", actual="
+ + currentTimestamps,
+ currentTimestamps.equals(pendingTimestamps));
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ hasPending = false;
+ redeliveryCount++;
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] Re-delivered & committed: timestamps="
+ + currentTimestamps);
+ } else {
+ // New event round
+ if (totalRowsCommitted > 0) {
+ boolean overlap = false;
+ for (Long ts : currentTimestamps) {
+ if (allCommittedTimestamps.contains(ts)) {
+ overlap = true;
+ break;
+ }
+ }
+ assertTrue(
+ "After commit, should receive different data (overlap detected)", !overlap);
+ }
+
+ if (roundNumber % 2 == 0) {
+ pendingTimestamps = new ArrayList<>(currentTimestamps);
+ hasPending = true;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (NOT committed): timestamps="
+ + currentTimestamps);
+ } else {
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (committed directly): timestamps="
+ + currentTimestamps);
+ }
+ }
+ }
+ }
+
+ assertEquals("Should have committed all rows", totalRows, totalRowsCommitted);
+ assertTrue(
+ "Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
+ redeliveryCount > 0);
+
+ // Final poll: should be empty
+ System.out.println(" Final poll: expecting no data");
+ int extraRows = 0;
+ for (int i = 0; i < 3; i++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+ for (SubscriptionMessage msg : msgs) {
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ extraRows++;
+ }
+ }
+ }
+ }
+ assertEquals("After all committed, should receive no more data", 0, extraRows);
+ System.out.println(
+ " At-least-once re-delivery verified: "
+ + totalRows
+ + " rows committed with "
+ + redeliveryCount
+ + " re-delivery rounds");
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic)
+ // ======================================================================
+ /**
+ * Verifies:
+ *
+ *
+ * - Two consumer groups on same topic: each group gets ALL data independently
+ *
+ * - One consumer subscribes to two topics with different path filters: each topic delivers
+ * only matching data
+ *
+ */
+ private static void testMultiEntityIsolation() throws Exception {
+ String database = nextDatabase();
+ String topicName1 = "topic_multi_" + testCounter + "_a";
+ String topicName2 = "topic_multi_" + testCounter + "_b";
+ String consumerGroupId1 = "cg_multi_" + testCounter + "_a";
+ String consumerId1 = "consumer_multi_" + testCounter + "_a";
+ String consumerGroupId2 = "cg_multi_" + testCounter + "_b";
+ String consumerId2 = "consumer_multi_" + testCounter + "_b";
+ SubscriptionTreePullConsumer consumer1 = null;
+ SubscriptionTreePullConsumer consumer2 = null;
+
+ try {
+ // Setup: database with d1 and d2
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ // Topic 1: covers d1 only, Topic 2: covers d2 only
+ createTopic(topicName1, database + ".d1.**");
+ createTopic(topicName2, database + ".d2.**");
+ Thread.sleep(1000);
+
+ // Consumer 1 (group A): subscribes to BOTH topics
+ consumer1 = createConsumer(consumerId1, consumerGroupId1);
+ consumer1.subscribe(topicName1, topicName2);
+ // Consumer 2 (group B): subscribes to BOTH topics
+ consumer2 = createConsumer(consumerId2, consumerGroupId2);
+ consumer2.subscribe(topicName1, topicName2);
+ Thread.sleep(3000);
+
+ // Write 30 rows to d1, 40 rows to d2
+ System.out.println(" Writing 30 rows to d1, 40 rows to d2");
+ try (ISession session = openSession()) {
+ for (int i = 1; i <= 40; i++) {
+ if (i <= 30) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
+ }
+ }
+ Thread.sleep(2000);
+
+ // Part A: Both groups should get 70 rows independently
+ System.out.println(" Part A: Multi-group isolation");
+ System.out.println(" Polling from group 1...");
+ PollResult result1 = pollUntilComplete(consumer1, 70, 80);
+ System.out.println(" Group 1 result: " + result1);
+
+ System.out.println(" Polling from group 2...");
+ PollResult result2 = pollUntilComplete(consumer2, 70, 80);
+ System.out.println(" Group 2 result: " + result2);
+
+ assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows);
+ assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows);
+
+ // Part B: Verify per-topic device isolation
+ if (!result1.rowsPerDevice.isEmpty()) {
+ Integer d1Rows = result1.rowsPerDevice.get(database + ".d1");
+ Integer d2Rows = result1.rowsPerDevice.get(database + ".d2");
+ assertEquals("Expected 30 rows from d1 (topic1)", 30, d1Rows != null ? d1Rows : 0);
+ assertEquals("Expected 40 rows from d2 (topic2)", 40, d2Rows != null ? d2Rows : 0);
+ System.out.println(" Multi-topic isolation verified: d1=" + d1Rows + ", d2=" + d2Rows);
+ }
+ System.out.println(
+ " Multi-group isolation verified: group1="
+ + result1.totalRows
+ + ", group2="
+ + result2.totalRows);
+ } finally {
+ if (consumer1 != null) {
+ try {
+ consumer1.unsubscribe(topicName1, topicName2);
+ } catch (Exception e) {
+ /* ignore */
+ }
+ try {
+ consumer1.close();
+ } catch (Exception e) {
+ /* ignore */
+ }
+ }
+ if (consumer2 != null) {
+ try {
+ consumer2.unsubscribe(topicName1, topicName2);
+ } catch (Exception e) {
+ /* ignore */
+ }
+ try {
+ consumer2.close();
+ } catch (Exception e) {
+ /* ignore */
+ }
+ }
+ dropTopic(topicName1);
+ dropTopic(topicName2);
+ deleteDatabase(database);
+ }
+ }
+
+ // ======================================================================
+ // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix)
+ // ======================================================================
+ /**
+ * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The
+ * pending queue overflow triggers gaps, which should be recovered from WAL.
+ *
+ * Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one
+ * {@code pendingEntries.offer()}. A single {@code session.insertTablet(tablet)} with N rows in
+ * one time partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To
+ * actually overflow, we need 4096+ individual write() calls arriving faster than the
+ * prefetch thread can drain. We achieve this with multiple concurrent writer threads, each
+ * performing individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate.
+ *
+ *
+ * Note: Gap occurrence is inherently timing-dependent (race between writers and the
+ * prefetch drain loop). This test maximizes the probability by using concurrent threads, but
+ * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling
+ * from WAL" messages to confirm the gap path was exercised.
+ *
+ *
+ * Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to
+ * the next prefetch iteration.
+ */
+ private static void testBurstWriteGapRecovery() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Use multiple concurrent writer threads with individual SQL INSERTs.
+ // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer().
+ // With N threads writing concurrently, aggregate rate should exceed drain rate
+ // and overflow the 4096-capacity queue, creating gaps.
+ final int writerThreads = 4;
+ final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096
+ final int totalRows = writerThreads * rowsPerThread;
+ final AtomicInteger errorCount = new AtomicInteger(0);
+ final CountDownLatch startLatch = new CountDownLatch(1);
+ final CountDownLatch doneLatch = new CountDownLatch(writerThreads);
+
+ System.out.println(
+ " Burst writing "
+ + totalRows
+ + " rows via "
+ + writerThreads
+ + " concurrent threads ("
+ + rowsPerThread
+ + " individual SQL INSERTs each)");
+ System.out.println(
+ " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)");
+
+ ExecutorService executor = Executors.newFixedThreadPool(writerThreads);
+ for (int t = 0; t < writerThreads; t++) {
+ final int threadId = t;
+ final int startTs = threadId * rowsPerThread + 1;
+ executor.submit(
+ () -> {
+ try {
+ startLatch.await(); // all threads start at the same time
+ try (ISession session = openSession()) {
+ for (int i = 0; i < rowsPerThread; i++) {
+ int ts = startTs + i;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)",
+ database, ts, (long) ts * 10));
+ }
+ }
+ } catch (Exception e) {
+ System.out.println(" Writer thread " + threadId + " error: " + e.getMessage());
+ errorCount.incrementAndGet();
+ } finally {
+ doneLatch.countDown();
+ }
+ });
+ }
+
+ // Fire all threads simultaneously
+ startLatch.countDown();
+ doneLatch.await();
+ executor.shutdown();
+
+ if (errorCount.get() > 0) {
+ System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors");
+ }
+
+ // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes
+ System.out.println(
+ " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)...");
+ System.out.println(
+ " (Check server logs for 'gap detected' to confirm gap recovery was triggered)");
+ PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true);
+ System.out.println(" Result: " + result);
+
+ assertEquals(
+ "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)",
+ totalRows,
+ result.totalRows);
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ======================================================================
+ // Test 8: Commit After Unsubscribe (NEW — tests H7 fix)
+ // ======================================================================
+ /**
+ * Tests that commit still works correctly after the consumer has unsubscribed (queue has been
+ * torn down). The commit routing should use metadata-based topic config check instead of runtime
+ * queue state.
+ *
+ *
+ * Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue().
+ */
+ private static void testCommitAfterUnsubscribe() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Write data
+ System.out.println(" Writing 50 rows");
+ try (ISession session = openSession()) {
+ for (int i = 1; i <= 50; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
+ }
+ Thread.sleep(2000);
+
+ // Poll WITHOUT commit
+ System.out.println(" Polling WITHOUT commit...");
+ List<SubscriptionMessage> uncommittedMessages = new ArrayList<>();
+ int polledRows = 0;
+ for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+ if (msgs.isEmpty()) {
+ if (polledRows > 0) break;
+ Thread.sleep(500);
+ continue;
+ }
+ for (SubscriptionMessage msg : msgs) {
+ uncommittedMessages.add(msg);
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ polledRows++;
+ }
+ }
+ }
+ }
+ System.out.println(
+ " Polled "
+ + polledRows
+ + " rows, holding "
+ + uncommittedMessages.size()
+ + " uncommitted messages");
+ assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows);
+
+ // Unsubscribe (tears down the consensus queue)
+ System.out.println(" Unsubscribing (queue teardown)...");
+ consumer.unsubscribe(topicName);
+ Thread.sleep(2000);
+
+ // Now commit the previously polled messages — should NOT throw
+ System.out.println(
+ " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe...");
+ boolean commitSucceeded = true;
+ for (SubscriptionMessage msg : uncommittedMessages) {
+ try {
+ consumer.commitSync(msg);
+ } catch (Exception e) {
+ System.out.println(" Commit threw exception: " + e.getMessage());
+ commitSucceeded = false;
+ }
+ }
+
+ // The commit may silently succeed or fail gracefully — the key is no crash
+ System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded);
+ System.out.println(" (Key: no exception crash, routing handled gracefully)");
+ } finally {
+ if (consumer != null) {
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ /* ignore */
+ }
+ }
+ dropTopic(topicName);
+ deleteDatabase(database);
+ }
+ }
+
+ /** Helper: populate one row of an aligned Tablet with all 6 data types. */
+ private static void addAlignedTabletRow(
+ Tablet tablet,
+ int rowIndex,
+ long timestamp,
+ int intVal,
+ long longVal,
+ float floatVal,
+ double doubleVal,
+ boolean boolVal,
+ String textVal) {
+ tablet.addTimestamp(rowIndex, timestamp);
+ tablet.addValue("s_int32", rowIndex, intVal);
+ tablet.addValue("s_int64", rowIndex, longVal);
+ tablet.addValue("s_float", rowIndex, floatVal);
+ tablet.addValue("s_double", rowIndex, doubleVal);
+ tablet.addValue("s_bool", rowIndex, boolVal);
+ tablet.addValue("s_text", rowIndex, new Binary(textVal, TSFileConfig.STRING_CHARSET));
+ }
+}
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
index cb5edd8cd91a3..6b71d5b16f79a 100644
--- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
+++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
@@ -39,6 +39,7 @@
import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq;
import org.apache.iotdb.consensus.exception.ConsensusException;
import org.apache.iotdb.rpc.TSStatusCode;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
import org.apache.tsfile.utils.ReadWriteIOUtils;
@@ -52,6 +53,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
+import java.util.Set;
import java.util.stream.Collectors;
public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure {
@@ -66,6 +68,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP
private AlterConsumerGroupProcedure alterConsumerGroupProcedure;
private List<CreatePipeProcedureV2> createPipeProcedures = new ArrayList<>();
+ private Set<String> consensusTopicNames = new HashSet<>();
+
// TODO: remove this variable later
private final List alterTopicProcedures = new ArrayList<>(); // unused now
@@ -103,15 +107,41 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env)
alterConsumerGroupProcedure =
new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo);
- // Construct CreatePipeProcedureV2s
+ // Construct CreatePipeProcedureV2s (for non-consensus topics)
for (final String topicName : subscribeReq.getTopicNames()) {
+ final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName);
+
+ // Check if this topic should use consensus subscription: mode is live, format is Tablet
+ final String topicMode =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE);
+ final String topicFormat =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE);
+ final boolean isConsensusBasedTopic =
+ TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+ && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+
+ if (isConsensusBasedTopic) {
+ // skip pipe creation
+ consensusTopicNames.add(topicName);
+ LOGGER.info(
+ "CreateSubscriptionProcedure: topic [{}] uses consensus-based subscription "
+ + "(mode={}, format={}), skipping pipe creation",
+ topicName,
+ topicMode,
+ topicFormat);
+ continue;
+ }
+
final String pipeName =
PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId);
if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId)
// even if there existed subscription meta, if there is no corresponding pipe meta, it
// will try to create the pipe
|| !pipeTaskInfo.get().isPipeExisted(pipeName)) {
- final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName);
createPipeProcedures.add(
new CreatePipeProcedureV2(
new TCreatePipeReq()
@@ -177,20 +207,29 @@ protected void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env)
// Push consumer group meta to data nodes
alterConsumerGroupProcedure.executeFromOperateOnDataNodes(env);
- // Push pipe meta to data nodes
- final List<String> pipeNames =
- createPipeProcedures.stream()
- .map(CreatePipeProcedureV2::getPipeName)
- .collect(Collectors.toList());
- final String exceptionMessage =
- AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe(
- null, pushMultiPipeMetaToDataNodes(pipeNames, env));
- if (!exceptionMessage.isEmpty()) {
- // throw exception instead of logging warn, do not rely on metadata synchronization
- throw new SubscriptionException(
- String.format(
- "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.",
- pipeNames, subscribeReq, exceptionMessage));
+ if (!consensusTopicNames.isEmpty()) {
+ LOGGER.info(
+ "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode "
+ + "via consumer group meta push (no pipe creation needed)",
+ consensusTopicNames);
+ }
+
+ // Push pipe meta to data nodes (only for non-consensus pipe-based topics)
+ if (!createPipeProcedures.isEmpty()) {
+ final List<String> pipeNames =
+ createPipeProcedures.stream()
+ .map(CreatePipeProcedureV2::getPipeName)
+ .collect(Collectors.toList());
+ final String exceptionMessage =
+ AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe(
+ null, pushMultiPipeMetaToDataNodes(pipeNames, env));
+ if (!exceptionMessage.isEmpty()) {
+ // throw exception instead of logging warn, do not rely on metadata synchronization
+ throw new SubscriptionException(
+ String.format(
+ "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.",
+ pipeNames, subscribeReq, exceptionMessage));
+ }
}
}
@@ -297,6 +336,12 @@ public void serialize(final DataOutputStream stream) throws IOException {
} else {
ReadWriteIOUtils.write(false, stream);
}
+
+ // Serialize consensus topic names
+ ReadWriteIOUtils.write(consensusTopicNames.size(), stream);
+ for (final String consensusTopicName : consensusTopicNames) {
+ ReadWriteIOUtils.write(consensusTopicName, stream);
+ }
}
@Override
@@ -348,6 +393,14 @@ public void deserialize(final ByteBuffer byteBuffer) {
}
}
}
+
+ // Deserialize consensus topic names
+ if (byteBuffer.hasRemaining()) {
+ size = ReadWriteIOUtils.readInt(byteBuffer);
+ for (int i = 0; i < size; ++i) {
+ consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer));
+ }
+ }
}
@Override
@@ -364,7 +417,8 @@ public boolean equals(final Object o) {
&& getCycles() == that.getCycles()
&& Objects.equals(subscribeReq, that.subscribeReq)
&& Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure)
- && Objects.equals(createPipeProcedures, that.createPipeProcedures);
+ && Objects.equals(createPipeProcedures, that.createPipeProcedures)
+ && Objects.equals(consensusTopicNames, that.consensusTopicNames);
}
@Override
@@ -375,7 +429,8 @@ public int hashCode() {
getCycles(),
subscribeReq,
alterConsumerGroupProcedure,
- createPipeProcedures);
+ createPipeProcedures,
+ consensusTopicNames);
}
@TestOnly
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
index 6741a6c1e2a84..99f8ed649d852 100644
--- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
+++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
@@ -22,6 +22,7 @@
import org.apache.iotdb.common.rpc.thrift.TSStatus;
import org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta;
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta;
+import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta;
import org.apache.iotdb.commons.utils.TestOnly;
import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan;
import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2;
@@ -36,6 +37,7 @@
import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq;
import org.apache.iotdb.consensus.exception.ConsensusException;
import org.apache.iotdb.rpc.TSStatusCode;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
import org.apache.tsfile.utils.ReadWriteIOUtils;
@@ -100,6 +102,31 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env)
for (final String topic : unsubscribeReq.getTopicNames()) {
if (topicsUnsubByGroup.contains(topic)) {
+ // Check if this topic uses consensus-based subscription (same detection as
+ // CreateSubscriptionProcedure). Consensus topics have no pipe to drop.
+ final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic);
+ final String topicMode =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE);
+ final String topicFormat =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE);
+ final boolean isConsensusBasedTopic =
+ TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+ && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+
+ if (isConsensusBasedTopic) {
+ LOGGER.info(
+ "DropSubscriptionProcedure: topic [{}] is consensus-based (mode={}, format={}), "
+ + "skipping pipe removal",
+ topic,
+ topicMode,
+ topicFormat);
+ continue;
+ }
+
// Topic will be subscribed by no consumers in this group
dropPipeProcedures.add(
new DropPipeProcedureV2(
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
index 959191ca2d6d3..8cb168272b295 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
@@ -82,6 +82,7 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiConsumer;
+import java.util.function.Consumer;
import java.util.stream.Collectors;
public class IoTConsensus implements IConsensus {
@@ -98,6 +99,19 @@ public class IoTConsensus implements IConsensus {
private final IoTConsensusRPCService service;
private final RegisterManager registerManager = new RegisterManager();
private IoTConsensusConfig config;
+
+ /**
+ * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used
+ * by the subscription system to auto-bind prefetching queues to new DataRegions.
+ */
+ public static volatile BiConsumer<ConsensusGroupId, IoTConsensusServerImpl> onNewPeerCreated;
+
+ /**
+ * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by
+ * the subscription system to unbind and clean up prefetching queues before the region is removed.
+ */
+ public static volatile Consumer<ConsensusGroupId> onPeerRemoved;
+
private final IClientManager clientManager;
private final IClientManager syncClientManager;
private final ScheduledExecutorService backgroundTaskService;
@@ -299,11 +313,33 @@ public void createLocalPeer(ConsensusGroupId groupId, List<Peer> peers)
if (exist.get()) {
throw new ConsensusGroupAlreadyExistException(groupId);
}
+
+ // Notify subscription system about new peer creation for auto-binding
+ final BiConsumer<ConsensusGroupId, IoTConsensusServerImpl> callback = onNewPeerCreated;
+ if (callback != null) {
+ try {
+ callback.accept(groupId, stateMachineMap.get(groupId));
+ } catch (final Exception e) {
+ logger.warn("onNewPeerCreated callback failed for group {}", groupId, e);
+ }
+ }
}
@Override
public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException {
KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE);
+
+ // Notify subscription system before stopping the peer, so that subscription queues can
+ // properly unregister from the still-alive serverImpl.
+ final Consumer<ConsensusGroupId> removeCallback = onPeerRemoved;
+ if (removeCallback != null) {
+ try {
+ removeCallback.accept(groupId);
+ } catch (final Exception e) {
+ logger.warn("onPeerRemoved callback failed for group {}", groupId, e);
+ }
+ }
+
AtomicBoolean exist = new AtomicBoolean(false);
stateMachineMap.computeIfPresent(
groupId,
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
index 567261efffffa..37222c47d35ff 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
@@ -89,13 +89,16 @@
import java.util.PriorityQueue;
import java.util.TreeSet;
import java.util.UUID;
+import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
+import java.util.function.LongSupplier;
import java.util.regex.Pattern;
import static org.apache.iotdb.commons.utils.FileUtils.humanReadableByteCountSI;
@@ -128,6 +131,14 @@ public class IoTConsensusServerImpl {
IoTConsensusRateLimiter.getInstance();
private IndexedConsensusRequest lastConsensusRequest;
+ // Subscription queues receive IndexedConsensusRequest in real-time from write(),
+ // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush.
+ private final List<BlockingQueue<IndexedConsensusRequest>> subscriptionQueues =
+ new CopyOnWriteArrayList<>();
+ // Suppliers that report each subscription consumer's acknowledged search index.
+ // Used to pin WAL files: entries >= min(suppliers) cannot be deleted.
+ private final List<LongSupplier> subscriptionSyncIndexSuppliers = new CopyOnWriteArrayList<>();
+
public IoTConsensusServerImpl(
String storageDir,
Peer thisNode,
@@ -236,6 +247,44 @@ public TSStatus write(IConsensusRequest request) {
// in one transaction.
synchronized (searchIndex) {
logDispatcher.offer(indexedConsensusRequest);
+ // Deliver to subscription queues for real-time in-memory consumption.
+ // Offer AFTER stateMachine.write() so that InsertNode has inferred types
+ // and properly typed values (same timing as LogDispatcher).
+ final int sqCount = subscriptionQueues.size();
+ if (sqCount > 0) {
+ logger.debug(
+ "write() offering to {} subscription queue(s), "
+ + "group={}, searchIndex={}, requestType={}",
+ sqCount,
+ consensusGroupId,
+ indexedConsensusRequest.getSearchIndex(),
+ indexedConsensusRequest.getRequests().isEmpty()
+ ? "EMPTY"
+ : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName());
+ for (final BlockingQueue<IndexedConsensusRequest> sq : subscriptionQueues) {
+ final boolean offered = sq.offer(indexedConsensusRequest);
+ logger.debug(
+ "offer result={}, queueSize={}, queueRemaining={}",
+ offered,
+ sq.size(),
+ sq.remainingCapacity());
+ if (!offered) {
+ logger.warn(
+ "Subscription queue full, dropped entry searchIndex={}",
+ indexedConsensusRequest.getSearchIndex());
+ }
+ }
+ } else {
+ // Log periodically when no subscription queues are registered
+ if (indexedConsensusRequest.getSearchIndex() % 50 == 0) {
+ logger.debug(
+ "write() no subscription queues registered, "
+ + "group={}, searchIndex={}, this={}",
+ consensusGroupId,
+ indexedConsensusRequest.getSearchIndex(),
+ System.identityHashCode(this));
+ }
+ }
searchIndex.incrementAndGet();
}
// statistic the time of offering request into queue
@@ -243,10 +292,13 @@ public TSStatus write(IConsensusRequest request) {
System.nanoTime() - writeToStateMachineEndTime);
} else {
logger.debug(
- "{}: write operation failed. searchIndex: {}. Code: {}",
+ "write operation FAILED. group={}, searchIndex={}, code={}, "
+ + "subscriptionQueues={}, this={}",
thisNode.getGroupId(),
indexedConsensusRequest.getSearchIndex(),
- result.getCode());
+ result.getCode(),
+ subscriptionQueues.size(),
+ System.identityHashCode(this));
}
// statistic the time of total write process
ioTConsensusServerMetrics.recordConsensusWriteTime(
@@ -757,6 +809,47 @@ public long getSearchIndex() {
return searchIndex.get();
}
+ public ConsensusReqReader getConsensusReqReader() {
+ return consensusReqReader;
+ }
+
+ /**
+ * Registers a subscription pending queue for real-time in-memory data delivery. When {@link
+ * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered
+ * subscription queues, enabling subscription consumers to receive data without waiting for WAL
+ * flush.
+ *
+ * @param queue the blocking queue to receive IndexedConsensusRequest entries
+ * @param syncIndexSupplier supplies the subscription consumer's current acknowledged search
+ * index, used by WAL pinning to prevent deletion of unacknowledged entries
+ */
+ public void registerSubscriptionQueue(
+ final BlockingQueue<IndexedConsensusRequest> queue, final LongSupplier syncIndexSupplier) {
+ subscriptionQueues.add(queue);
+ subscriptionSyncIndexSuppliers.add(syncIndexSupplier);
+ // Immediately re-evaluate the safe delete index to protect WAL for this subscriber
+ checkAndUpdateSafeDeletedSearchIndex();
+ logger.info(
+ "Registered subscription queue for group {}, "
+ + "total subscription queues: {}, currentSearchIndex={}, this={}",
+ consensusGroupId,
+ subscriptionQueues.size(),
+ searchIndex.get(),
+ System.identityHashCode(this));
+ }
+
+ public void unregisterSubscriptionQueue(
+ final BlockingQueue<IndexedConsensusRequest> queue, final LongSupplier syncIndexSupplier) {
+ subscriptionQueues.remove(queue);
+ subscriptionSyncIndexSuppliers.remove(syncIndexSupplier);
+ // Re-evaluate: with fewer subscribers, more WAL may be deletable
+ checkAndUpdateSafeDeletedSearchIndex();
+ logger.info(
+ "Unregistered subscription queue for group {}, remaining subscription queues: {}",
+ consensusGroupId,
+ subscriptionQueues.size());
+ }
+
public long getSyncLag() {
long minSyncIndex = getMinSyncIndex();
return getSearchIndex() - minSyncIndex;
@@ -875,14 +968,29 @@ void checkAndUpdateIndex() {
* If there is only one replica, set it to Long.MAX_VALUE. If there are multiple replicas, get the
* latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner.
*/
- void checkAndUpdateSafeDeletedSearchIndex() {
+ public void checkAndUpdateSafeDeletedSearchIndex() {
if (configuration.isEmpty()) {
logger.error(
"Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time.");
- } else if (configuration.size() == 1) {
+ return;
+ }
+
+ // Compute the minimum search index that subscription consumers still need.
+ // WAL entries at or after this index must be preserved.
+ long minSubscriptionIndex = Long.MAX_VALUE;
+ for (final LongSupplier supplier : subscriptionSyncIndexSuppliers) {
+ minSubscriptionIndex = Math.min(minSubscriptionIndex, supplier.getAsLong());
+ }
+
+ if (configuration.size() == 1 && subscriptionSyncIndexSuppliers.isEmpty()) {
+ // Single replica, no subscription consumers => delete all WAL freely
consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE);
} else {
- consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex());
+ // min(replication progress, subscription progress) — preserve WAL for both
+ final long replicationIndex =
+ configuration.size() > 1 ? getMinFlushedSyncIndex() : Long.MAX_VALUE;
+ consensusReqReader.setSafelyDeletedSearchIndex(
+ Math.min(replicationIndex, minSubscriptionIndex));
}
}
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
index 374691bf38bf1..51704a24c74a5 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
@@ -167,15 +167,16 @@ public synchronized OptionalLong getMinFlushedSyncIndex() {
return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min();
}
- public void checkAndFlushIndex() {
+ public synchronized void checkAndFlushIndex() {
if (!threads.isEmpty()) {
threads.forEach(
thread -> {
IndexController controller = thread.getController();
controller.update(controller.getCurrentIndex(), true);
});
- // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica is 1
- reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex());
+ // Use subscription-aware safe-delete to avoid deleting WAL entries
+ // still needed by subscription consumers.
+ impl.checkAndUpdateSafeDeletedSearchIndex();
}
}
@@ -397,8 +398,9 @@ public void updateSafelyDeletedSearchIndex() {
// indicating that insert nodes whose search index are before this value can be deleted
// safely.
//
- // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9.
- reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex());
+ // Use subscription-aware safe-delete to avoid deleting WAL entries
+ // still needed by subscription consumers.
+ impl.checkAndUpdateSafeDeletedSearchIndex();
// notify
if (impl.unblockWrite()) {
impl.signal();
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
index 510f8559bc147..abf9161962bff 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
@@ -19,7 +19,12 @@
package org.apache.iotdb.db.subscription.agent;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker;
import org.apache.iotdb.db.subscription.broker.SubscriptionBroker;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler;
import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager;
import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask;
@@ -30,6 +35,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -43,7 +50,12 @@ public class SubscriptionBrokerAgent {
private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class);
- private final Map<String, SubscriptionBroker> consumerGroupIdToSubscriptionBroker =
+ /** Pipe-based subscription brokers, one per consumer group. */
+ private final Map<String, SubscriptionBroker> consumerGroupIdToPipeBroker =
+ new ConcurrentHashMap<>();
+
+ /** Consensus-based subscription brokers, one per consumer group. */
+ private final Map<String, ConsensusSubscriptionBroker> consumerGroupIdToConsensusBroker =
new ConcurrentHashMap<>();
private final Cache<Integer> prefetchingQueueCount =
@@ -54,17 +66,54 @@ public class SubscriptionBrokerAgent {
public List<SubscriptionEvent> poll(
final ConsumerConfig consumerConfig, final Set<String> topicNames, final long maxBytes) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final List<SubscriptionEvent> allEvents = new ArrayList<>();
+ long remainingBytes = maxBytes;
+
+ // Poll from pipe-based broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.nonNull(pipeBroker)) {
+ final List<SubscriptionEvent> pipeEvents =
+ pipeBroker.poll(consumerId, topicNames, remainingBytes);
+ allEvents.addAll(pipeEvents);
+ for (final SubscriptionEvent event : pipeEvents) {
+ try {
+ remainingBytes -= event.getCurrentResponseSize();
+ } catch (final IOException ignored) {
+ // best effort
+ }
+ }
+ }
+
+ // Poll from consensus-based broker
+ if (remainingBytes > 0) {
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker)) {
+ LOGGER.debug(
+ "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], "
+ + "topicNames={}, remainingBytes={}",
+ consumerGroupId,
+ topicNames,
+ remainingBytes);
+ allEvents.addAll(consensusBroker.poll(consumerId, topicNames, remainingBytes));
+ } else {
+ LOGGER.debug(
+ "SubscriptionBrokerAgent: no consensus broker for consumer group [{}]",
+ consumerGroupId);
+ }
+ }
+
+ if (allEvents.isEmpty()
+ && Objects.isNull(pipeBroker)
+ && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) {
final String errorMessage =
- String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- // TODO: currently we fetch messages from all topics
- final String consumerId = consumerConfig.getConsumerId();
- return broker.poll(consumerId, topicNames, maxBytes);
+
+ return allEvents;
}
public List<SubscriptionEvent> pollTsFile(
@@ -72,16 +121,18 @@ public List pollTsFile(
final SubscriptionCommitContext commitContext,
final long writingOffset) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // TsFile polling can only be called by pipe-based subscriptions
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
final String errorMessage =
String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ "Subscription: pipe broker bound to consumer group [%s] does not exist",
+ consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
final String consumerId = consumerConfig.getConsumerId();
- return broker.pollTsFile(consumerId, commitContext, writingOffset);
+ return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset);
}
public List<SubscriptionEvent> pollTablets(
@@ -89,16 +140,26 @@ public List pollTablets(
final SubscriptionCommitContext commitContext,
final int offset) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final String topicName = commitContext.getTopicName();
+
+ // Try consensus-based broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.pollTablets(consumerId, commitContext, offset);
+ }
+
+ // Fall back to pipe-based broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
final String errorMessage =
String.format(
"Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- final String consumerId = consumerConfig.getConsumerId();
- return broker.pollTablets(consumerId, commitContext, offset);
+ return pipeBroker.pollTablets(consumerId, commitContext, offset);
}
/**
public List<SubscriptionCommitContext> commit(
final List<SubscriptionCommitContext> commitContexts,
final boolean nack) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final List<SubscriptionCommitContext> allSuccessful = new ArrayList<>();
+
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+
+ if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) {
final String errorMessage =
- String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- final String consumerId = consumerConfig.getConsumerId();
- return broker.commit(consumerId, commitContexts, nack);
+
+ // Partition commit contexts by which broker owns the topic.
+ final List<SubscriptionCommitContext> pipeContexts = new ArrayList<>();
+ final List<SubscriptionCommitContext> consensusContexts = new ArrayList<>();
+ for (final SubscriptionCommitContext ctx : commitContexts) {
+ final String topicName = ctx.getTopicName();
+ if (Objects.nonNull(consensusBroker)
+ && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) {
+ consensusContexts.add(ctx);
+ } else {
+ pipeContexts.add(ctx);
+ }
+ }
+
+ if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) {
+ allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack));
+ }
+ if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) {
+ allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack));
+ }
+
+ return allSuccessful;
}
public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
final String consumerGroupId = commitContext.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String topicName = commitContext.getTopicName();
+
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.isCommitContextOutdated(commitContext);
+ }
+
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
return true;
}
- return broker.isCommitContextOutdated(commitContext);
+ return pipeBroker.isCommitContextOutdated(commitContext);
}
public List<String> fetchTopicNamesToUnsubscribe(
final ConsumerConfig consumerConfig, final Set<String> topicNames) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+
+ // Consensus-based subscription topics are unbounded streams, so they do not trigger
+ // auto-unsubscribe.
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ final Set<String> pipeOnlyTopicNames;
+ if (Objects.nonNull(consensusBroker)) {
+ pipeOnlyTopicNames = new java.util.HashSet<>(topicNames);
+ pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue);
+ } else {
+ pipeOnlyTopicNames = topicNames;
+ }
+
+ if (pipeOnlyTopicNames.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
return Collections.emptyList();
}
- return broker.fetchTopicNamesToUnsubscribe(topicNames);
+ return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames);
}
/////////////////////////////// broker ///////////////////////////////
public boolean isBrokerExist(final String consumerGroupId) {
- return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId);
+ return consumerGroupIdToPipeBroker.containsKey(consumerGroupId)
+ || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId);
}
public void createBrokerIfNotExist(final String consumerGroupId) {
- consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new);
- LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId);
+ consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new);
+ LOGGER.info("Subscription: create pipe broker bound to consumer group [{}]", consumerGroupId);
}
/**
@@ -156,26 +270,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) {
*/
public boolean dropBroker(final String consumerGroupId) {
final AtomicBoolean dropped = new AtomicBoolean(false);
- consumerGroupIdToSubscriptionBroker.compute(
+
+ // Drop pipe broker
+ consumerGroupIdToPipeBroker.compute(
consumerGroupId,
(id, broker) -> {
if (Objects.isNull(broker)) {
+ dropped.set(true);
+ return null;
+ }
+ if (!broker.isEmpty()) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] does not exist",
+ "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping",
consumerGroupId);
- dropped.set(true);
+ return broker;
+ }
+ dropped.set(true);
+ LOGGER.info(
+ "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId);
+ return null;
+ });
+
+ // Drop consensus broker
+ consumerGroupIdToConsensusBroker.compute(
+ consumerGroupId,
+ (id, broker) -> {
+ if (Objects.isNull(broker)) {
return null;
}
if (!broker.isEmpty()) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] is not empty when dropping",
+ "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping",
consumerGroupId);
return broker;
}
dropped.set(true);
- LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId);
- return null; // remove this entry
+ LOGGER.info(
+ "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId);
+ return null;
});
+
return dropped.get();
}
@@ -183,15 +317,14 @@ public boolean dropBroker(final String consumerGroupId) {
public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) {
final String consumerGroupId = subtask.getConsumerGroupId();
- consumerGroupIdToSubscriptionBroker
+ consumerGroupIdToPipeBroker
.compute(
consumerGroupId,
(id, broker) -> {
if (Objects.isNull(broker)) {
LOGGER.info(
- "Subscription: broker bound to consumer group [{}] does not exist, create new for binding prefetching queue",
+ "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue",
consumerGroupId);
- // TODO: consider more robust metadata semantics
return new SubscriptionBroker(consumerGroupId);
}
return broker;
@@ -200,41 +333,119 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) {
prefetchingQueueCount.invalidate();
}
- public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
+ public void bindConsensusPrefetchingQueue(
+ final String consumerGroupId,
+ final String topicName,
+ final String consensusGroupId,
+ final IoTConsensusServerImpl serverImpl,
+ final ConsensusLogToTabletConverter converter,
+ final ConsensusSubscriptionCommitManager commitManager,
+ final long startSearchIndex) {
+ consumerGroupIdToConsensusBroker
+ .compute(
+ consumerGroupId,
+ (id, broker) -> {
+ if (Objects.isNull(broker)) {
+ LOGGER.info(
+ "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue",
+ consumerGroupId);
+ return new ConsensusSubscriptionBroker(consumerGroupId);
+ }
+ return broker;
+ })
+ .bindConsensusPrefetchingQueue(
+ topicName, consensusGroupId, serverImpl, converter, commitManager, startSearchIndex);
+ prefetchingQueueCount.invalidate();
+ }
+
+ public void unbindConsensusPrefetchingQueue(
+ final String consumerGroupId, final String topicName) {
+ final ConsensusSubscriptionBroker broker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
if (Objects.isNull(broker)) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
+ "Subscription: consensus broker bound to consumer group [{}] does not exist",
+ consumerGroupId);
+ return;
+ }
+ broker.unbindConsensusPrefetchingQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ }
+
+ public void unbindByRegion(final String regionId) {
+ int totalClosed = 0;
+ for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) {
+ totalClosed += broker.unbindByRegion(regionId);
+ }
+ if (totalClosed > 0) {
+ prefetchingQueueCount.invalidate();
+ LOGGER.info(
+ "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]",
+ totalClosed,
+ regionId);
+ }
+ }
+
+ public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) {
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
+ LOGGER.warn(
+ "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId);
return;
}
- broker.updateCompletedTopicNames(topicName);
+ pipeBroker.updateCompletedTopicNames(topicName);
}
public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ consensusBroker.removeQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ return;
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return;
}
- broker.unbindPrefetchingQueue(topicName);
+ pipeBroker.unbindPrefetchingQueue(topicName);
prefetchingQueueCount.invalidate();
}
public void removePrefetchingQueue(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ consensusBroker.removeQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ return;
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return;
}
- broker.removePrefetchingQueue(topicName);
+ pipeBroker.removePrefetchingQueue(topicName);
prefetchingQueueCount.invalidate();
}
public boolean executePrefetch(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.executePrefetch(topicName);
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
SubscriptionDataNodeResourceManager.log()
.schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName)
.ifPresent(
@@ -244,17 +455,24 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN
consumerGroupId));
return false;
}
- return broker.executePrefetch(topicName);
+ return pipeBroker.executePrefetch(topicName);
}
public int getPipeEventCount(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.getEventCount(topicName);
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return 0;
}
- return broker.getPipeEventCount(topicName);
+ return pipeBroker.getPipeEventCount(topicName);
}
public int getPrefetchingQueueCount() {
@@ -262,9 +480,15 @@ public int getPrefetchingQueueCount() {
}
private int getPrefetchingQueueCountInternal() {
- return consumerGroupIdToSubscriptionBroker.values().stream()
- .map(SubscriptionBroker::getPrefetchingQueueCount)
- .reduce(0, Integer::sum);
+ int count =
+ consumerGroupIdToPipeBroker.values().stream()
+ .map(SubscriptionBroker::getPrefetchingQueueCount)
+ .reduce(0, Integer::sum);
+ count +=
+ consumerGroupIdToConsensusBroker.values().stream()
+ .map(ConsensusSubscriptionBroker::getQueueCount)
+ .reduce(0, Integer::sum);
+ return count;
}
/////////////////////////////// Cache ///////////////////////////////
@@ -272,14 +496,15 @@ private int getPrefetchingQueueCountInternal() {
/**
 * A simple generic cache that computes and stores a value on demand.
 *
 * <p>Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The
 * {@code get()} method uses a local snapshot of the computed value to avoid double-read
 * reordering. Concurrent recomputation by multiple threads is benign (idempotent supplier).
 *
 * @param <T> the type of the cached value
 */
private static class Cache {
- private T value;
+ private volatile T value;
private volatile boolean valid = false;
private final Supplier supplier;
@@ -304,8 +529,10 @@ private void invalidate() {
*/
private T get() {
if (!valid) {
- value = supplier.get();
+ final T computed = supplier.get();
+ value = computed;
valid = true;
+ return computed;
}
return value;
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
index fee23cf6af4cb..9c54497b6f468 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
@@ -21,6 +21,7 @@
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta;
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler;
import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
@@ -132,11 +133,34 @@ private void handleSingleConsumerGroupMetaChangesInternal(
for (final String topicName : topicsUnsubByGroup) {
SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName);
}
+ // Tear down consensus-based subscriptions for unsubscribed topics
+ if (!topicsUnsubByGroup.isEmpty()) {
+ ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions(
+ consumerGroupId, topicsUnsubByGroup);
+ }
+
+ // Detect newly subscribed topics (present in new meta but not in old meta)
+ final Set newlySubscribedTopics =
+ ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator);
+
+ LOGGER.info(
+ "Subscription: consumer group [{}] meta change detected, "
+ + "topicsUnsubByGroup={}, newlySubscribedTopics={}",
+ consumerGroupId,
+ topicsUnsubByGroup,
+ newlySubscribedTopics);
// TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the
// changes in its fields.
consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId);
consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator);
+
+ // Set up consensus-based subscription for newly subscribed live-mode topics.
+ // This must happen after the meta is updated so that the broker can find the topic config.
+ if (!newlySubscribedTopics.isEmpty()) {
+ ConsensusSubscriptionSetupHandler.handleNewSubscriptions(
+ consumerGroupId, newlySubscribedTopics);
+ }
}
public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges(
@@ -222,4 +246,24 @@ public Set getTopicNamesSubscribedByConsumer(
releaseReadLock();
}
}
+
+ /**
+ * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by
+ * consensus subscription auto-binding when a new DataRegion is created.
+ */
+ public java.util.Map> getAllSubscriptions() {
+ acquireReadLock();
+ try {
+ final java.util.Map> result = new java.util.HashMap<>();
+ for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) {
+ final Set topics = meta.getSubscribedTopicNames();
+ if (!topics.isEmpty()) {
+ result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics));
+ }
+ }
+ return result;
+ } finally {
+ releaseReadLock();
+ }
+ }
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
new file mode 100644
index 0000000000000..1c567965d911b
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker;
+
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager;
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+/**
+ * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance
+ * manages consensus prefetching queues for a single consumer group.
+ */
+public class ConsensusSubscriptionBroker implements ISubscriptionBroker {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class);
+
+ private final String brokerId; // consumer group id
+
+ /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */
+ private final Map> topicNameToConsensusPrefetchingQueues;
+
+ /** Shared commit ID generators per topic. */
+ private final Map topicNameToCommitIdGenerator;
+
+ public ConsensusSubscriptionBroker(final String brokerId) {
+ this.brokerId = brokerId;
+ this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>();
+ this.topicNameToCommitIdGenerator = new ConcurrentHashMap<>();
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return topicNameToConsensusPrefetchingQueues.isEmpty();
+ }
+
+ @Override
+ public boolean hasQueue(final String topicName) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ return Objects.nonNull(queues)
+ && !queues.isEmpty()
+ && queues.stream().anyMatch(q -> !q.isClosed());
+ }
+
+ //////////////////////////// poll ////////////////////////////
+
+ @Override
+ public List poll(
+ final String consumerId, final Set topicNames, final long maxBytes) {
+ LOGGER.debug(
+ "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, "
+ + "queueCount={}, maxBytes={}",
+ brokerId,
+ consumerId,
+ topicNames,
+ topicNameToConsensusPrefetchingQueues.size(),
+ maxBytes);
+
+ final List eventsToPoll = new ArrayList<>();
+ final List eventsToNack = new ArrayList<>();
+ long totalSize = 0;
+
+ for (final String topicName : topicNames) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ continue;
+ }
+
+ // Poll from all region queues for this topic
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+
+ final SubscriptionEvent event = consensusQueue.poll(consumerId);
+ if (Objects.isNull(event)) {
+ continue;
+ }
+
+ final long currentSize;
+ try {
+ currentSize = event.getCurrentResponseSize();
+ } catch (final IOException e) {
+ eventsToNack.add(event);
+ continue;
+ }
+
+ eventsToPoll.add(event);
+ totalSize += currentSize;
+
+ if (totalSize >= maxBytes) {
+ break;
+ }
+ }
+
+ if (totalSize >= maxBytes) {
+ break;
+ }
+ }
+
+ // Nack any events that had errors
+ if (!eventsToNack.isEmpty()) {
+ commit(
+ consumerId,
+ eventsToNack.stream()
+ .map(SubscriptionEvent::getCommitContext)
+ .collect(Collectors.toList()),
+ true);
+ }
+
+ LOGGER.debug(
+ "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}",
+ brokerId,
+ consumerId,
+ eventsToPoll.size(),
+ eventsToNack.size());
+
+ return eventsToPoll;
+ }
+
+ @Override
+ public List pollTablets(
+ final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
+ final String topicName = commitContext.getTopicName();
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ // Try each region queue until one returns a match
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+ final SubscriptionEvent event = consensusQueue.pollTablets(consumerId, commitContext, offset);
+ if (Objects.nonNull(event)) {
+ return Collections.singletonList(event);
+ }
+ }
+ return Collections.emptyList();
+ }
+
+ //////////////////////////// commit ////////////////////////////
+
+ @Override
+ public List commit(
+ final String consumerId,
+ final List commitContexts,
+ final boolean nack) {
+ final List successfulCommitContexts = new ArrayList<>();
+ for (final SubscriptionCommitContext commitContext : commitContexts) {
+ final String topicName = commitContext.getTopicName();
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ LOGGER.warn(
+ "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit",
+ brokerId,
+ topicName);
+ continue;
+ }
+
+ // Try each region queue for this topic (the event belongs to exactly one region).
+ // Don't warn per-queue miss — only warn if NO queue handled the commit.
+ boolean handled = false;
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+ final boolean success;
+ if (!nack) {
+ success = consensusQueue.ackSilent(consumerId, commitContext);
+ } else {
+ success = consensusQueue.nackSilent(consumerId, commitContext);
+ }
+ if (success) {
+ successfulCommitContexts.add(commitContext);
+ handled = true;
+ break; // committed in the right queue, no need to try others
+ }
+ }
+ if (!handled) {
+ LOGGER.warn(
+ "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]",
+ brokerId,
+ commitContext,
+ queues.size(),
+ topicName);
+ }
+ }
+ return successfulCommitContexts;
+ }
+
+ @Override
+ public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
+ final String topicName = commitContext.getTopicName();
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return true;
+ }
+ // Any queue that considers it NOT outdated means it's not outdated
+ for (final ConsensusPrefetchingQueue q : queues) {
+ if (!q.isCommitContextOutdated(commitContext)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ //////////////////////////// prefetching ////////////////////////////
+
+ @Override
+ public boolean executePrefetch(final String topicName) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return false;
+ }
+ boolean anyPrefetched = false;
+ for (final ConsensusPrefetchingQueue q : queues) {
+ if (!q.isClosed() && q.executePrefetch()) {
+ anyPrefetched = true;
+ }
+ }
+ return anyPrefetched;
+ }
+
+ @Override
+ public int getEventCount(final String topicName) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues)) {
+ return 0;
+ }
+ return queues.stream().mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount).sum();
+ }
+
+ @Override
+ public int getQueueCount() {
+ return topicNameToConsensusPrefetchingQueues.size();
+ }
+
+ //////////////////////////// queue management ////////////////////////////
+
+ public void bindConsensusPrefetchingQueue(
+ final String topicName,
+ final String consensusGroupId,
+ final IoTConsensusServerImpl serverImpl,
+ final ConsensusLogToTabletConverter converter,
+ final ConsensusSubscriptionCommitManager commitManager,
+ final long startSearchIndex) {
+ // Get or create the list of queues for this topic
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.computeIfAbsent(
+ topicName, k -> new CopyOnWriteArrayList<>());
+
+ // Check for duplicate region binding
+ for (final ConsensusPrefetchingQueue existing : queues) {
+ if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) {
+ LOGGER.info(
+ "Subscription: consensus prefetching queue for topic [{}], region [{}] "
+ + "in consumer group [{}] already exists, skipping",
+ topicName,
+ consensusGroupId,
+ brokerId);
+ return;
+ }
+ }
+
+ // Get or create the shared commit ID generator for this topic
+ final AtomicLong sharedCommitIdGenerator =
+ topicNameToCommitIdGenerator.computeIfAbsent(topicName, k -> new AtomicLong(0));
+
+ final ConsensusPrefetchingQueue consensusQueue =
+ new ConsensusPrefetchingQueue(
+ brokerId,
+ topicName,
+ consensusGroupId,
+ serverImpl,
+ converter,
+ commitManager,
+ startSearchIndex,
+ sharedCommitIdGenerator);
+ queues.add(consensusQueue);
+ LOGGER.info(
+ "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], "
+ + "consensusGroupId={}, startSearchIndex={}, totalRegionQueues={}",
+ topicName,
+ brokerId,
+ consensusGroupId,
+ startSearchIndex,
+ queues.size());
+ }
+
+ public void unbindConsensusPrefetchingQueue(final String topicName) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ LOGGER.warn(
+ "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist",
+ topicName,
+ brokerId);
+ return;
+ }
+
+ for (final ConsensusPrefetchingQueue q : queues) {
+ q.close();
+ }
+ topicNameToConsensusPrefetchingQueues.remove(topicName);
+ topicNameToCommitIdGenerator.remove(topicName);
+ LOGGER.info(
+ "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]",
+ queues.size(),
+ topicName,
+ brokerId);
+ }
+
+ public int unbindByRegion(final String regionId) {
+ int closedCount = 0;
+ for (final Map.Entry> entry :
+ topicNameToConsensusPrefetchingQueues.entrySet()) {
+ final List queues = entry.getValue();
+ final Iterator iterator = queues.iterator();
+ while (iterator.hasNext()) {
+ final ConsensusPrefetchingQueue q = iterator.next();
+ if (regionId.equals(q.getConsensusGroupId())) {
+ q.close();
+ iterator.remove();
+ closedCount++;
+ LOGGER.info(
+ "Subscription: closed consensus prefetching queue for topic [{}] region [{}] "
+ + "in consumer group [{}] due to region removal",
+ entry.getKey(),
+ regionId,
+ brokerId);
+ }
+ }
+ }
+ return closedCount;
+ }
+
+ @Override
+ public void removeQueue(final String topicName) {
+ final List queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.nonNull(queues) && !queues.isEmpty()) {
+ LOGGER.info(
+ "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing",
+ topicName,
+ brokerId);
+ unbindConsensusPrefetchingQueue(topicName);
+ }
+ }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java
new file mode 100644
index 0000000000000..aaa88a5f84777
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker;
+
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+
+import java.util.List;
+import java.util.Set;
+
+public interface ISubscriptionBroker {
+
+ List poll(String consumerId, Set topicNames, long maxBytes);
+
+ List pollTablets(
+ String consumerId, SubscriptionCommitContext commitContext, int offset);
+
+ List commit(
+ String consumerId, List commitContexts, boolean nack);
+
+ boolean isCommitContextOutdated(SubscriptionCommitContext commitContext);
+
+ boolean executePrefetch(String topicName);
+
+ int getEventCount(String topicName);
+
+ int getQueueCount();
+
+ void removeQueue(String topicName);
+
+ boolean isEmpty();
+
+ boolean hasQueue(String topicName);
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
index cc03f7261419b..8f9d05324e905 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
@@ -56,7 +56,7 @@
import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID;
-public class SubscriptionBroker {
+public class SubscriptionBroker implements ISubscriptionBroker {
private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBroker.class);
@@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) {
.build(consumerId -> new SubscriptionStates());
}
+ @Override
public boolean isEmpty() {
return topicNameToPrefetchingQueue.isEmpty()
&& completedTopicNames.isEmpty()
&& topicNameToCommitIdGenerator.isEmpty();
}
+ @Override
+ public boolean hasQueue(final String topicName) {
+ final SubscriptionPrefetchingQueue prefetchingQueue =
+ topicNameToPrefetchingQueue.get(topicName);
+ return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed();
+ }
+
//////////////////////////// provided for SubscriptionBrokerAgent ////////////////////////////
+ @Override
public List poll(
final String consumerId, final Set topicNames, final long maxBytes) {
final List eventsToPoll = new ArrayList<>();
@@ -112,9 +121,10 @@ public List poll(
// Iterate over each sorted topic name and poll the corresponding events
int remainingTopicSize = sortedTopicNames.size();
for (final String topicName : sortedTopicNames) {
+ remainingTopicSize -= 1;
+ // Check pipe-based queue
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
- remainingTopicSize -= 1;
// Recheck
if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) {
@@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames(
final List eventsToPoll /* output parameter */) {
final Set candidateTopicNames = new HashSet<>();
for (final String topicName : topicNames) {
+ // Check pipe-based queue
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
// If there is no prefetching queue for the topic, check if it's completed
@@ -271,6 +282,7 @@ public List pollTsFile(
return Collections.emptyList();
}
+ @Override
public List pollTablets(
final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
final String topicName = commitContext.getTopicName();
@@ -312,6 +324,7 @@ public List pollTablets(
/**
* @return list of successful commit contexts
*/
+ @Override
public List commit(
final String consumerId,
final List commitContexts,
@@ -348,6 +361,7 @@ public List commit(
return successfulCommitContexts;
}
+ @Override
public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
final String topicName = commitContext.getTopicName();
final SubscriptionPrefetchingQueue prefetchingQueue =
@@ -457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) {
brokerId);
}
+ @Override
+ public void removeQueue(final String topicName) {
+ removePrefetchingQueue(topicName);
+ }
+
public void removePrefetchingQueue(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) {
topicNameToCommitIdGenerator.remove(topicName);
}
+ @Override
public boolean executePrefetch(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) {
: prefetchingQueue.executePrefetchV2();
}
+ @Override
+ public int getEventCount(final String topicName) {
+ return getPipeEventCount(topicName);
+ }
+
public int getPipeEventCount(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -525,6 +550,11 @@ public int getPipeEventCount(final String topicName) {
return prefetchingQueue.getPipeEventCount();
}
+ @Override
+ public int getQueueCount() {
+ return getPrefetchingQueueCount();
+ }
+
// Number of topics with a bound prefetching queue in this pipe-based broker.
public int getPrefetchingQueueCount() {
return topicNameToPrefetchingQueue.size();
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
new file mode 100644
index 0000000000000..9d3f2b283c556
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
@@ -0,0 +1,542 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode;
+
+import org.apache.tsfile.enums.TSDataType;
+import org.apache.tsfile.file.metadata.IDeviceID;
+import org.apache.tsfile.utils.Binary;
+import org.apache.tsfile.utils.BitMap;
+import org.apache.tsfile.write.record.Tablet;
+import org.apache.tsfile.write.schema.IMeasurementSchema;
+import org.apache.tsfile.write.schema.MeasurementSchema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. */
+public class ConsensusLogToTabletConverter {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class);
+
+ private final TreePattern treePattern;
+ private final TablePattern tablePattern;
+
+ /**
+ * The actual database name of the DataRegion this converter processes (table-model format without
+ * "root." prefix). Null for tree-model topics.
+ */
+ private final String databaseName;
+
+ public ConsensusLogToTabletConverter(
+ final TreePattern treePattern, final TablePattern tablePattern, final String databaseName) {
+ this.treePattern = treePattern;
+ this.tablePattern = tablePattern;
+ this.databaseName = databaseName;
+ }
+
+ public String getDatabaseName() {
+ return databaseName;
+ }
+
+ static String safeDeviceIdForLog(final InsertNode node) {
+ try {
+ final Object deviceId = node.getDeviceID();
+ return deviceId != null ? deviceId.toString() : "null";
+ } catch (final Exception e) {
+ return "N/A(" + node.getType() + ")";
+ }
+ }
+
+ public List<Tablet> convert(final InsertNode insertNode) {
+ if (Objects.isNull(insertNode)) {
+ return Collections.emptyList();
+ }
+
+ final PlanNodeType nodeType = insertNode.getType();
+ if (nodeType == null) {
+ LOGGER.warn("InsertNode type is null, skipping conversion");
+ return Collections.emptyList();
+ }
+
+ LOGGER.debug(
+ "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}",
+ nodeType,
+ safeDeviceIdForLog(insertNode));
+
+ switch (nodeType) {
+ case INSERT_ROW:
+ return convertInsertRowNode((InsertRowNode) insertNode);
+ case INSERT_TABLET:
+ return convertInsertTabletNode((InsertTabletNode) insertNode);
+ case INSERT_ROWS:
+ return convertInsertRowsNode((InsertRowsNode) insertNode);
+ case INSERT_ROWS_OF_ONE_DEVICE:
+ return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode);
+ case INSERT_MULTI_TABLET:
+ return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode);
+ case RELATIONAL_INSERT_ROW:
+ return convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode);
+ case RELATIONAL_INSERT_TABLET:
+ return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode);
+ case RELATIONAL_INSERT_ROWS:
+ return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode);
+ default:
+ LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType);
+ return Collections.emptyList();
+ }
+ }
+
+ // ======================== Tree Model Conversion ========================
+
+ private List<Tablet> convertInsertRowNode(final InsertRowNode node) {
+ final IDeviceID deviceId = node.getDeviceID();
+
+ // Device-level path filtering
+ if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) {
+ return Collections.emptyList();
+ }
+
+ final long time = node.getTime();
+
+ // Determine which columns match the pattern
+ final String[] measurements = node.getMeasurements();
+ final TSDataType[] dataTypes = node.getDataTypes();
+ final Object[] values = node.getValues();
+ final List<Integer> matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements);
+
+ if (matchedColumnIndices.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ // Build Tablet with matched columns
+ final int columnCount = matchedColumnIndices.size();
+ final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+ for (final int colIdx : matchedColumnIndices) {
+ schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx]));
+ }
+
+ final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */);
+ tablet.addTimestamp(0, time);
+
+ for (int i = 0; i < columnCount; i++) {
+ final int originalColIdx = matchedColumnIndices.get(i);
+ final Object value = values[originalColIdx];
+ if (value == null) {
+ if (tablet.getBitMaps() == null) {
+ tablet.initBitMaps();
+ }
+ tablet.getBitMaps()[i].mark(0);
+ } else {
+ addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value);
+ }
+ }
+ tablet.setRowSize(1);
+
+ return Collections.singletonList(tablet);
+ }
+
+ private List<Tablet> convertInsertTabletNode(final InsertTabletNode node) {
+ final IDeviceID deviceId = node.getDeviceID();
+
+ // Device-level path filtering
+ if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) {
+ return Collections.emptyList();
+ }
+
+ final String[] measurements = node.getMeasurements();
+ final TSDataType[] dataTypes = node.getDataTypes();
+ final long[] times = node.getTimes();
+ final Object[] columns = node.getColumns();
+ final BitMap[] bitMaps = node.getBitMaps();
+ final int rowCount = node.getRowCount();
+
+ // Column filtering
+ final List<Integer> matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements);
+ if (matchedColumnIndices.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ final int columnCount = matchedColumnIndices.size();
+ final boolean allColumnsMatch = (columnCount == measurements.length);
+
+ // Build schemas (always needed)
+ final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+ for (final int colIdx : matchedColumnIndices) {
+ schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx]));
+ }
+
+ // Build column arrays and bitmaps using bulk copy
+ final long[] newTimes = Arrays.copyOf(times, rowCount);
+ final Object[] newColumns = new Object[columnCount];
+ final BitMap[] newBitMaps = new BitMap[columnCount];
+
+ for (int i = 0; i < columnCount; i++) {
+ final int originalColIdx = allColumnsMatch ? i : matchedColumnIndices.get(i);
+ newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount);
+ if (bitMaps != null && bitMaps[originalColIdx] != null) {
+ newBitMaps[i] = new BitMap(rowCount);
+ BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount);
+ }
+ }
+
+ final Tablet tablet =
+ new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount);
+
+ return Collections.singletonList(tablet);
+ }
+
+ private List<Tablet> convertInsertRowsNode(final InsertRowsNode node) {
+ final List<Tablet> tablets = new ArrayList<>();
+ for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+ // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden,
+ // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode
+ // children. Dispatch correctly by checking the actual child type.
+ if (rowNode instanceof RelationalInsertRowNode) {
+ tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode));
+ } else {
+ tablets.addAll(convertInsertRowNode(rowNode));
+ }
+ }
+ return tablets;
+ }
+
+ private List<Tablet> convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) {
+ final List<Tablet> tablets = new ArrayList<>();
+ for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+ tablets.addAll(convertInsertRowNode(rowNode));
+ }
+ return tablets;
+ }
+
+ private List<Tablet> convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) {
+ final List<Tablet> tablets = new ArrayList<>();
+ for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) {
+ tablets.addAll(convertInsertTabletNode(tabletNode));
+ }
+ return tablets;
+ }
+
+ // ======================== Table Model Conversion ========================
+
+ private List<Tablet> convertRelationalInsertRowNode(final RelationalInsertRowNode node) {
+ final String tableName = node.getTableName();
+
+ // Table-level pattern filtering
+ if (tablePattern != null) {
+ if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) {
+ return Collections.emptyList();
+ }
+ if (tableName != null && !tablePattern.matchesTable(tableName)) {
+ return Collections.emptyList();
+ }
+ }
+
+ final long time = node.getTime();
+ final String[] measurements = node.getMeasurements();
+ final TSDataType[] dataTypes = node.getDataTypes();
+ final Object[] values = node.getValues();
+
+ final int columnCount = measurements.length;
+ final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+ for (int i = 0; i < columnCount; i++) {
+ schemas.add(new MeasurementSchema(measurements[i], dataTypes[i]));
+ }
+
+ final Tablet tablet = new Tablet(tableName != null ? tableName : "", schemas, 1);
+ tablet.addTimestamp(0, time);
+
+ for (int i = 0; i < columnCount; i++) {
+ final Object value = values[i];
+ if (value == null) {
+ if (tablet.getBitMaps() == null) {
+ tablet.initBitMaps();
+ }
+ tablet.getBitMaps()[i].mark(0);
+ } else {
+ addValueToTablet(tablet, 0, i, dataTypes[i], value);
+ }
+ }
+ tablet.setRowSize(1);
+
+ return Collections.singletonList(tablet);
+ }
+
+ private List<Tablet> convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) {
+ final String tableName = node.getTableName();
+
+ // Table-level pattern filtering
+ if (tablePattern != null) {
+ if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) {
+ return Collections.emptyList();
+ }
+ if (tableName != null && !tablePattern.matchesTable(tableName)) {
+ return Collections.emptyList();
+ }
+ }
+
+ final String[] measurements = node.getMeasurements();
+ final TSDataType[] dataTypes = node.getDataTypes();
+ final long[] times = node.getTimes();
+ final Object[] columns = node.getColumns();
+ final BitMap[] bitMaps = node.getBitMaps();
+ final int rowCount = node.getRowCount();
+
+ final int columnCount = measurements.length;
+ final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+ for (int i = 0; i < columnCount; i++) {
+ schemas.add(new MeasurementSchema(measurements[i], dataTypes[i]));
+ }
+
+ // Build column arrays and bitmaps using bulk copy
+ final long[] newTimes = Arrays.copyOf(times, rowCount);
+ final Object[] newColumns = new Object[columnCount];
+ final BitMap[] newBitMaps = new BitMap[columnCount];
+
+ for (int colIdx = 0; colIdx < columnCount; colIdx++) {
+ newColumns[colIdx] = copyColumnArray(dataTypes[colIdx], columns[colIdx], rowCount);
+ if (bitMaps != null && bitMaps[colIdx] != null) {
+ newBitMaps[colIdx] = new BitMap(rowCount);
+ BitMap.copyOfRange(bitMaps[colIdx], 0, newBitMaps[colIdx], 0, rowCount);
+ }
+ }
+
+ final Tablet tablet =
+ new Tablet(
+ tableName != null ? tableName : "",
+ schemas,
+ newTimes,
+ newColumns,
+ newBitMaps,
+ rowCount);
+
+ return Collections.singletonList(tablet);
+ }
+
+ private List<Tablet> convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) {
+ final List<Tablet> tablets = new ArrayList<>();
+ for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+ tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode));
+ }
+ return tablets;
+ }
+
+ // ======================== Helper Methods ========================
+
+ /**
+ * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all
+ * column indices are returned.
+ */
+ private List<Integer> getMatchedTreeColumnIndices(
+ final IDeviceID deviceId, final String[] measurements) {
+ if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) {
+ // All columns match
+ final List<Integer> allIndices = new ArrayList<>(measurements.length);
+ for (int i = 0; i < measurements.length; i++) {
+ if (measurements[i] != null) {
+ allIndices.add(i);
+ }
+ }
+ return allIndices;
+ }
+
+ final List<Integer> matchedIndices = new ArrayList<>();
+ for (int i = 0; i < measurements.length; i++) {
+ if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) {
+ matchedIndices.add(i);
+ }
+ }
+ return matchedIndices;
+ }
+
+ /**
+ * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type
+ * containing the first {@code rowCount} elements.
+ */
+ private Object copyColumnArray(
+ final TSDataType dataType, final Object sourceColumn, final int rowCount) {
+ switch (dataType) {
+ case BOOLEAN:
+ {
+ final boolean[] src = (boolean[]) sourceColumn;
+ final boolean[] dst = new boolean[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case INT32:
+ case DATE:
+ {
+ final int[] src = (int[]) sourceColumn;
+ final int[] dst = new int[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case INT64:
+ case TIMESTAMP:
+ {
+ final long[] src = (long[]) sourceColumn;
+ final long[] dst = new long[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case FLOAT:
+ {
+ final float[] src = (float[]) sourceColumn;
+ final float[] dst = new float[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case DOUBLE:
+ {
+ final double[] src = (double[]) sourceColumn;
+ final double[] dst = new double[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case TEXT:
+ case BLOB:
+ case STRING:
+ {
+ final Binary[] src = (Binary[]) sourceColumn;
+ final Binary[] dst = new Binary[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ default:
+ LOGGER.warn("Unsupported data type for bulk copy: {}", dataType);
+ return sourceColumn;
+ }
+ }
+
+ /**
+ * Adds a single value to the tablet at the specified position.
+ *
+ * IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which
+ * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly
+ * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call
+ * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is
+ * NOT null.
+ */
+ private void addValueToTablet(
+ final Tablet tablet,
+ final int rowIndex,
+ final int columnIndex,
+ final TSDataType dataType,
+ final Object value) {
+ switch (dataType) {
+ case BOOLEAN:
+ ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value;
+ break;
+ case INT32:
+ case DATE:
+ ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value;
+ break;
+ case INT64:
+ case TIMESTAMP:
+ ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value;
+ break;
+ case FLOAT:
+ ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value;
+ break;
+ case DOUBLE:
+ ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value;
+ break;
+ case TEXT:
+ case BLOB:
+ case STRING:
+ ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value;
+ break;
+ default:
+ LOGGER.warn("Unsupported data type: {}", dataType);
+ return;
+ }
+ // Unmark the bitmap position to indicate this value is NOT null.
+ // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null.
+ final BitMap[] bitMaps = tablet.getBitMaps();
+ if (bitMaps != null && bitMaps[columnIndex] != null) {
+ bitMaps[columnIndex].unmark(rowIndex);
+ }
+ }
+
+ /** Copies a single column value from the source column array to the tablet. */
+ private void copyColumnValue(
+ final Tablet tablet,
+ final int targetRowIndex,
+ final int targetColumnIndex,
+ final TSDataType dataType,
+ final Object sourceColumn,
+ final int sourceRowIndex) {
+ switch (dataType) {
+ case BOOLEAN:
+ ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((boolean[]) sourceColumn)[sourceRowIndex];
+ break;
+ case INT32:
+ case DATE:
+ ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((int[]) sourceColumn)[sourceRowIndex];
+ break;
+ case INT64:
+ case TIMESTAMP:
+ ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((long[]) sourceColumn)[sourceRowIndex];
+ break;
+ case FLOAT:
+ ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((float[]) sourceColumn)[sourceRowIndex];
+ break;
+ case DOUBLE:
+ ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((double[]) sourceColumn)[sourceRowIndex];
+ break;
+ case TEXT:
+ case BLOB:
+ case STRING:
+ ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+ ((Binary[]) sourceColumn)[sourceRowIndex];
+ break;
+ default:
+ LOGGER.warn("Unsupported data type for copy: {}", dataType);
+ return;
+ }
+ // Unmark the bitmap position to indicate this value is NOT null.
+ final BitMap[] bitMaps = tablet.getBitMaps();
+ if (bitMaps != null && bitMaps[targetColumnIndex] != null) {
+ bitMaps[targetColumnIndex].unmark(targetRowIndex);
+ }
+ }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
new file mode 100644
index 0000000000000..8b5c2cf25a8e5
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
@@ -0,0 +1,1271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.subscription.config.SubscriptionConfig;
+import org.apache.iotdb.consensus.common.request.IConsensusRequest;
+import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest;
+import org.apache.iotdb.consensus.common.request.IoTConsensusRequest;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.consensus.iot.log.ConsensusReqReader;
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode;
+import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry;
+import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode;
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType;
+import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload;
+
+import org.apache.tsfile.utils.Pair;
+import org.apache.tsfile.write.record.Tablet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.PriorityBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.LongSupplier;
+
+import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID;
+
+/**
+ * A prefetching queue that reads data from IoTConsensus using a hybrid approach:
+ *
+ *
+ * - In-memory pending queue: Registered with {@link IoTConsensusServerImpl}, receives
+ * {@link IndexedConsensusRequest} in real-time from the write path (same mechanism as
+ * LogDispatcher). This avoids waiting for WAL flush to disk.
+ *
+ * - WAL fallback: Uses {@link ConsensusReqReader.ReqIterator} to read from WAL files for
+ * gap-filling (pending queue overflow) or catch-up scenarios.
+ *
+ * - WAL pinning: Supplies the earliest outstanding (uncommitted) search index to {@link
+ * IoTConsensusServerImpl}, preventing WAL deletion of entries not yet consumed by the
+ * subscription.
+ *
+ *
+ * A background prefetch thread continuously drains the pending queue, converts InsertNode
+ * entries to Tablets via {@link ConsensusLogToTabletConverter}, and enqueues {@link
+ * SubscriptionEvent} objects into the prefetchingQueue for consumer polling.
+ *
+ *
+ * This design mirrors LogDispatcher's dual-path (pendingEntries + WAL reader) but targets
+ * subscription delivery instead of replication.
+ *
+ *
+ * Thread safety: Uses a fair {@link ReentrantReadWriteLock} to ensure mutual exclusion between
+ * cleanup and other operations (poll, ack, nack), consistent with the existing prefetching queue
+ * design.
+ */
+public class ConsensusPrefetchingQueue {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class);
+
+ private final String brokerId; // consumer group id
+ private final String topicName;
+ private final String consensusGroupId;
+
+ private final IoTConsensusServerImpl serverImpl;
+
+ private final ConsensusReqReader consensusReqReader;
+
+ private volatile ConsensusReqReader.ReqIterator reqIterator;
+
+ /**
+ * In-memory pending queue registered with {@link IoTConsensusServerImpl#write}. Receives
+ * IndexedConsensusRequest in real-time without waiting for WAL flush. Capacity is bounded to
+ * apply back-pressure; overflows are filled from WAL.
+ */
+ private final BlockingQueue<IndexedConsensusRequest> pendingEntries;
+
+ private static final int PENDING_QUEUE_CAPACITY = 4096;
+
+ private final ConsensusLogToTabletConverter converter;
+
+ private final ConsensusSubscriptionCommitManager commitManager;
+
+ /**
+ * Cached LongSupplier instance for WAL pinning registration. Must be the SAME object reference
+ * for both registerSubscriptionQueue and unregisterSubscriptionQueue, because
+ * CopyOnWriteArrayList.remove() uses equals() which defaults to reference equality for lambdas.
+ * Using this::method would create a new lambda instance each time, causing remove() to fail and
+ * WAL to be pinned indefinitely.
+ */
+ private final LongSupplier walPinSupplier;
+
+ /** Commit ID generator, monotonically increasing within this queue's lifetime. */
+ private final AtomicLong commitIdGenerator;
+
+ /** Records the initial commit ID for outdated event detection. */
+ private final long initialCommitId;
+
+ private final AtomicLong nextExpectedSearchIndex;
+
+ private final PriorityBlockingQueue<SubscriptionEvent> prefetchingQueue;
+
+ /**
+ * Tracks in-flight events that have been polled but not yet committed. Key: (consumerId,
+ * commitContext) -> event.
+ */
+ private final Map<Pair<String, SubscriptionCommitContext>, SubscriptionEvent> inFlightEvents;
+
+ /**
+ * Tracks outstanding (uncommitted) events for WAL pinning. Maps commitId to the startSearchIndex
+ * of that event batch. The earliest entry's value is supplied to IoTConsensusServerImpl to pin
+ * WAL files from deletion.
+ */
+ private final ConcurrentSkipListMap<Long, Long> outstandingCommitIdToStartIndex;
+
+ private static final int MAX_TABLETS_PER_EVENT = 64;
+
+ private static final int MAX_WAL_ENTRIES_PER_PREFETCH = 128;
+
+ private static final int MAX_PREFETCHING_QUEUE_SIZE = 256;
+
+ private static final long WAL_RETENTION_WARN_THRESHOLD = 100_000;
+
+ /** Counter of WAL gap entries that could not be filled (data loss). */
+ private final AtomicLong walGapSkippedEntries = new AtomicLong(0);
+
+ private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
+
+ private volatile boolean isClosed = false;
+
+ /**
+ * Background thread that drains pendingEntries and fills prefetchingQueue. TODO: manage thread
+ * count
+ */
+ private final Thread prefetchThread;
+
+ public ConsensusPrefetchingQueue(
+ final String brokerId,
+ final String topicName,
+ final String consensusGroupId,
+ final IoTConsensusServerImpl serverImpl,
+ final ConsensusLogToTabletConverter converter,
+ final ConsensusSubscriptionCommitManager commitManager,
+ final long startSearchIndex,
+ final AtomicLong sharedCommitIdGenerator) {
+ this.brokerId = brokerId;
+ this.topicName = topicName;
+ this.consensusGroupId = consensusGroupId;
+ this.serverImpl = serverImpl;
+ this.consensusReqReader = serverImpl.getConsensusReqReader();
+ this.converter = converter;
+ this.commitManager = commitManager;
+
+ this.commitIdGenerator = sharedCommitIdGenerator;
+ this.initialCommitId = commitIdGenerator.get();
+ this.nextExpectedSearchIndex = new AtomicLong(startSearchIndex);
+ this.reqIterator = consensusReqReader.getReqIterator(startSearchIndex);
+
+ this.prefetchingQueue = new PriorityBlockingQueue<>();
+ this.inFlightEvents = new ConcurrentHashMap<>();
+ this.outstandingCommitIdToStartIndex = new ConcurrentSkipListMap<>();
+
+ // Create and register the in-memory pending queue with IoTConsensusServerImpl.
+ // IMPORTANT: walPinSupplier is stored as a field (not a method reference) to ensure the
+ // same object reference is used for both register and unregister.
+ this.pendingEntries = new ArrayBlockingQueue<>(PENDING_QUEUE_CAPACITY);
+ this.walPinSupplier = this::getEarliestOutstandingSearchIndex;
+ serverImpl.registerSubscriptionQueue(pendingEntries, walPinSupplier);
+
+ // Start background prefetch thread
+ this.prefetchThread =
+ new Thread(this::prefetchLoop, "ConsensusPrefetch-" + brokerId + "-" + topicName);
+ this.prefetchThread.setDaemon(true);
+ this.prefetchThread.start();
+
+ LOGGER.info(
+ "ConsensusPrefetchingQueue created: brokerId={}, topicName={}, consensusGroupId={}, "
+ + "startSearchIndex={}",
+ brokerId,
+ topicName,
+ consensusGroupId,
+ startSearchIndex);
+ }
+
+ /**
+ * Returns the earliest outstanding (uncommitted) search index for WAL pinning. If there are no
+ * outstanding events, returns the next expected search index (nothing to pin beyond what we've
+ * already processed). Also monitors WAL retention gap for slow consumer detection.
+ */
+ private long getEarliestOutstandingSearchIndex() {
+ final Map.Entry<Long, Long> first = outstandingCommitIdToStartIndex.firstEntry();
+ if (first != null) {
+ final long earliestIndex = first.getValue();
+ // WAL retention health check: warn if outstanding gap grows too large
+ final long currentIndex = nextExpectedSearchIndex.get();
+ final long retentionGap = currentIndex - earliestIndex;
+ if (retentionGap > WAL_RETENTION_WARN_THRESHOLD) {
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: WAL retention gap is {} entries "
+ + "(earliest outstanding={}, current={}). "
+ + "A slow or stalled consumer is pinning WAL files and may cause disk exhaustion. "
+ + "Consider committing events or increasing consumer throughput.",
+ this,
+ retentionGap,
+ earliestIndex,
+ currentIndex);
+ }
+ return earliestIndex;
+ }
+ return nextExpectedSearchIndex.get();
+ }
+
+ // ======================== Lock Operations ========================
+
+ private void acquireReadLock() {
+ lock.readLock().lock();
+ }
+
+ private void releaseReadLock() {
+ lock.readLock().unlock();
+ }
+
+ private void acquireWriteLock() {
+ lock.writeLock().lock();
+ }
+
+ private void releaseWriteLock() {
+ lock.writeLock().unlock();
+ }
+
+ // ======================== Poll ========================
+
+ public SubscriptionEvent poll(final String consumerId) {
+ acquireReadLock();
+ try {
+ return isClosed ? null : pollInternal(consumerId);
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ /**
+  * Serves one uncommitted event from the prefetching queue to {@code consumerId}.
+  *
+  * <p>Before serving, any uncommitted in-flight events previously delivered to this consumer are
+  * nacked and re-queued (at-least-once re-delivery). Events violating queue invariants (already
+  * committed, or not pollable) are dropped/nacked with a warning. Returns {@code null} when no
+  * event becomes available within the configured blocking time.
+  */
+ private SubscriptionEvent pollInternal(final String consumerId) {
+   // Recycle any uncommitted in-flight events for this consumer before serving new data.
+   final int recycled = recycleInFlightEventsForConsumer(consumerId);
+   if (recycled > 0) {
+     LOGGER.debug(
+         "ConsensusPrefetchingQueue {}: recycled {} uncommitted in-flight events for "
+             + "consumer {} back to prefetching queue",
+         this,
+         recycled,
+         consumerId);
+   }
+
+   final long size = prefetchingQueue.size();
+   if (size == 0) {
+     LOGGER.debug(
+         "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, "
+             + "pendingEntriesSize={}, nextExpected={}, isClosed={}, threadAlive={}",
+         this,
+         consumerId,
+         pendingEntries.size(),
+         nextExpectedSearchIndex.get(),
+         isClosed,
+         prefetchThread.isAlive());
+     return null;
+   }
+
+   LOGGER.debug(
+       "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}",
+       this,
+       size,
+       consumerId);
+   long count = 0;
+
+   SubscriptionEvent event;
+   try {
+     while (count++ < size
+         && Objects.nonNull(
+             event =
+                 prefetchingQueue.poll(
+                     SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(),
+                     TimeUnit.MILLISECONDS))) {
+       if (event.isCommitted()) {
+         LOGGER.warn(
+             "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it",
+             this,
+             event);
+         continue;
+       }
+
+       if (!event.pollable()) {
+         LOGGER.warn(
+             "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it",
+             this,
+             event);
+         event.nack();
+         continue;
+       }
+
+       // Record poll metadata BEFORE publishing the event into inFlightEvents: once the event is
+       // visible in the concurrent map, recyclers and ack/nack paths may observe it, so its
+       // timestamp and consumer id must already be set. (Previously the consumer id was recorded
+       // after publication, leaving a brief window with a stale consumer id.)
+       event.recordLastPolledTimestamp();
+       event.recordLastPolledConsumerId(consumerId);
+       inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event);
+       return event;
+     }
+   } catch (final InterruptedException e) {
+     Thread.currentThread().interrupt();
+     LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e);
+   }
+
+   return null;
+ }
+
+ /**
+  * Fetches the in-flight event previously delivered to {@code consumerId} under the given commit
+  * context (used for paged tablet fetches). When no matching in-flight event exists, returns an
+  * "outdated" error event if the commit context predates the current reboot/commit-id epoch,
+  * otherwise a generic error event.
+  *
+  * <p>NOTE(review): the {@code offset} parameter is not used here — the whole in-flight event is
+  * returned regardless of offset. Confirm whether paging is handled by the caller/payload.
+  */
+ public SubscriptionEvent pollTablets(
+ final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
+ acquireReadLock();
+ try {
+ if (isClosed) {
+ return null;
+ }
+ final SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext));
+ if (Objects.isNull(event)) {
+ if (isCommitContextOutdated(commitContext)) {
+ return generateOutdatedErrorResponse();
+ }
+ return generateErrorResponse(
+ String.format(
+ "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s",
+ this, consumerId, commitContext));
+ }
+ return event;
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ // ======================== Background Prefetch ========================
+
+ public boolean executePrefetch() {
+ acquireReadLock();
+ try {
+ if (isClosed) {
+ return false;
+ }
+ // Recycle pollable events from inFlightEvents back to prefetchingQueue
+ recycleInFlightEvents();
+ return !prefetchingQueue.isEmpty();
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ // Max time to block waiting for the first pending entry before falling back to WAL catch-up.
+ private static final long PENDING_DRAIN_TIMEOUT_MS = 200;
+
+ // Max time to wait for WAL entries to become readable when filling a detected gap.
+ private static final long WAL_WAIT_TIMEOUT_SECONDS = 2;
+
+ /**
+ * Background prefetch loop. Continuously drains from pendingEntries (in-memory, real-time),
+ * detects gaps and fills from WAL reader, converts to Tablets, and enqueues SubscriptionEvents.
+ *
+ * <p>Error policy: a per-iteration Throwable is logged and the loop retried after a short sleep,
+ * except VirtualMachineError (e.g. OutOfMemoryError), which closes the queue and stops the
+ * thread. The outer try/catch is a last-resort guard so no throwable kills the thread unlogged.
+ */
+ private void prefetchLoop() {
+ LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread started", this);
+ try {
+ while (!isClosed && !Thread.currentThread().isInterrupted()) {
+ try {
+ // Back-pressure: wait if prefetchingQueue is full
+ if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) {
+ Thread.sleep(50);
+ continue;
+ }
+
+ // Try to drain from pending entries (in-memory, fast path)
+ final List batch = new ArrayList<>();
+ // Block briefly for first entry
+ final IndexedConsensusRequest first =
+ pendingEntries.poll(PENDING_DRAIN_TIMEOUT_MS, TimeUnit.MILLISECONDS);
+ if (first != null) {
+ batch.add(first);
+ // Drain more non-blocking, bounded by the per-iteration prefetch cap
+ int drained = 0;
+ IndexedConsensusRequest next;
+ while (drained < MAX_WAL_ENTRIES_PER_PREFETCH - 1
+ && (next = pendingEntries.poll()) != null) {
+ batch.add(next);
+ drained++;
+ }
+ }
+
+ if (!batch.isEmpty()) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, "
+ + "first searchIndex={}, last searchIndex={}, nextExpected={}, "
+ + "prefetchingQueueSize={}",
+ this,
+ batch.size(),
+ batch.get(0).getSearchIndex(),
+ batch.get(batch.size() - 1).getSearchIndex(),
+ nextExpectedSearchIndex.get(),
+ prefetchingQueue.size());
+ processBatchFromPending(batch);
+ } else {
+ // Pending queue was empty - try catch-up from WAL for any gaps
+ // (entries may have been dropped due to pending queue overflow)
+ tryCatchUpFromWAL();
+ }
+ } catch (final InterruptedException e) {
+ Thread.currentThread().interrupt();
+ break;
+ } catch (final Throwable t) {
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: CRITICAL error in prefetch loop "
+ + "(type={}, message={})",
+ this,
+ t.getClass().getName(),
+ t.getMessage(),
+ t);
+ if (t instanceof VirtualMachineError) {
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: caught VirtualMachineError, stopping thread", this);
+ markClosed();
+ break;
+ }
+ // Transient failure: back off briefly and keep the loop alive.
+ try {
+ Thread.sleep(100);
+ } catch (final InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+ }
+ } catch (final Throwable fatal) {
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: FATAL uncaught throwable escaped prefetch loop "
+ + "(type={}, message={})",
+ this,
+ fatal.getClass().getName(),
+ fatal.getMessage(),
+ fatal);
+ }
+ LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread stopped", this);
+ }
+
+ /**
+  * Converts a batch of pending consensus requests into tablets and enqueues them as
+  * SubscriptionEvents.
+  *
+  * <p>Gaps relative to {@code nextExpectedSearchIndex} (entries dropped from the bounded pending
+  * queue) are filled from WAL via {@link #fillGapFromWAL}; if a gap cannot yet be fully filled,
+  * the remainder of the batch is deferred to the next prefetch iteration. Tablets are flushed
+  * into an event whenever {@code MAX_TABLETS_PER_EVENT} is reached, and once more at the end.
+  */
+ private void processBatchFromPending(final List batch) {
+   final List batchedTablets = new ArrayList<>();
+   long batchStartSearchIndex = nextExpectedSearchIndex.get();
+   long batchEndSearchIndex = batchStartSearchIndex;
+   int processedCount = 0;
+   int skippedCount = 0;
+   int nullDeserCount = 0;
+   int emptyConvertCount = 0;
+   // Cumulative tablet count for this whole batch (including gap fills and flushed sub-batches).
+   // batchedTablets.size() alone under-reports once a sub-batch has been flushed and cleared.
+   int totalTabletsCreated = 0;
+
+   for (final IndexedConsensusRequest request : batch) {
+     final long searchIndex = request.getSearchIndex();
+
+     // Detect gap: if searchIndex > nextExpected, entries were dropped from pending queue.
+     // Fill the gap from WAL.
+     final long expected = nextExpectedSearchIndex.get();
+     if (searchIndex > expected) {
+       LOGGER.debug(
+           "ConsensusPrefetchingQueue {}: gap detected, expected={}, got={}. "
+               + "Filling {} entries from WAL.",
+           this,
+           expected,
+           searchIndex,
+           searchIndex - expected);
+       final int sizeBeforeGapFill = batchedTablets.size();
+       final long gapMaxIndex = fillGapFromWAL(expected, searchIndex, batchedTablets);
+       totalTabletsCreated += batchedTablets.size() - sizeBeforeGapFill;
+       if (gapMaxIndex > batchEndSearchIndex) {
+         batchEndSearchIndex = gapMaxIndex;
+       }
+
+       // If gap was not fully filled (e.g., WAL timeout), do NOT skip the gap.
+       // Break and defer remaining entries to the next prefetch loop iteration.
+       // WAL pin ensures the missing entries won't be deleted.
+       if (nextExpectedSearchIndex.get() < searchIndex) {
+         LOGGER.warn(
+             "ConsensusPrefetchingQueue {}: gap [{}, {}) not fully filled (reached {}). "
+                 + "Deferring remaining batch to next prefetch iteration.",
+             this,
+             expected,
+             searchIndex,
+             nextExpectedSearchIndex.get());
+         break;
+       }
+     }
+
+     if (searchIndex < nextExpectedSearchIndex.get()) {
+       // Already processed (e.g., gap fill covered this entry), skip
+       skippedCount++;
+       continue;
+     }
+
+     // Process this entry
+     final InsertNode insertNode = deserializeToInsertNode(request);
+     if (insertNode != null) {
+       final List tablets = converter.convert(insertNode);
+       if (!tablets.isEmpty()) {
+         batchedTablets.addAll(tablets);
+         totalTabletsCreated += tablets.size();
+         batchEndSearchIndex = searchIndex;
+         processedCount++;
+       } else {
+         emptyConvertCount++;
+         LOGGER.debug(
+             "ConsensusPrefetchingQueue {}: converter returned empty tablets for "
+                 + "searchIndex={}, insertNodeType={}, deviceId={}",
+             this,
+             searchIndex,
+             insertNode.getType(),
+             ConsensusLogToTabletConverter.safeDeviceIdForLog(insertNode));
+       }
+     } else {
+       nullDeserCount++;
+       LOGGER.warn(
+           "ConsensusPrefetchingQueue {}: deserializeToInsertNode returned null for "
+               + "searchIndex={}, requestType={}",
+           this,
+           searchIndex,
+           request.getRequests().isEmpty()
+               ? "EMPTY"
+               : request.getRequests().get(0).getClass().getSimpleName());
+     }
+     nextExpectedSearchIndex.set(searchIndex + 1);
+
+     // Flush batch if large enough
+     if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) {
+       createAndEnqueueEvent(
+           new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex);
+       batchedTablets.clear();
+       // Reset start index for the next sub-batch so that
+       // outstandingCommitIdToStartIndex records the correct WAL pin position
+       batchStartSearchIndex = nextExpectedSearchIndex.get();
+     }
+   }
+
+   // Update WAL reader position to stay in sync
+   syncReqIteratorPosition();
+
+   // Flush remaining tablets
+   if (!batchedTablets.isEmpty()) {
+     createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex);
+   }
+
+   LOGGER.debug(
+       "ConsensusPrefetchingQueue {}: batch processing complete, "
+           + "batchSize={}, processed={}, skipped={}, nullDeser={}, emptyConvert={}, "
+           + "tabletsCreated={}, nextExpected={}, prefetchQueueSize={}",
+       this,
+       batch.size(),
+       processedCount,
+       skippedCount,
+       nullDeserCount,
+       emptyConvertCount,
+       // Fixed: previously logged batchedTablets.size(), which is stale after the final flush
+       totalTabletsCreated,
+       nextExpectedSearchIndex.get(),
+       prefetchingQueue.size());
+ }
+
+ /**
+  * Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected
+  * between nextExpectedSearchIndex and an incoming entry's searchIndex.
+  *
+  * <p>If the WAL reader cannot yet serve the gap (entries still in the in-memory buffer), waits
+  * briefly for the iterator to become ready and drains again. If the gap still cannot be filled
+  * (WAL truncated/deleted), skips ahead to avoid blocking consumption indefinitely — this loses
+  * the skipped range, which is counted in {@code walGapSkippedEntries}.
+  *
+  * @return the maximum searchIndex processed during gap filling, or -1 if no entries processed
+  */
+ private long fillGapFromWAL(
+     final long fromIndex, final long toIndex, final List batchedTablets) {
+   // Re-position WAL reader to the gap start
+   reqIterator = consensusReqReader.getReqIterator(fromIndex);
+   long maxProcessedIndex = drainGapEntries(toIndex, batchedTablets, -1);
+
+   // If WAL doesn't have the gap entries yet (still in memory buffer), wait briefly and retry
+   if (nextExpectedSearchIndex.get() < toIndex) {
+     try {
+       reqIterator.waitForNextReady(WAL_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+       maxProcessedIndex = drainGapEntries(toIndex, batchedTablets, maxProcessedIndex);
+     } catch (final InterruptedException e) {
+       Thread.currentThread().interrupt();
+     } catch (final TimeoutException e) {
+       LOGGER.debug(
+           "ConsensusPrefetchingQueue {}: timeout waiting for WAL gap fill [{}, {})",
+           this,
+           nextExpectedSearchIndex.get(),
+           toIndex);
+     }
+   }
+
+   // If the gap still cannot be fully filled (WAL truncated/deleted), skip ahead to avoid
+   // blocking consumption indefinitely. This results in data loss for the skipped range.
+   if (nextExpectedSearchIndex.get() < toIndex) {
+     final long skipped = toIndex - nextExpectedSearchIndex.get();
+     walGapSkippedEntries.addAndGet(skipped);
+     LOGGER.error(
+         "ConsensusPrefetchingQueue {}: WAL gap [{}, {}) cannot be filled - {} entries lost. "
+             + "Total skipped entries so far: {}. This indicates WAL truncation or deletion.",
+         this,
+         nextExpectedSearchIndex.get(),
+         toIndex,
+         skipped,
+         walGapSkippedEntries.get());
+     nextExpectedSearchIndex.set(toIndex);
+   }
+
+   return maxProcessedIndex;
+ }
+
+ /**
+  * Drains entries from {@code reqIterator} up to (exclusive) {@code toIndex}, converting each to
+  * tablets and advancing {@code nextExpectedSearchIndex}. Stops on the first read error so a bad
+  * WAL entry cannot propagate out of the gap-fill path (previously the post-wait drain lacked
+  * this per-entry guard).
+  *
+  * @return the maximum searchIndex processed, carried forward from {@code maxProcessedIndex}
+  */
+ private long drainGapEntries(
+     final long toIndex, final List batchedTablets, long maxProcessedIndex) {
+   while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) {
+     try {
+       final IndexedConsensusRequest walEntry = reqIterator.next();
+       final long walIndex = walEntry.getSearchIndex();
+       if (walIndex < nextExpectedSearchIndex.get()) {
+         continue; // already processed
+       }
+
+       final InsertNode insertNode = deserializeToInsertNode(walEntry);
+       if (insertNode != null) {
+         final List tablets = converter.convert(insertNode);
+         batchedTablets.addAll(tablets);
+       }
+       nextExpectedSearchIndex.set(walIndex + 1);
+       if (walIndex > maxProcessedIndex) {
+         maxProcessedIndex = walIndex;
+       }
+     } catch (final Exception e) {
+       LOGGER.warn(
+           "ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}",
+           this,
+           nextExpectedSearchIndex.get(),
+           e);
+       break;
+     }
+   }
+   return maxProcessedIndex;
+ }
+
+ /**
+ * Try catch-up from WAL when the pending queue was empty. This handles cold-start or scenarios
+ * where the subscription started after data was already written.
+ *
+ * <p>Reads at most {@code MAX_WAL_ENTRIES_PER_PREFETCH} entries per call and stops early when
+ * the prefetching queue reaches {@code MAX_PREFETCHING_QUEUE_SIZE} (back-pressure).
+ */
+ private void tryCatchUpFromWAL() {
+ // Re-position WAL reader
+ syncReqIteratorPosition();
+
+ if (!reqIterator.hasNext()) {
+ // The WAL iterator excludes the current-writing WAL file for concurrency safety.
+ // If entries exist in WAL but are all in the current file (e.g., after pending queue
+ // overflow), we need to trigger a WAL file roll to make them readable.
+ final long currentWALIndex = consensusReqReader.getCurrentSearchIndex();
+ if (nextExpectedSearchIndex.get() <= currentWALIndex
+ && consensusReqReader instanceof WALNode) {
+ LOGGER.info(
+ "ConsensusPrefetchingQueue {}: subscription behind (at {} vs WAL {}), "
+ + "triggering WAL file roll to make entries readable",
+ this,
+ nextExpectedSearchIndex.get(),
+ currentWALIndex);
+ // NOTE(review): this may roll a WAL file on every idle catch-up attempt while behind —
+ // confirm the roll is rate-limited or cheap enough upstream.
+ ((WALNode) consensusReqReader).rollWALFile();
+ syncReqIteratorPosition();
+ }
+ if (!reqIterator.hasNext()) {
+ return;
+ }
+ }
+
+ final List batchedTablets = new ArrayList<>();
+ long batchStartSearchIndex = nextExpectedSearchIndex.get();
+ long batchEndSearchIndex = batchStartSearchIndex;
+ int entriesRead = 0;
+
+ while (entriesRead < MAX_WAL_ENTRIES_PER_PREFETCH
+ && reqIterator.hasNext()
+ && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) {
+ try {
+ final IndexedConsensusRequest walEntry = reqIterator.next();
+ final long walIndex = walEntry.getSearchIndex();
+ entriesRead++;
+
+ // Skip entries that were already processed (iterator may overlap our position)
+ if (walIndex < nextExpectedSearchIndex.get()) {
+ continue;
+ }
+
+ final InsertNode insertNode = deserializeToInsertNode(walEntry);
+ if (insertNode != null) {
+ final List tablets = converter.convert(insertNode);
+ if (!tablets.isEmpty()) {
+ batchedTablets.addAll(tablets);
+ batchEndSearchIndex = walIndex;
+ }
+ }
+ nextExpectedSearchIndex.set(walIndex + 1);
+
+ // Flush into an event once enough tablets have accumulated
+ if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) {
+ createAndEnqueueEvent(
+ new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex);
+ batchedTablets.clear();
+ // Reset start index for the next sub-batch
+ batchStartSearchIndex = nextExpectedSearchIndex.get();
+ }
+ } catch (final Exception e) {
+ LOGGER.warn("ConsensusPrefetchingQueue {}: error reading WAL for catch-up", this, e);
+ break;
+ }
+ }
+
+ // Flush whatever remains as a final event
+ if (!batchedTablets.isEmpty()) {
+ createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex);
+ }
+
+ if (entriesRead > 0) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: WAL catch-up read {} entries, "
+ + "nextExpectedSearchIndex={}",
+ this,
+ entriesRead,
+ nextExpectedSearchIndex.get());
+ }
+ }
+
+ /**
+ * Re-positions the WAL reader to the current nextExpectedSearchIndex. Called before reading from
+ * WAL to ensure the iterator is in sync with tracking position.
+ */
+ private void syncReqIteratorPosition() {
+ reqIterator = consensusReqReader.getReqIterator(nextExpectedSearchIndex.get());
+ }
+
+ /**
+ * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an
+ * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers),
+ * and a single logical write may be split across multiple fragments (SearchNode). This method
+ * handles both cases.
+ *
+ * <p>The deserialization follows the same pattern as {@code
+ * DataRegionStateMachine.grabPlanNode()}.
+ *
+ * @return the merged InsertNode, or {@code null} if the request holds no SearchNode-derived
+ *     insert data or every fragment failed to deserialize
+ */
+ private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) {
+ final List searchNodes = new ArrayList<>();
+ PlanNode nonSearchNode = null;
+
+ for (final IConsensusRequest req : indexedRequest.getRequests()) {
+ PlanNode planNode;
+ try {
+ if (req instanceof IoTConsensusRequest) {
+ // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer)
+ planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer());
+ } else if (req instanceof InsertNode) {
+ // In-memory entries (not yet flushed to WAL file) may already be PlanNode
+ planNode = (PlanNode) req;
+ } else {
+ // ByteBufferConsensusRequest or unknown
+ planNode = PlanNodeType.deserialize(req.serializeToByteBuffer());
+ }
+ } catch (final Exception e) {
+ // Best-effort: a single bad fragment is skipped; remaining fragments may still merge.
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest "
+ + "(type={}) in searchIndex={}: {}",
+ this,
+ req.getClass().getSimpleName(),
+ indexedRequest.getSearchIndex(),
+ e.getMessage(),
+ e);
+ continue;
+ }
+
+ if (planNode instanceof SearchNode) {
+ ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex());
+ searchNodes.add((SearchNode) planNode);
+ } else {
+ // NOTE(review): non-SearchNode plan nodes are only logged and dropped below — this
+ // assumes every subscribable insert is SearchNode-derived; confirm InsertNode extends
+ // SearchNode in this codebase so no insert data can land here.
+ nonSearchNode = planNode;
+ }
+ }
+
+ // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode)
+ if (!searchNodes.isEmpty()) {
+ final PlanNode merged = searchNodes.get(0).merge(searchNodes);
+ if (merged instanceof InsertNode) {
+ final InsertNode mergedInsert = (InsertNode) merged;
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, "
+ + "type={}, deviceId={}, searchNodeCount={}",
+ this,
+ indexedRequest.getSearchIndex(),
+ mergedInsert.getType(),
+ ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert),
+ searchNodes.size());
+
+ return mergedInsert;
+ }
+ }
+
+ if (nonSearchNode != null) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}",
+ this,
+ indexedRequest.getSearchIndex(),
+ nonSearchNode.getClass().getSimpleName());
+ }
+
+ return null;
+ }
+
+ /**
+  * Wraps the given tablets into a single SubscriptionEvent covering WAL range
+  * [startSearchIndex, endSearchIndex] and enqueues it for delivery.
+  *
+  * <p>Ordering matters: the commitId→searchIndex mapping and the outstanding (WAL-pin) entry are
+  * recorded BEFORE the event becomes visible in prefetchingQueue, so a consumer can never commit
+  * an event whose bookkeeping is missing.
+  */
+ private void createAndEnqueueEvent(
+ final List tablets, final long startSearchIndex, final long endSearchIndex) {
+ if (tablets.isEmpty()) {
+ return;
+ }
+
+ final long commitId = commitIdGenerator.getAndIncrement();
+
+ // Record the mapping from commitId to the end searchIndex
+ // so that when the client commits, we know which WAL position has been consumed
+ commitManager.recordCommitMapping(
+ brokerId, topicName, consensusGroupId, commitId, endSearchIndex);
+
+ // Track outstanding event for WAL pinning
+ outstandingCommitIdToStartIndex.put(commitId, startSearchIndex);
+
+ final SubscriptionCommitContext commitContext =
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ commitId);
+
+ // nextOffset <= 0 means all tablets delivered in single batch
+ // -tablets.size() indicates total count
+ // Use Map> constructor with actual database name for table model;
+ final TabletsPayload payload =
+ new TabletsPayload(
+ Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size());
+
+ final SubscriptionEvent event =
+ new SubscriptionEvent(
+ SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext);
+
+ prefetchingQueue.add(event);
+
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, "
+ + "searchIndex range [{}, {}], commitId={}, prefetchQueueSize={}",
+ this,
+ tablets.size(),
+ startSearchIndex,
+ endSearchIndex,
+ commitId,
+ prefetchingQueue.size());
+ }
+
+ // ======================== Commit (Ack/Nack) ========================
+
+ /**
+  * Acknowledges (commits) an in-flight event for the given consumer; returns {@code false}
+  * without side effects when the queue has been closed.
+  */
+ public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) {
+   acquireReadLock();
+   try {
+     if (isClosed) {
+       return false;
+     }
+     return ackInternal(consumerId, commitContext);
+   } finally {
+     releaseReadLock();
+   }
+ }
+
+ /**
+  * Atomically acks the in-flight event keyed by (consumerId, commitContext): marks it committed,
+  * cleans it up (cleanUp(false)), and removes it from inFlightEvents via compute(). On success,
+  * the commit is forwarded to commitManager and the WAL-pin entry for its commitId is released.
+  *
+  * @return true iff the event existed, was not yet committed, and was committed by this call
+  */
+ private boolean ackInternal(
+ final String consumerId, final SubscriptionCommitContext commitContext) {
+ final AtomicBoolean acked = new AtomicBoolean(false);
+ final long commitId = commitContext.getCommitId();
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack",
+ this,
+ commitContext);
+ return null;
+ }
+
+ // Duplicate ack: already committed — just release resources and drop the mapping.
+ if (ev.isCommitted()) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext);
+ ev.cleanUp(false);
+ return null;
+ }
+
+ ev.ack();
+ ev.recordCommittedTimestamp();
+ acked.set(true);
+
+ ev.cleanUp(false);
+ return null;
+ });
+
+ // Side effects performed outside the compute() lambda to keep the map operation short.
+ if (acked.get()) {
+ commitManager.commit(brokerId, topicName, consensusGroupId, commitId);
+ outstandingCommitIdToStartIndex.remove(commitId);
+ }
+
+ return acked.get();
+ }
+
+ /**
+  * Negatively acknowledges an in-flight event, returning it to the queue for re-delivery;
+  * returns {@code false} without side effects when the queue has been closed.
+  */
+ public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) {
+   acquireReadLock();
+   try {
+     if (isClosed) {
+       return false;
+     }
+     return nackInternal(consumerId, commitContext);
+   } finally {
+     releaseReadLock();
+   }
+ }
+
+ /**
+ * Silent version of ack: returns false without logging if the commit context is not found. Used
+ * in multi-region iteration where only one queue owns the event.
+ *
+ * <p>Same commit path as {@link #ackInternal}, minus the warning logs for missing or
+ * already-committed events.
+ */
+ public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) {
+ acquireReadLock();
+ try {
+ if (isClosed) {
+ return false;
+ }
+ final AtomicBoolean acked = new AtomicBoolean(false);
+ final long commitId = commitContext.getCommitId();
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ // Duplicate ack: release resources, drop mapping, report false.
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ ev.ack();
+ ev.recordCommittedTimestamp();
+ acked.set(true);
+ ev.cleanUp(false);
+ return null;
+ });
+ if (acked.get()) {
+ commitManager.commit(brokerId, topicName, consensusGroupId, commitId);
+ outstandingCommitIdToStartIndex.remove(commitId);
+ }
+ return acked.get();
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ /**
+  * Silent version of nack: returns false without logging if the commit context is not found. Used
+  * in multi-region iteration where only one queue owns the event.
+  *
+  * <p>Mirrors {@link #ackSilent}: an already-committed event is cleaned up and dropped instead of
+  * being re-enqueued, preserving the invariant that the prefetching queue never holds committed
+  * events (pollInternal would otherwise drop it as a "broken invariant").
+  */
+ public boolean nackSilent(
+     final String consumerId, final SubscriptionCommitContext commitContext) {
+   acquireReadLock();
+   try {
+     if (isClosed) {
+       return false;
+     }
+     final AtomicBoolean nacked = new AtomicBoolean(false);
+     inFlightEvents.compute(
+         new Pair<>(consumerId, commitContext),
+         (key, ev) -> {
+           if (Objects.isNull(ev)) {
+             return null;
+           }
+           // Consistent with ackSilent: never re-enqueue an already-committed event.
+           if (ev.isCommitted()) {
+             ev.cleanUp(false);
+             return null;
+           }
+           ev.nack();
+           nacked.set(true);
+           prefetchingQueue.add(ev);
+           return null;
+         });
+     return nacked.get();
+   } finally {
+     releaseReadLock();
+   }
+ }
+
+ /**
+  * Atomically nacks the in-flight event keyed by (consumerId, commitContext) and returns it to
+  * the prefetching queue for re-delivery. Logs a warning when the commit context is unknown.
+  * An already-committed event is cleaned up and dropped rather than re-enqueued, preserving the
+  * invariant that the prefetching queue never holds committed events.
+  *
+  * @return true iff an uncommitted event was re-enqueued
+  */
+ private boolean nackInternal(
+     final String consumerId, final SubscriptionCommitContext commitContext) {
+   final AtomicBoolean nacked = new AtomicBoolean(false);
+   inFlightEvents.compute(
+       new Pair<>(consumerId, commitContext),
+       (key, ev) -> {
+         if (Objects.isNull(ev)) {
+           LOGGER.warn(
+               "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack",
+               this,
+               commitContext);
+           return null;
+         }
+
+         // Consistent with ackInternal/ackSilent: a committed event must not re-enter the queue.
+         if (ev.isCommitted()) {
+           LOGGER.warn(
+               "ConsensusPrefetchingQueue {}: event {} already committed, dropping instead of nack",
+               this,
+               commitContext);
+           ev.cleanUp(false);
+           return null;
+         }
+
+         ev.nack();
+         nacked.set(true);
+         prefetchingQueue.add(ev);
+         return null;
+       });
+
+   return nacked.get();
+ }
+
+ // ======================== Recycle ========================
+
+ /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue. */
+ private void recycleInFlightEvents() {
+ // Iterate over a snapshot of the keys; each entry is then handled atomically via compute()
+ // so a concurrent ack/nack cannot interleave with the recycle decision.
+ for (final Pair key :
+ new ArrayList<>(inFlightEvents.keySet())) {
+ inFlightEvents.compute(
+ key,
+ (k, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ // Committed leftovers are released rather than recycled.
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ if (ev.pollable()) {
+ ev.nack();
+ prefetchingQueue.add(ev);
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue",
+ this,
+ ev);
+ return null;
+ }
+ // Still in flight and not yet timed out — keep it.
+ return ev;
+ });
+ }
+ }
+
+ /**
+ * Maximum number of nack cycles before an in-flight event is kept in place rather than
+ * re-enqueued. Prevents infinite re-delivery loops when a consumer repeatedly polls without
+ * committing. Beyond this threshold, the event stays in inFlightEvents and will eventually be
+ * recycled by the timeout-based {@link #recycleInFlightEvents()} when it becomes pollable.
+ */
+ private static final long MAX_CONSUMER_RECYCLE_NACK_COUNT = 10;
+
+ /**
+ * Recycles uncommitted in-flight events belonging to the given consumer back to the prefetching
+ * queue. This provides at-least-once delivery: when a consumer polls again without committing,
+ * the previously delivered events are nacked and re-queued for re-delivery.
+ *
+ * Events that have been nacked more than {@link #MAX_CONSUMER_RECYCLE_NACK_COUNT} times are
+ * left in-flight to avoid infinite re-delivery loops. They will be cleaned up by the periodic
+ * timeout-based recycler instead.
+ *
+ * @return the number of events recycled
+ */
+ private int recycleInFlightEventsForConsumer(final String consumerId) {
+ final AtomicInteger count = new AtomicInteger(0);
+ // Snapshot the keys, then mutate per-entry atomically via compute() (same pattern as
+ // recycleInFlightEvents) so concurrent ack/nack cannot interleave.
+ for (final Pair key :
+ new ArrayList<>(inFlightEvents.keySet())) {
+ if (!key.getLeft().equals(consumerId)) {
+ continue;
+ }
+ inFlightEvents.compute(
+ key,
+ (k, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ // If the event has been nacked too many times, leave it and let the timeout recycler
+ // handle it.
+ if (ev.getNackCount() >= MAX_CONSUMER_RECYCLE_NACK_COUNT) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: event {} for consumer {} exceeded max nack "
+ + "count ({}), skipping recycle to prevent infinite loop",
+ this,
+ ev,
+ consumerId,
+ MAX_CONSUMER_RECYCLE_NACK_COUNT);
+ return ev; // keep in inFlightEvents
+ }
+ ev.nack();
+ prefetchingQueue.add(ev);
+ count.incrementAndGet();
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: recycled uncommitted event {} for consumer {} "
+ + "back to prefetching queue",
+ this,
+ ev,
+ consumerId);
+ return null;
+ });
+ }
+ return count.get();
+ }
+
+ // ======================== Cleanup ========================
+
+ /**
+  * Releases every queued and in-flight event and clears all tracking state. Runs under the
+  * write lock so no poll/ack can interleave with the teardown.
+  */
+ public void cleanUp() {
+   acquireWriteLock();
+   try {
+     for (final SubscriptionEvent event : prefetchingQueue) {
+       event.cleanUp(true);
+     }
+     prefetchingQueue.clear();
+
+     for (final SubscriptionEvent event : inFlightEvents.values()) {
+       event.cleanUp(true);
+     }
+     inFlightEvents.clear();
+
+     outstandingCommitIdToStartIndex.clear();
+   } finally {
+     releaseWriteLock();
+   }
+ }
+
+ /**
+  * Shuts the queue down in order: mark closed, stop the prefetch thread (interrupt + bounded
+  * join), unregister from the consensus server (stops the in-memory feed, unpins WAL), release
+  * all events via {@link #cleanUp()}, and finally persist commit progress.
+  */
+ public void close() {
+ markClosed();
+ // Stop background prefetch thread
+ prefetchThread.interrupt();
+ try {
+ // Bounded wait so close() cannot hang on a stuck prefetch thread.
+ prefetchThread.join(5000);
+ } catch (final InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ try {
+ // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL).
+ serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier);
+ } catch (final Exception e) {
+ LOGGER.warn("ConsensusPrefetchingQueue {}: error during unregister", this, e);
+ } finally {
+ try {
+ cleanUp();
+ } finally {
+ // Persist progress before closing
+ commitManager.persistAll();
+ }
+ }
+ }
+
+ private SubscriptionEvent generateErrorResponse(final String errorMessage) {
+ return new SubscriptionEvent(
+ SubscriptionPollResponseType.ERROR.getType(),
+ new ErrorPayload(errorMessage, false),
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ INVALID_COMMIT_ID));
+ }
+
+ private SubscriptionEvent generateOutdatedErrorResponse() {
+ return new SubscriptionEvent(
+ SubscriptionPollResponseType.ERROR.getType(),
+ ErrorPayload.OUTDATED_ERROR_PAYLOAD,
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ INVALID_COMMIT_ID));
+ }
+
+ /**
+  * A commit context is outdated when it was issued before the datanode's current boot epoch, or
+  * before this queue's initial commit id.
+  */
+ public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
+   final boolean fromEarlierBoot =
+       commitContext.getRebootTimes() < PipeDataNodeAgent.runtime().getRebootTimes();
+   final boolean fromEarlierCommitEpoch = commitContext.getCommitId() < initialCommitId;
+   return fromEarlierBoot || fromEarlierCommitEpoch;
+ }
+
+ // ======================== Status ========================
+
+ /** True once {@link #markClosed()} (or {@link #close()}) has been called. */
+ public boolean isClosed() {
+ return isClosed;
+ }
+
+ /** Flags the queue closed; poll/ack/nack become no-ops and the prefetch loop exits. */
+ public void markClosed() {
+ isClosed = true;
+ }
+
+ /** Stable identifier for this queue: brokerId and topicName. */
+ public String getPrefetchingQueueId() {
+ return brokerId + "_" + topicName;
+ }
+
+ /** Number of delivered-but-uncommitted (in-flight) events. */
+ public long getSubscriptionUncommittedEventCount() {
+ return inFlightEvents.size();
+ }
+
+ /** Next commit id that will be assigned to a new event. */
+ public long getCurrentCommitId() {
+ return commitIdGenerator.get();
+ }
+
+ /** Number of events currently waiting in the prefetching queue. */
+ public int getPrefetchedEventCount() {
+ return prefetchingQueue.size();
+ }
+
+ /** Next WAL searchIndex this queue expects to read. */
+ public long getCurrentReadSearchIndex() {
+ return nextExpectedSearchIndex.get();
+ }
+
+ public String getBrokerId() {
+ return brokerId;
+ }
+
+ public String getTopicName() {
+ return topicName;
+ }
+
+ public String getConsensusGroupId() {
+ return consensusGroupId;
+ }
+
+ // ======================== Stringify ========================
+
+ /** Snapshot of this queue's key metrics as a string map, for monitoring and reporting. */
+ public Map coreReportMessage() {
+   final Map report = new HashMap<>();
+   report.put("brokerId", brokerId);
+   report.put("topicName", topicName);
+   report.put("consensusGroupId", consensusGroupId);
+   report.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get()));
+   report.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size()));
+   report.put("inFlightEventsSize", String.valueOf(inFlightEvents.size()));
+   report.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size()));
+   report.put("pendingEntriesSize", String.valueOf(pendingEntries.size()));
+   report.put("commitIdGenerator", commitIdGenerator.toString());
+   report.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get()));
+   report.put("isClosed", String.valueOf(isClosed));
+   return report;
+ }
+
+ @Override
+ public String toString() {
+   return "ConsensusPrefetchingQueue" + coreReportMessage();
+ }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
new file mode 100644
index 0000000000000..91883c94b1e11
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
@@ -0,0 +1,425 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Manages commit state for consensus-based subscriptions.
+ *
+ * <p>This manager tracks which events have been committed by consumers and maps commit IDs back to
+ * WAL search indices. It maintains the progress for each (consumerGroup, topic, region) triple and
+ * supports persistence and recovery.
+ *
+ * <p>Progress is tracked per-region because searchIndex is region-local — each DataRegion
+ * has its own independent WAL with its own searchIndex namespace. Using a single state per topic
+ * would cause TreeSet deduplication bugs when different regions emit the same searchIndex value.
+ *
+ * <p>Key responsibilities:
+ *
+ * <ul>
+ *   <li>Track the mapping from commitId to searchIndex
+ *   <li>Handle commit/ack from consumers
+ *   <li>Persist and recover progress state
+ * </ul>
+ */
+public class ConsensusSubscriptionCommitManager {
+
+  private static final Logger LOGGER =
+      LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class);
+
+  private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_";
+  private static final String PROGRESS_FILE_SUFFIX = ".dat";
+
+  // Use a separator that cannot appear in consumerGroupId, topicName, or regionId
+  // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c").
+  private static final String KEY_SEPARATOR = "##";
+
+  /** Key: consumerGroupId + "##" + topicName + "##" + regionId -> progress tracking state. */
+  private final Map<String, ConsensusSubscriptionCommitState> commitStates =
+      new ConcurrentHashMap<>();
+
+  /** Directory under the system dir where per-key progress files are persisted. */
+  private final String persistDir;
+
+  private ConsensusSubscriptionCommitManager() {
+    this.persistDir =
+        IoTDBDescriptor.getInstance().getConfig().getSystemDir()
+            + File.separator
+            + "subscription"
+            + File.separator
+            + "consensus_progress";
+    final File dir = new File(persistDir);
+    // mkdirs() returns false on failure (or if the dir already exists); only warn when the
+    // directory is genuinely absent afterwards.
+    if (!dir.exists() && !dir.mkdirs() && !dir.exists()) {
+      LOGGER.warn("Failed to create consensus subscription progress directory {}", dir);
+    }
+  }
+
+  /**
+   * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionId the consensus group / data region ID string
+   * @return the commit state
+   */
+  public ConsensusSubscriptionCommitState getOrCreateState(
+      final String consumerGroupId, final String topicName, final String regionId) {
+    final String key = generateKey(consumerGroupId, topicName, regionId);
+    return commitStates.computeIfAbsent(
+        key,
+        k -> {
+          // Try to recover from persisted state
+          final ConsensusSubscriptionCommitState recovered = tryRecover(key);
+          if (recovered != null) {
+            return recovered;
+          }
+          return new ConsensusSubscriptionCommitState(new SubscriptionConsensusProgress(0L, 0L));
+        });
+  }
+
+  /**
+   * Records commitId to searchIndex mapping for later commit handling.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionId the consensus group / data region ID string
+   * @param commitId the assigned commit ID
+   * @param searchIndex the WAL search index corresponding to this event
+   */
+  public void recordCommitMapping(
+      final String consumerGroupId,
+      final String topicName,
+      final String regionId,
+      final long commitId,
+      final long searchIndex) {
+    final ConsensusSubscriptionCommitState state =
+        getOrCreateState(consumerGroupId, topicName, regionId);
+    state.recordMapping(commitId, searchIndex);
+  }
+
+  /**
+   * Handles commit (ack) for an event. Updates the progress and potentially advances the committed
+   * search index.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionId the consensus group / data region ID string
+   * @param commitId the committed event's commit ID
+   * @return true if commit handled successfully
+   */
+  public boolean commit(
+      final String consumerGroupId,
+      final String topicName,
+      final String regionId,
+      final long commitId) {
+    final String key = generateKey(consumerGroupId, topicName, regionId);
+    final ConsensusSubscriptionCommitState state = commitStates.get(key);
+    if (state == null) {
+      LOGGER.warn(
+          "ConsensusSubscriptionCommitManager: Cannot commit for unknown state, "
+              + "consumerGroupId={}, topicName={}, regionId={}, commitId={}",
+          consumerGroupId,
+          topicName,
+          regionId,
+          commitId);
+      return false;
+    }
+    final boolean success = state.commit(commitId);
+    if (success) {
+      // Periodically persist progress
+      persistProgressIfNeeded(key, state);
+    }
+    return success;
+  }
+
+  /**
+   * Gets the current committed search index for a specific region's state.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionId the consensus group / data region ID string
+   * @return the committed search index, or -1 if no state exists
+   */
+  public long getCommittedSearchIndex(
+      final String consumerGroupId, final String topicName, final String regionId) {
+    final String key = generateKey(consumerGroupId, topicName, regionId);
+    final ConsensusSubscriptionCommitState state = commitStates.get(key);
+    if (state == null) {
+      return -1;
+    }
+    return state.getCommittedSearchIndex();
+  }
+
+  /**
+   * Removes state for a specific (consumerGroup, topic, region) triple.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionId the consensus group / data region ID string
+   */
+  public void removeState(
+      final String consumerGroupId, final String topicName, final String regionId) {
+    final String key = generateKey(consumerGroupId, topicName, regionId);
+    commitStates.remove(key);
+    // Clean up persisted file
+    deleteProgressFileQuietly(getProgressFile(key));
+  }
+
+  /**
+   * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during
+   * subscription teardown when the individual regionIds may not be readily available.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   */
+  public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) {
+    final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR;
+    final Iterator<Map.Entry<String, ConsensusSubscriptionCommitState>> it =
+        commitStates.entrySet().iterator();
+    while (it.hasNext()) {
+      final Map.Entry<String, ConsensusSubscriptionCommitState> entry = it.next();
+      if (entry.getKey().startsWith(prefix)) {
+        it.remove();
+        deleteProgressFileQuietly(getProgressFile(entry.getKey()));
+      }
+    }
+  }
+
+  /** Persists all states. Should be called during graceful shutdown. */
+  public void persistAll() {
+    for (final Map.Entry<String, ConsensusSubscriptionCommitState> entry :
+        commitStates.entrySet()) {
+      persistProgress(entry.getKey(), entry.getValue());
+    }
+  }
+
+  // ======================== Helper Methods ========================
+
+  private String generateKey(
+      final String consumerGroupId, final String topicName, final String regionId) {
+    return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId;
+  }
+
+  private File getProgressFile(final String key) {
+    return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX);
+  }
+
+  /** Deletes a progress file if it exists, logging (not throwing) on failure. */
+  private void deleteProgressFileQuietly(final File file) {
+    if (file.exists() && !file.delete()) {
+      LOGGER.warn("Failed to delete consensus subscription progress file {}", file);
+    }
+  }
+
+  private ConsensusSubscriptionCommitState tryRecover(final String key) {
+    final File file = getProgressFile(key);
+    if (!file.exists()) {
+      return null;
+    }
+    try (final FileInputStream fis = new FileInputStream(file)) {
+      final byte[] bytes = new byte[(int) file.length()];
+      // InputStream.read(byte[]) may return fewer bytes than requested; loop until the
+      // buffer is full or EOF so a partial read cannot corrupt recovery.
+      int offset = 0;
+      while (offset < bytes.length) {
+        final int n = fis.read(bytes, offset, bytes.length - offset);
+        if (n < 0) {
+          break;
+        }
+        offset += n;
+      }
+      final ByteBuffer buffer = ByteBuffer.wrap(bytes, 0, offset);
+      return ConsensusSubscriptionCommitState.deserialize(buffer);
+    } catch (final IOException | RuntimeException e) {
+      // RuntimeException (e.g. BufferUnderflowException) guards against truncated or
+      // corrupt progress files; recovery simply falls back to a fresh state.
+      LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e);
+      return null;
+    }
+  }
+
+  private void persistProgressIfNeeded(
+      final String key, final ConsensusSubscriptionCommitState state) {
+    // Persist every 100 commits to reduce disk IO
+    if (state.getProgress().getCommitIndex() % 100 == 0) {
+      persistProgress(key, state);
+    }
+  }
+
+  private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) {
+    final File file = getProgressFile(key);
+    try (final FileOutputStream fos = new FileOutputStream(file);
+        final DataOutputStream dos = new DataOutputStream(fos)) {
+      state.serialize(dos);
+      dos.flush();
+    } catch (final IOException e) {
+      LOGGER.warn("Failed to persist consensus subscription progress to {}", file, e);
+    }
+  }
+
+  // ======================== Inner State Class ========================
+
+  /**
+   * Tracks commit state for a single (consumerGroup, topic, region) triple. Maintains the mapping
+   * from commitId to searchIndex and tracks committed progress within one region's WAL.
+   */
+  public static class ConsensusSubscriptionCommitState {
+
+    /** Threshold for warning about outstanding (uncommitted) search indices accumulation. */
+    private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000;
+
+    private final SubscriptionConsensusProgress progress;
+
+    /**
+     * Maps commitId -> searchIndex. Records which WAL search index corresponds to each committed
+     * event. Entries are removed once committed.
+     */
+    private final Map<Long, Long> commitIdToSearchIndex = new ConcurrentHashMap<>();
+
+    /**
+     * Tracks the safe recovery position: the highest search index where all prior dispatched events
+     * have been committed. Only advances contiguously — never jumps over uncommitted gaps.
+     */
+    private volatile long committedSearchIndex;
+
+    /**
+     * Tracks the maximum search index among all committed events (may be ahead of
+     * committedSearchIndex when out-of-order commits exist). Used to update committedSearchIndex
+     * once all outstanding events are committed. Guarded by {@code synchronized (this)}.
+     */
+    private long maxCommittedSearchIndex;
+
+    /**
+     * Tracks search indices of dispatched but not-yet-committed events. Used to prevent
+     * committedSearchIndex from jumping over uncommitted gaps. On commit, the frontier advances to
+     * min(outstanding) - 1 (or maxCommittedSearchIndex if empty).
+     *
+     * <p>Since state is now per-region, searchIndex values within this set are guaranteed unique
+     * (they come from a single region's monotonically increasing WAL searchIndex).
+     */
+    private final TreeSet<Long> outstandingSearchIndices = new TreeSet<>();
+
+    public ConsensusSubscriptionCommitState(final SubscriptionConsensusProgress progress) {
+      this.progress = progress;
+      this.committedSearchIndex = progress.getSearchIndex();
+      this.maxCommittedSearchIndex = progress.getSearchIndex();
+    }
+
+    public SubscriptionConsensusProgress getProgress() {
+      return progress;
+    }
+
+    public long getCommittedSearchIndex() {
+      return committedSearchIndex;
+    }
+
+    /** Records a dispatched event's commitId -> searchIndex mapping and marks it outstanding. */
+    public void recordMapping(final long commitId, final long searchIndex) {
+      synchronized (this) {
+        commitIdToSearchIndex.put(commitId, searchIndex);
+        outstandingSearchIndices.add(searchIndex);
+        final int size = outstandingSearchIndices.size();
+        // "% ... == 1" rate-limits the warning to once per threshold-sized batch.
+        if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) {
+          LOGGER.warn(
+              "ConsensusSubscriptionCommitState: outstandingSearchIndices size ({}) exceeds "
+                  + "threshold ({}), consumers may not be committing. committedSearchIndex={}, "
+                  + "maxCommittedSearchIndex={}, commitIdToSearchIndex size={}",
+              size,
+              OUTSTANDING_SIZE_WARN_THRESHOLD,
+              committedSearchIndex,
+              maxCommittedSearchIndex,
+              commitIdToSearchIndex.size());
+        }
+      }
+    }
+
+    /**
+     * Commits the specified event and advances the committed search index contiguously.
+     *
+     * <p>The committed search index only advances to a position where all prior dispatched events
+     * have been committed. This prevents the recovery position from jumping over uncommitted gaps,
+     * ensuring at-least-once delivery even after crash recovery.
+     *
+     * @param commitId the commit ID to commit
+     * @return true if successfully committed
+     */
+    public boolean commit(final long commitId) {
+      // Advance committed search index contiguously (gap-aware).
+      // Both remove from commitIdToSearchIndex and outstandingSearchIndices must be
+      // inside the same synchronized block to prevent a race with recordMapping():
+      //   recordMapping: put(commitId, si) -> add(si)
+      //   commit:        remove(commitId)  -> remove(si)
+      // Without atomicity, commit could remove from map between put and add,
+      // leaving si permanently in outstandingSearchIndices (WAL leak).
+      synchronized (this) {
+        final Long searchIndex = commitIdToSearchIndex.remove(commitId);
+        if (searchIndex == null) {
+          LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId);
+          return false;
+        }
+        // Count only successful commits, so persistence throttling and diagnostics
+        // reflect real progress (previously incremented even for unknown commitIds).
+        progress.incrementCommitIndex();
+        outstandingSearchIndices.remove(searchIndex);
+        if (searchIndex > maxCommittedSearchIndex) {
+          maxCommittedSearchIndex = searchIndex;
+        }
+
+        if (outstandingSearchIndices.isEmpty()) {
+          // All dispatched events have been committed — advance to the max
+          committedSearchIndex = maxCommittedSearchIndex;
+        } else {
+          // Advance to just below the earliest uncommitted event
+          // (never go backward)
+          committedSearchIndex =
+              Math.max(committedSearchIndex, outstandingSearchIndices.first() - 1);
+        }
+        progress.setSearchIndex(committedSearchIndex);
+      }
+
+      return true;
+    }
+
+    /** Serializes the progress plus the committed search index. */
+    public void serialize(final DataOutputStream stream) throws IOException {
+      progress.serialize(stream);
+      stream.writeLong(committedSearchIndex);
+    }
+
+    /** Mirror of {@link #serialize}; maxCommittedSearchIndex restarts at the persisted frontier. */
+    public static ConsensusSubscriptionCommitState deserialize(final ByteBuffer buffer) {
+      final SubscriptionConsensusProgress progress =
+          SubscriptionConsensusProgress.deserialize(buffer);
+      final ConsensusSubscriptionCommitState state = new ConsensusSubscriptionCommitState(progress);
+      state.committedSearchIndex = buffer.getLong();
+      state.maxCommittedSearchIndex = state.committedSearchIndex;
+      return state;
+    }
+  }
+
+  // ======================== Singleton ========================
+
+  // Initialization-on-demand holder idiom: thread-safe lazy singleton without locking.
+  private static class Holder {
+    private static final ConsensusSubscriptionCommitManager INSTANCE =
+        new ConsensusSubscriptionCommitManager();
+  }
+
+  public static ConsensusSubscriptionCommitManager getInstance() {
+    return Holder.INSTANCE;
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
new file mode 100644
index 0000000000000..a36b9e29fe7ed
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.consensus.ConsensusGroupId;
+import org.apache.iotdb.commons.consensus.DataRegionId;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern;
+import org.apache.iotdb.consensus.IConsensus;
+import org.apache.iotdb.consensus.iot.IoTConsensus;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.conf.IoTDBConfig;
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+import org.apache.iotdb.db.consensus.DataRegionConsensusImpl;
+import org.apache.iotdb.db.storageengine.StorageEngine;
+import org.apache.iotdb.db.storageengine.dataregion.DataRegion;
+import org.apache.iotdb.db.subscription.agent.SubscriptionAgent;
+import org.apache.iotdb.rpc.subscription.config.TopicConfig;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Handles the setup and teardown of consensus-based subscription queues on DataNode. When a
+ * real-time subscription is detected, this handler finds the local IoTConsensus data regions,
+ * creates the appropriate converter, and binds prefetching queues to the subscription broker.
+ */
+public class ConsensusSubscriptionSetupHandler {
+
+  private static final Logger LOGGER =
+      LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class);
+
+  private ConsensusSubscriptionSetupHandler() {
+    // utility class
+  }
+
+  /**
+   * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new
+   * DataRegion is created, all active consensus subscriptions are automatically bound to the new
+   * region, and when a DataRegion is removed, all subscription queues are properly cleaned up.
+   *
+   * <p>Synchronized: the null-check-then-assign on the static callback fields is otherwise a
+   * check-then-act race when multiple consumer groups are set up concurrently.
+   */
+  public static synchronized void ensureNewRegionListenerRegistered() {
+    if (IoTConsensus.onNewPeerCreated == null) {
+      IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated;
+      LOGGER.info(
+          "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding");
+    }
+    if (IoTConsensus.onPeerRemoved == null) {
+      IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved;
+      LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup");
+    }
+  }
+
+  /**
+   * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries
+   * existing subscription metadata to find all active consensus subscriptions and binds prefetching
+   * queues to the new region.
+   */
+  private static void onNewRegionCreated(
+      final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) {
+    if (!(groupId instanceof DataRegionId)) {
+      return;
+    }
+
+    // Query existing metadata keepers for all active subscriptions
+    final Map<String, Set<String>> allSubscriptions =
+        SubscriptionAgent.consumer().getAllSubscriptions();
+    if (allSubscriptions.isEmpty()) {
+      return;
+    }
+
+    final ConsensusSubscriptionCommitManager commitManager =
+        ConsensusSubscriptionCommitManager.getInstance();
+
+    LOGGER.info(
+        "New DataRegion {} created, checking {} consumer group(s) for auto-binding, "
+            + "currentSearchIndex={}",
+        groupId,
+        allSubscriptions.size(),
+        serverImpl.getSearchIndex());
+
+    for (final Map.Entry<String, Set<String>> groupEntry : allSubscriptions.entrySet()) {
+      final String consumerGroupId = groupEntry.getKey();
+      for (final String topicName : groupEntry.getValue()) {
+        if (!isConsensusBasedTopic(topicName)) {
+          continue;
+        }
+        try {
+          final Map<String, TopicConfig> topicConfigs =
+              SubscriptionAgent.topic().getTopicConfigs(Collections.singleton(topicName));
+          final TopicConfig topicConfig = topicConfigs.get(topicName);
+          if (topicConfig == null) {
+            continue;
+          }
+
+          // Resolve the new DataRegion's actual database name
+          final DataRegion dataRegion =
+              StorageEngine.getInstance().getDataRegion((DataRegionId) groupId);
+          if (dataRegion == null) {
+            continue;
+          }
+          final String dbRaw = dataRegion.getDatabaseName();
+          // Strip the tree-model "root." prefix to get the table-model database name.
+          final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw;
+
+          // For table topics, skip if this region's database doesn't match the topic filter
+          if (topicConfig.isTableTopic()) {
+            final String topicDb =
+                topicConfig.getStringOrDefault(
+                    TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE);
+            if (topicDb != null
+                && !topicDb.isEmpty()
+                && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDb)
+                && !topicDb.equalsIgnoreCase(dbTableModel)) {
+              continue;
+            }
+          }
+
+          final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
+          final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
+
+          // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail
+          // for brand-new regions that have no prior subscription progress.
+          final long persistedIndex =
+              commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString());
+          final long startSearchIndex =
+              (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1;
+
+          LOGGER.info(
+              "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} "
+                  + "(database={}, startSearchIndex={}, persistedIndex={})",
+              topicName,
+              consumerGroupId,
+              groupId,
+              dbTableModel,
+              startSearchIndex,
+              persistedIndex);
+
+          SubscriptionAgent.broker()
+              .bindConsensusPrefetchingQueue(
+                  consumerGroupId,
+                  topicName,
+                  groupId.toString(),
+                  serverImpl,
+                  converter,
+                  commitManager,
+                  startSearchIndex);
+        } catch (final Exception e) {
+          LOGGER.error(
+              "Failed to auto-bind topic [{}] in group [{}] to new region {}",
+              topicName,
+              consumerGroupId,
+              groupId,
+              e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and
+   * cleans up all subscription prefetching queues associated with the removed region across all
+   * consumer groups.
+   */
+  private static void onRegionRemoved(final ConsensusGroupId groupId) {
+    if (!(groupId instanceof DataRegionId)) {
+      return;
+    }
+    final String regionIdStr = groupId.toString();
+    LOGGER.info(
+        "DataRegion {} being removed, unbinding all consensus subscription queues", regionIdStr);
+    try {
+      SubscriptionAgent.broker().unbindByRegion(regionIdStr);
+    } catch (final Exception e) {
+      LOGGER.error(
+          "Failed to unbind consensus subscription queues for removed region {}", regionIdStr, e);
+    }
+  }
+
+  /**
+   * Returns whether the given topic should be served by a consensus-based (WAL-backed) queue:
+   * live-mode topics that are not in tsfile-handler format. Defaults to false on lookup failure.
+   */
+  public static boolean isConsensusBasedTopic(final String topicName) {
+    try {
+      final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName);
+      final String topicFormat = SubscriptionAgent.topic().getTopicFormat(topicName);
+      final boolean result =
+          TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+              && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+      LOGGER.info(
+          "isConsensusBasedTopic check for topic [{}]: mode={}, format={}, result={}",
+          topicName,
+          topicMode,
+          topicFormat,
+          result);
+      return result;
+    } catch (final Exception e) {
+      LOGGER.warn(
+          "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e);
+      return false;
+    }
+  }
+
+  /**
+   * Sets up consensus-based subscription queues for every consensus-based topic in the given set.
+   * No-op (with a warning) when the data-region consensus protocol is not IoTConsensus.
+   */
+  public static void setupConsensusSubscriptions(
+      final String consumerGroupId, final Set<String> topicNames) {
+    final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance();
+    if (!(dataRegionConsensus instanceof IoTConsensus)) {
+      LOGGER.warn(
+          "Data region consensus is not IoTConsensus (actual: {}), "
+              + "cannot set up consensus-based subscription for consumer group [{}]",
+          dataRegionConsensus.getClass().getSimpleName(),
+          consumerGroupId);
+      return;
+    }
+
+    // Ensure the new-region listener is registered (idempotent)
+    ensureNewRegionListenerRegistered();
+
+    final IoTConsensus ioTConsensus = (IoTConsensus) dataRegionConsensus;
+    final ConsensusSubscriptionCommitManager commitManager =
+        ConsensusSubscriptionCommitManager.getInstance();
+
+    LOGGER.info(
+        "Setting up consensus subscriptions for consumer group [{}], topics={}, "
+            + "total consensus groups={}",
+        consumerGroupId,
+        topicNames,
+        ioTConsensus.getAllConsensusGroupIds().size());
+
+    for (final String topicName : topicNames) {
+      if (!isConsensusBasedTopic(topicName)) {
+        continue;
+      }
+
+      try {
+        setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager);
+      } catch (final Exception e) {
+        LOGGER.error(
+            "Failed to set up consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId,
+            e);
+      }
+    }
+  }
+
+  /**
+   * Set up consensus queue for a single topic. Discovers all local data region consensus groups and
+   * binds a ConsensusReqReader-based prefetching queue to every matching region.
+   *
+   * <p>For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY}
+   * filter are bound. For tree-model topics, all local data regions are bound. Additionally, the
+   * {@link #onNewRegionCreated} callback ensures that regions created after this method runs are
+   * also automatically bound.
+   */
+  private static void setupConsensusQueueForTopic(
+      final String consumerGroupId,
+      final String topicName,
+      final IoTConsensus ioTConsensus,
+      final ConsensusSubscriptionCommitManager commitManager) {
+
+    // Get topic config for building the converter
+    final Map<String, TopicConfig> topicConfigs =
+        SubscriptionAgent.topic().getTopicConfigs(Collections.singleton(topicName));
+    final TopicConfig topicConfig = topicConfigs.get(topicName);
+    if (topicConfig == null) {
+      LOGGER.warn(
+          "Topic config not found for topic [{}], cannot set up consensus queue", topicName);
+      return;
+    }
+
+    // Build the converter based on topic config (path pattern, time range, tree/table model)
+    LOGGER.info(
+        "Setting up consensus queue for topic [{}]: isTableTopic={}, config={}",
+        topicName,
+        topicConfig.isTableTopic(),
+        topicConfig.getAttribute());
+
+    // For table topics, extract the database filter from topic config
+    final String topicDatabaseFilter =
+        topicConfig.isTableTopic()
+            ? topicConfig.getStringOrDefault(
+                TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE)
+            : null;
+
+    final List<ConsensusGroupId> allGroupIds = ioTConsensus.getAllConsensusGroupIds();
+    LOGGER.info(
+        "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}",
+        allGroupIds.size(),
+        topicName,
+        consumerGroupId,
+        allGroupIds);
+    boolean bound = false;
+
+    for (final ConsensusGroupId groupId : allGroupIds) {
+      if (!(groupId instanceof DataRegionId)) {
+        continue;
+      }
+
+      final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId);
+      if (serverImpl == null) {
+        continue;
+      }
+
+      // Resolve the DataRegion's actual database name
+      final DataRegion dataRegion =
+          StorageEngine.getInstance().getDataRegion((DataRegionId) groupId);
+      if (dataRegion == null) {
+        continue;
+      }
+      final String dbRaw = dataRegion.getDatabaseName();
+      final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw;
+
+      if (topicDatabaseFilter != null
+          && !topicDatabaseFilter.isEmpty()
+          && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDatabaseFilter)
+          && !topicDatabaseFilter.equalsIgnoreCase(dbTableModel)) {
+        LOGGER.info(
+            "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})",
+            groupId,
+            dbTableModel,
+            topicName,
+            topicDatabaseFilter);
+        continue;
+      }
+
+      final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
+      final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
+
+      // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail
+      // for brand-new regions that have no prior subscription progress.
+      final long persistedIndex =
+          commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString());
+      final long startSearchIndex =
+          (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1;
+
+      LOGGER.info(
+          "Binding consensus prefetching queue for topic [{}] in consumer group [{}] "
+              + "to data region consensus group [{}] (database={}, startSearchIndex={}, "
+              + "persistedIndex={})",
+          topicName,
+          consumerGroupId,
+          groupId,
+          dbTableModel,
+          startSearchIndex,
+          persistedIndex);
+
+      SubscriptionAgent.broker()
+          .bindConsensusPrefetchingQueue(
+              consumerGroupId,
+              topicName,
+              groupId.toString(),
+              serverImpl,
+              converter,
+              commitManager,
+              startSearchIndex);
+
+      bound = true;
+    }
+
+    if (!bound) {
+      LOGGER.warn(
+          "No local IoTConsensus data region found for topic [{}] in consumer group [{}]. "
+              + "Consensus subscription will be set up when a matching data region becomes available.",
+          topicName,
+          consumerGroupId);
+    }
+  }
+
+  /**
+   * Builds the WAL-entry-to-tablet converter for a topic: a table pattern (database + table filter)
+   * for table-model topics, or a tree pattern (prefix pattern or IoTDB path) for tree-model topics.
+   */
+  private static ConsensusLogToTabletConverter buildConverter(
+      final TopicConfig topicConfig, final String actualDatabaseName) {
+    // Determine tree or table model
+    final boolean isTableTopic = topicConfig.isTableTopic();
+
+    TreePattern treePattern = null;
+    TablePattern tablePattern = null;
+
+    if (isTableTopic) {
+      // Table model: database + table name pattern
+      final String database =
+          topicConfig.getStringOrDefault(
+              TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE);
+      final String table =
+          topicConfig.getStringOrDefault(
+              TopicConstant.TABLE_KEY, TopicConstant.TABLE_DEFAULT_VALUE);
+      tablePattern = new TablePattern(true, database, table);
+    } else {
+      // Tree model: path or pattern
+      if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) {
+        final String pattern = topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY);
+        treePattern = new PrefixTreePattern(pattern);
+      } else {
+        final String path =
+            topicConfig.getStringOrDefault(
+                TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE);
+        treePattern = new IoTDBTreePattern(path);
+      }
+    }
+
+    return new ConsensusLogToTabletConverter(treePattern, tablePattern, actualDatabaseName);
+  }
+
+  /**
+   * Tears down consensus subscription queues and commit state for every topic in the set.
+   * Best-effort: failures are logged per topic so one bad topic does not block the rest.
+   */
+  public static void teardownConsensusSubscriptions(
+      final String consumerGroupId, final Set<String> topicNames) {
+    for (final String topicName : topicNames) {
+      try {
+        SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName);
+
+        // Clean up commit state for all regions of this topic
+        ConsensusSubscriptionCommitManager.getInstance()
+            .removeAllStatesForTopic(consumerGroupId, topicName);
+
+        LOGGER.info(
+            "Tore down consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId);
+      } catch (final Exception e) {
+        LOGGER.warn(
+            "Failed to tear down consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId,
+            e);
+      }
+    }
+  }
+
+  /** Convenience entry point invoked when a consumer group adds new topic subscriptions. */
+  public static void handleNewSubscriptions(
+      final String consumerGroupId, final Set<String> newTopicNames) {
+    if (newTopicNames == null || newTopicNames.isEmpty()) {
+      return;
+    }
+
+    LOGGER.info(
+        "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}",
+        consumerGroupId,
+        newTopicNames);
+
+    setupConsensusSubscriptions(consumerGroupId, newTopicNames);
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
new file mode 100644
index 0000000000000..9e45f8a160127
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.tsfile.utils.ReadWriteIOUtils;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region)
+ * combination.
+ *
+ * <p>Since searchIndex is region-local (each DataRegion has its own independent WAL and searchIndex
+ * namespace), progress is tracked per-region:
+ *
+ * <ul>
+ *   <li>searchIndex: The committed WAL search index — the highest position where all prior
+ *       dispatched events have been acknowledged. Used as the recovery start point after crash.
+ *   <li>commitIndex: Monotonically increasing count of committed events. Used for persistence
+ *       throttling and diagnostics.
+ * </ul>
+ *
+ * <p>Both counters are {@code AtomicLong}-backed, so individual reads and writes are thread-safe;
+ * note there is no atomicity across the pair.
+ */
+public class SubscriptionConsensusProgress {
+
+  private final AtomicLong searchIndex;
+
+  private final AtomicLong commitIndex;
+
+  /** Starts from zero progress (fresh subscription, nothing committed yet). */
+  public SubscriptionConsensusProgress() {
+    this(0L, 0L);
+  }
+
+  public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) {
+    this.searchIndex = new AtomicLong(searchIndex);
+    this.commitIndex = new AtomicLong(commitIndex);
+  }
+
+  public long getSearchIndex() {
+    return searchIndex.get();
+  }
+
+  public void setSearchIndex(final long searchIndex) {
+    this.searchIndex.set(searchIndex);
+  }
+
+  public long getCommitIndex() {
+    return commitIndex.get();
+  }
+
+  public void setCommitIndex(final long commitIndex) {
+    this.commitIndex.set(commitIndex);
+  }
+
+  public void incrementCommitIndex() {
+    this.commitIndex.incrementAndGet();
+  }
+
+  /** Writes searchIndex then commitIndex as two longs; the layout must match {@link #deserialize}. */
+  public void serialize(final DataOutputStream stream) throws IOException {
+    ReadWriteIOUtils.write(searchIndex.get(), stream);
+    ReadWriteIOUtils.write(commitIndex.get(), stream);
+  }
+
+  /** Reads the two longs written by {@link #serialize(DataOutputStream)}. */
+  public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) {
+    final long searchIndex = ReadWriteIOUtils.readLong(buffer);
+    final long commitIndex = ReadWriteIOUtils.readLong(buffer);
+    return new SubscriptionConsensusProgress(searchIndex, commitIndex);
+  }
+
+  // NOTE: equals/hashCode depend on the current (mutable) counter values — the hash changes as
+  // progress advances, so instances are unsuitable as keys in hash-based collections.
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o;
+    return searchIndex.get() == that.searchIndex.get()
+        && commitIndex.get() == that.commitIndex.get();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(searchIndex.get(), commitIndex.get());
+  }
+
+  @Override
+  public String toString() {
+    return "SubscriptionConsensusProgress{"
+        + "searchIndex="
+        + searchIndex.get()
+        + ", commitIndex="
+        + commitIndex.get()
+        + '}';
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
index dfadee5908fa5..9ede61fbffe74 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
@@ -248,6 +248,11 @@ public void nack() {
}
}
+ /** Returns how many times this event has been negatively acknowledged (read from the atomic nack counter). */
+ public long getNackCount() {
+ return nackCount.get();
+ }
+
public void recordLastPolledConsumerId(final String consumerId) {
lastPolledConsumerId = consumerId;
}
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
index c7e7fea8d12f8..9e9c898e3c064 100644
--- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
@@ -30,7 +30,7 @@ public class SubscriptionConfig {
private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig();
public boolean getSubscriptionEnabled() {
- return false;
+ return true; // TODO(review): hard-enables subscription for ALL deployments; gate via COMMON_CONFIG (already on this class) or a property before release
}
public float getSubscriptionCacheMemoryUsagePercentage() {
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
index 4393ef8a6cf61..9f66b48210bc2 100644
--- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
@@ -115,6 +115,26 @@ private boolean shouldRecordSubscriptionCreationTime() {
return unsubscribedTopicNames;
}
+  /**
+   * Computes the topics present in {@code updatedMeta} but absent from {@code currentMeta}, i.e.
+   * the topics newly subscribed by this consumer group. Returns an empty set when the two metas
+   * refer to different groups (different group id or creation time), since a diff between
+   * unrelated metas is meaningless.
+   */
+  public static Set<String> getTopicsNewlySubByGroup(
+      final ConsumerGroupMeta currentMeta, final ConsumerGroupMeta updatedMeta) {
+    if (!Objects.equals(currentMeta.consumerGroupId, updatedMeta.consumerGroupId)
+        || !Objects.equals(currentMeta.creationTime, updatedMeta.creationTime)) {
+      return Collections.emptySet();
+    }
+
+    final Set<String> newlySubscribedTopicNames = new HashSet<>();
+    updatedMeta
+        .topicNameToSubscribedConsumerIdSet
+        .keySet()
+        .forEach(
+            topicName -> {
+              if (!currentMeta.topicNameToSubscribedConsumerIdSet.containsKey(topicName)) {
+                newlySubscribedTopicNames.add(topicName);
+              }
+            });
+    return newlySubscribedTopicNames;
+  }
+
/////////////////////////////// consumer ///////////////////////////////
public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta)
@@ -171,6 +191,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) {
////////////////////////// subscription //////////////////////////
+  /**
+   * Get all topic names subscribed by this consumer group. The result is an unmodifiable view
+   * over the live key set, so it reflects later subscription changes but cannot be mutated.
+   */
+  public Set<String> getSubscribedTopicNames() {
+    return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet());
+  }
+
/**
* Get the consumers subscribing the given topic in this group.
*