failedTests = new ArrayList<>();
+
+ public static void main(String[] args) throws Exception {
+ System.out.println("=== Consensus-Based Subscription Table Model Test Suite ===\n");
+
+ String targetTest = args.length > 0 ? args[0] : null;
+
+ if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) {
+ runTest("testBasicDataDelivery", ConsensusSubscriptionTableTest::testBasicDataDelivery);
+ }
+ if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) {
+ runTest("testMultipleDataTypes", ConsensusSubscriptionTableTest::testMultipleDataTypes);
+ }
+ if (targetTest == null || "testTableLevelFiltering".equals(targetTest)) {
+ runTest("testTableLevelFiltering", ConsensusSubscriptionTableTest::testTableLevelFiltering);
+ }
+ if (targetTest == null || "testDatabaseLevelFiltering".equals(targetTest)) {
+ runTest(
+ "testDatabaseLevelFiltering", ConsensusSubscriptionTableTest::testDatabaseLevelFiltering);
+ }
+ if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
+ runTest(
+ "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion);
+ }
+ if (targetTest == null || "testMultipleTablesAggregation".equals(targetTest)) {
+ runTest(
+ "testMultipleTablesAggregation",
+ ConsensusSubscriptionTableTest::testMultipleTablesAggregation);
+ }
+ if (targetTest == null || "testMultiColumnTypes".equals(targetTest)) {
+ runTest("testMultiColumnTypes", ConsensusSubscriptionTableTest::testMultiColumnTypes);
+ }
+ if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) {
+ runTest("testPollWithoutCommit", ConsensusSubscriptionTableTest::testPollWithoutCommit);
+ }
+ if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) {
+ runTest(
+ "testMultiConsumerGroupIndependent",
+ ConsensusSubscriptionTableTest::testMultiConsumerGroupIndependent);
+ }
+ if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) {
+ runTest(
+ "testMultiTopicSubscription", ConsensusSubscriptionTableTest::testMultiTopicSubscription);
+ }
+ if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) {
+ runTest("testFlushDataDelivery", ConsensusSubscriptionTableTest::testFlushDataDelivery);
+ }
+ if (targetTest == null || "testCrossPartitionMultiWrite".equals(targetTest)) {
+ runTest(
+ "testCrossPartitionMultiWrite",
+ ConsensusSubscriptionTableTest::testCrossPartitionMultiWrite);
+ }
+
+ // Summary
+ System.out.println("\n=== Test Suite Summary ===");
+ System.out.println("Passed: " + passed);
+ System.out.println("Failed: " + failed);
+ if (!failedTests.isEmpty()) {
+ System.out.println("Failed tests: " + failedTests);
+ }
+ System.out.println("=== Done ===");
+ }
+
+ // ============================
+ // Test Infrastructure
+ // ============================
+
  /** A single parameterless test case; may throw to signal failure. */
  @FunctionalInterface
  interface TestMethod {
    // Any thrown AssertionError/Exception is caught and recorded by runTest.
    void run() throws Exception;
  }
+
+ private static void runTest(String name, TestMethod test) {
+ System.out.println("\n" + "=================================================================");
+ System.out.println("Running: " + name);
+ System.out.println("=================================================================");
+ try {
+ test.run();
+ passed++;
+ System.out.println(">>> PASSED: " + name);
+ } catch (AssertionError e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> FAILED: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ } catch (Exception e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> ERROR: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ }
+ }
+
  /**
   * Advances the shared test counter and returns a fresh database name. Call this FIRST in each
   * test: the topic/consumer-group/consumer-id generators below reuse the same counter value, so
   * all artifacts of one test share the same numeric suffix.
   */
  private static String nextDatabase() {
    testCounter++;
    return "csub_tbl_" + testCounter;
  }

  /** Topic name tied to the current test counter (does not advance the counter). */
  private static String nextTopic() {
    return "topic_tbl_" + testCounter;
  }

  /** Consumer group name tied to the current test counter (does not advance the counter). */
  private static String nextConsumerGroup() {
    return "cg_tbl_" + testCounter;
  }

  /** Consumer id tied to the current test counter (does not advance the counter). */
  private static String nextConsumerId() {
    return "consumer_tbl_" + testCounter;
  }
+
  /**
   * Opens a table-model session against the single test node (HOST:PORT). The caller is
   * responsible for closing it (all call sites use try-with-resources).
   */
  private static ITableSession openTableSession() throws Exception {
    return new TableSessionBuilder()
        .nodeUrls(Collections.singletonList(HOST + ":" + PORT))
        .username(USER)
        .password(PASSWORD)
        .build();
  }
+
  /**
   * Creates {@code database} (if absent) and a table with the given schema inside it. Note: leaves
   * the session's current database switched to {@code database} as a side effect of USE.
   *
   * @param tableSchema raw SQL column list, e.g. "tag1 STRING TAG, s1 INT64 FIELD"
   */
  private static void createDatabaseAndTable(
      ITableSession session, String database, String tableName, String tableSchema)
      throws Exception {
    session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database);
    session.executeNonQueryStatement("USE " + database);
    session.executeNonQueryStatement(String.format("CREATE TABLE %s (%s)", tableName, tableSchema));
  }
+
  /** Best-effort database drop for teardown; any failure is deliberately swallowed. */
  private static void deleteDatabase(String database) {
    try (ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("DROP DATABASE IF EXISTS " + database);
    } catch (Exception e) {
      // ignore: cleanup must never fail the test run
    }
  }
+
  /** Best-effort topic drop via a short-lived subscription session; failures are swallowed. */
  private static void dropTopicTable(String topicName) {
    try (ISubscriptionTableSession subSession =
        new SubscriptionTableSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      subSession.dropTopicIfExists(topicName);
    } catch (Exception e) {
      // ignore: cleanup must never fail the test run
    }
  }
+
  /**
   * Creates a live-mode topic delivering SessionDataSets, scoped by database and table patterns.
   * Any pre-existing topic with the same name is dropped first so each test starts clean.
   *
   * @param dbKey value for TopicConstant.DATABASE_KEY (exact database name in these tests)
   * @param tableKey value for TopicConstant.TABLE_KEY (".*" matches all tables)
   */
  private static void createTopicTable(String topicName, String dbKey, String tableKey)
      throws Exception {
    try (ISubscriptionTableSession subSession =
        new SubscriptionTableSessionBuilder()
            .host(HOST)
            .port(PORT)
            .username(USER)
            .password(PASSWORD)
            .build()) {
      try {
        subSession.dropTopicIfExists(topicName);
      } catch (Exception e) {
        // ignore: a leftover topic from a previous run may not exist at all
      }

      Properties topicConfig = new Properties();
      // Live mode: only data written after subscription should be delivered.
      topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE);
      topicConfig.put(
          TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE);
      topicConfig.put(TopicConstant.DATABASE_KEY, dbKey);
      topicConfig.put(TopicConstant.TABLE_KEY, tableKey);
      subSession.createTopic(topicName, topicConfig);
      System.out.println(
          " Created topic: " + topicName + " (database=" + dbKey + ", table=" + tableKey + ")");
    }
  }
+
  /**
   * Builds and opens a pull consumer with auto-commit disabled, so each test controls exactly
   * when progress is committed (required by the re-delivery / no-commit scenarios).
   *
   * @return an already-opened consumer; the caller must close it (see cleanup)
   */
  private static ISubscriptionTablePullConsumer createConsumer(
      String consumerId, String consumerGroupId) throws Exception {
    ISubscriptionTablePullConsumer consumer =
        new SubscriptionTablePullConsumerBuilder()
            .host(HOST)
            .port(PORT)
            .consumerId(consumerId)
            .consumerGroupId(consumerGroupId)
            .autoCommit(false)
            .build();
    consumer.open();
    return consumer;
  }
+
+ // ============================
+ // Polling & Verification
+ // ============================
+
  /**
   * Poll and commit messages with a default 1s poll timeout. After reaching expectedRows,
   * continues polling for 3 consecutive empty rounds to verify no extra data arrives.
   */
  private static PollResult pollUntilComplete(
      ISubscriptionTablePullConsumer consumer, int expectedRows, int maxPollAttempts) {
    return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true);
  }
+
+ /**
+ * Poll until we accumulate the expected number of rows, then verify no extra data arrives.
+ *
+ * After reaching expectedRows, continues polling until 5 consecutive empty polls confirm
+ * quiescence. Any extra rows polled are included in the count (will break assertEquals).
+ *
+ * @param commitMessages if false, messages are NOT committed
+ */
+ private static PollResult pollUntilComplete(
+ ISubscriptionTablePullConsumer consumer,
+ int expectedRows,
+ int maxPollAttempts,
+ long pollTimeoutMs,
+ boolean commitMessages) {
+ PollResult result = new PollResult();
+ int consecutiveEmpty = 0;
+
+ for (int attempt = 1; attempt <= maxPollAttempts; attempt++) {
+ List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs));
+
+ if (messages.isEmpty()) {
+ consecutiveEmpty++;
+ // Normal completion: reached expected rows and verified quiescence
+ if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) {
+ System.out.println(
+ " Verified: "
+ + consecutiveEmpty
+ + " consecutive empty polls after "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Stuck: have data but cannot reach expected count
+ if (consecutiveEmpty >= 5 && result.totalRows > 0) {
+ System.out.println(
+ " Stuck: "
+ + consecutiveEmpty
+ + " consecutive empty polls at "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Never received anything
+ if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) {
+ System.out.println(" No data received after " + consecutiveEmpty + " polls");
+ break;
+ }
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException ignored) {
+ }
+ continue;
+ }
+
+ consecutiveEmpty = 0;
+
+ for (SubscriptionMessage message : messages) {
+ for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) {
+ String tableName = dataSet.getTableName();
+ String databaseName = dataSet.getDatabaseName();
+ List columnNames = dataSet.getColumnNames();
+
+ while (dataSet.hasNext()) {
+ org.apache.tsfile.read.common.RowRecord record = dataSet.next();
+ result.totalRows++;
+ if (tableName != null) {
+ result.rowsPerTable.merge(tableName, 1, Integer::sum);
+ }
+ if (databaseName != null) {
+ result.rowsPerDatabase.merge(databaseName, 1, Integer::sum);
+ }
+ for (int i = 0; i < columnNames.size(); i++) {
+ result.seenColumns.add(columnNames.get(i));
+ }
+ if (result.totalRows <= 5) {
+ System.out.println(
+ " Row: time="
+ + record.getTimestamp()
+ + ", values="
+ + record.getFields()
+ + ", table="
+ + tableName
+ + ", database="
+ + databaseName);
+ }
+ }
+ }
+ if (commitMessages) {
+ consumer.commitSync(message);
+ }
+ }
+
+ System.out.println(
+ " Poll attempt "
+ + attempt
+ + ": totalRows="
+ + result.totalRows
+ + " / expected="
+ + expectedRows);
+
+ // Stop immediately if we exceeded the expected row count
+ if (expectedRows > 0 && result.totalRows > expectedRows) {
+ System.out.println(
+ " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows);
+ break;
+ }
+ }
+
+ return result;
+ }
+
+ // ============================
+ // Cleanup
+ // ============================
+
+ /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */
+ private static void cleanup(
+ ISubscriptionTablePullConsumer consumer, String topicName, String database) {
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName);
+ deleteDatabase(database);
+ }
+
+ /** Clean up with multiple databases. */
+ private static void cleanup(
+ ISubscriptionTablePullConsumer consumer, String topicName, String... databases) {
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName);
+ for (String db : databases) {
+ deleteDatabase(db);
+ }
+ }
+
+ // ============================
+ // Result & Assertions
+ // ============================
+
+ static class PollResult {
+ int totalRows = 0;
+ Map rowsPerTable = new HashMap<>();
+ Map rowsPerDatabase = new HashMap<>();
+ Set seenColumns = new HashSet<>();
+
+ @Override
+ public String toString() {
+ return "PollResult{totalRows="
+ + totalRows
+ + ", rowsPerTable="
+ + rowsPerTable
+ + ", rowsPerDatabase="
+ + rowsPerDatabase
+ + ", seenColumns="
+ + seenColumns
+ + "}";
+ }
+ }
+
+ private static void assertEquals(String msg, int expected, int actual) {
+ if (expected != actual) {
+ throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual);
+ }
+ }
+
+ private static void assertTrue(String msg, boolean condition) {
+ if (!condition) {
+ throw new AssertionError(msg);
+ }
+ }
+
+ private static void assertAtLeast(String msg, int min, int actual) {
+ if (actual < min) {
+ throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual);
+ }
+ }
+
+ // ============================
+ // Test 1: Basic Data Delivery
+ // ============================
  /**
   * Verifies the basic consensus subscription flow with table model: write before subscribe (not
   * received), write after subscribe (received), and no extra data beyond expectation.
   */
  private static void testBasicDataDelivery() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      // Step 1: Write initial data to create DataRegion
      System.out.println(" Step 1: Writing initial data (should NOT be received)");
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(
            session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD, s2 DOUBLE FIELD");
        session.executeNonQueryStatement("USE " + database);
        for (int i = 0; i < 50; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)",
                  i * 10, i * 1.5, i));
        }
        // Seal the pre-subscribe rows; a live-mode topic should not deliver them.
        session.executeNonQueryStatement("flush");
      }
      // NOTE(review): fixed sleeps throughout appear to be empirically chosen settle
      // delays for the pipeline — confirm against CI flakiness before tuning.
      Thread.sleep(2000);

      // Step 2: Create topic and subscribe
      System.out.println(" Step 2: Creating topic and subscribing");
      createTopicTable(topicName, database, ".*");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Step 3: Write new data AFTER subscription
      System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database);
        for (int i = 100; i < 200; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)",
                  i * 10, i * 1.5, i));
        }
      }
      Thread.sleep(2000);

      // Step 4: Poll and verify exact count
      System.out.println(" Step 4: Polling...");
      PollResult result = pollUntilComplete(consumer, 100, 100);
      System.out.println(" Result: " + result);

      // Exact count: the 50 pre-subscribe rows must NOT be included.
      assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 2: Multiple Data Types
+ // ============================
  /**
   * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using
   * separate INSERT statements per type (one field per INSERT), and verifies all types are
   * delivered.
   */
  private static void testMultipleDataTypes() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(
            session,
            database,
            "t1",
            "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
                + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
                + "s_text TEXT FIELD");
        session.executeNonQueryStatement("USE " + database);
        // Write initial row to create DataRegion
        session.executeNonQueryStatement(
            "INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopicTable(topicName, database, ".*");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing data with 6 data types x 20 rows each");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database);
        for (int i = 1; i <= 20; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_int64, time) VALUES ('d1', %d, %d)",
                  (long) i * 100000L, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_float, time) VALUES ('d1', %f, %d)", i * 1.1f, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_double, time) VALUES ('d1', %f, %d)", i * 2.2, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_bool, time) VALUES ('d1', %s, %d)",
                  i % 2 == 0 ? "true" : "false", i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling...");
      PollResult result = pollUntilComplete(consumer, 120, 120);
      System.out.println(" Result: " + result);

      // Lenient lower bound: 120 single-field INSERTs share 20 timestamps, so rows may
      // presumably be merged downstream — TODO confirm exact delivery semantics.
      assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows);
      System.out.println(" Seen columns: " + result.seenColumns);
      assertTrue(
          "Expected multiple column types in result, got: " + result.seenColumns,
          result.seenColumns.size() > 1);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 3: Table-Level Filtering
+ // ============================
  /**
   * Creates a topic that only matches table "t1" via TABLE_KEY. Verifies that data written to t2 is
   * NOT delivered.
   */
  private static void testTableLevelFiltering() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
        session.executeNonQueryStatement("USE " + database);
        session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
        // Seed both tables so their regions exist before the topic is created.
        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic matches only table t1
      createTopicTable(topicName, database, "t1");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to both t1 and t2 (topic filter: t1 only)");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database);
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
          session.executeNonQueryStatement(
              String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
        }
      }
      Thread.sleep(2000);

      // maxPollAttempts of 60 > expected 50 leaves headroom to detect leaked t2 rows.
      System.out.println(" Polling (expecting only t1 data)...");
      PollResult result = pollUntilComplete(consumer, 50, 60);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 50 rows from t1 only", 50, result.totalRows);
      if (!result.rowsPerTable.isEmpty()) {
        Integer t2Rows = result.rowsPerTable.get("t2");
        assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0);
        Integer t1Rows = result.rowsPerTable.get("t1");
        assertAtLeast("Expected t1 rows", 1, t1Rows != null ? t1Rows : 0);
        System.out.println(
            " Table filtering verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows");
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 4: Database-Level Filtering
+ // ============================
  /**
   * Creates a topic that only matches database db1 via DATABASE_KEY. Verifies that data written to
   * db2 is NOT delivered.
   */
  private static void testDatabaseLevelFiltering() throws Exception {
    String database1 = nextDatabase();
    // Second database shares the counter suffix so cleanup stays unambiguous.
    String database2 = database1 + "_other";
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
        createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
        session.executeNonQueryStatement("USE " + database1);
        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("USE " + database2);
        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic matches only database1
      createTopicTable(topicName, database1, ".*");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(
          " Writing to both "
              + database1
              + " and "
              + database2
              + " (topic filter: "
              + database1
              + " only)");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database1);
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
        }
        session.executeNonQueryStatement("USE " + database2);
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting only " + database1 + " data)...");
      PollResult result = pollUntilComplete(consumer, 50, 60);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 50 rows from " + database1 + " only", 50, result.totalRows);
      if (!result.rowsPerDatabase.isEmpty()) {
        Integer db2Rows = result.rowsPerDatabase.get(database2);
        assertTrue(
            "Expected NO rows from " + database2 + ", but got " + db2Rows,
            db2Rows == null || db2Rows == 0);
        Integer db1Rows = result.rowsPerDatabase.get(database1);
        assertAtLeast("Expected " + database1 + " rows", 1, db1Rows != null ? db1Rows : 0);
        System.out.println(
            " Database filtering verified: "
                + database1
                + "="
                + db1Rows
                + " rows, "
                + database2
                + "="
                + db2Rows
                + " rows");
      }
    } finally {
      cleanup(consumer, topicName, database1, database2);
    }
  }
+
+ // ============================
+ // Test 5: Subscribe Before Region Creation
+ // ============================
  /**
   * Subscribe BEFORE the database/region exists, then create database and write. Tests the
   * IoTConsensus.onNewPeerCreated auto-binding path with table model.
   */
  private static void testSubscribeBeforeRegion() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      System.out.println(" Step 1: Creating topic BEFORE database exists");
      createTopicTable(topicName, database, ".*");
      Thread.sleep(1000);

      System.out.println(" Step 2: Subscribing (no DataRegion exists yet)");
      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Step 3: Creating database, table and writing data (100 rows)");
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
        session.executeNonQueryStatement("USE " + database);
        for (int i = 0; i < 100; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
        }
      }
      // Longer settle time than other tests: the new region must also bind to the topic.
      Thread.sleep(5000);

      System.out.println(" Step 4: Polling (auto-binding should have picked up new region)...");
      PollResult result = pollUntilComplete(consumer, 100, 100);
      System.out.println(" Result: " + result);

      // Diagnostics only; the binding/write race makes the exact count nondeterministic,
      // so the hard assertion below is intentionally just "some rows arrived".
      if (result.totalRows >= 100) {
        System.out.println(" Auto-binding works! All " + result.totalRows + " rows received.");
      } else if (result.totalRows > 0) {
        System.out.println(
            " Partial: " + result.totalRows + "/100 rows. First writes may precede binding.");
      } else {
        System.out.println(" No data received. Check logs for auto-binding messages.");
      }
      assertAtLeast(
          "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 6: Multiple Tables Aggregation
+ // ============================
  /** Writes to t1, t2, t3 and verifies all are received via a broad topic TABLE_KEY. */
  private static void testMultipleTablesAggregation() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
        session.executeNonQueryStatement("USE " + database);
        session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
        session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)");
        // Seed every table so all regions exist before subscription.
        session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // ".*" matches all tables in the database.
      createTopicTable(topicName, database, ".*");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to 3 tables (t1, t2, t3), 30 rows each");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database);
        for (int i = 100; i < 130; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
          session.executeNonQueryStatement(
              String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
          session.executeNonQueryStatement(
              String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting 90 total from 3 tables)...");
      PollResult result = pollUntilComplete(consumer, 90, 100);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 90 rows total (30 per table)", 90, result.totalRows);
      if (!result.rowsPerTable.isEmpty()) {
        System.out.println(" Rows per table: " + result.rowsPerTable);
        for (String tbl : new String[] {"t1", "t2", "t3"}) {
          Integer tblRows = result.rowsPerTable.get(tbl);
          assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0);
        }
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 7: Multi Column Types (Table Model Equivalent of Aligned Timeseries)
+ // ============================
  /**
   * Creates a table with 6 different FIELD types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and
   * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are
   * delivered correctly. This is the table model equivalent of the aligned timeseries test.
   */
  private static void testMultiColumnTypes() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    ISubscriptionTablePullConsumer consumer = null;

    try {
      // Create table with multiple field types
      try (ITableSession session = openTableSession()) {
        createDatabaseAndTable(
            session,
            database,
            "t1",
            "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
                + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
                + "s_text TEXT FIELD");
        session.executeNonQueryStatement("USE " + database);
        // Write initial row to force DataRegion creation
        session.executeNonQueryStatement(
            "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
                + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopicTable(topicName, database, ".*");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Write 50 rows, each with all 6 data types in a single INSERT
      System.out.println(" Writing 50 rows with 6 data types per row");
      try (ITableSession session = openTableSession()) {
        session.executeNonQueryStatement("USE " + database);
        for (int i = 1; i <= 50; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)"
                      + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)",
                  i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling...");
      PollResult result = pollUntilComplete(consumer, 50, 70);
      System.out.println(" Result: " + result);

      // Full-row INSERTs share one timestamp each, so the count here is exact (contrast
      // with testMultipleDataTypes, which writes one field per INSERT).
      assertEquals("Expected exactly 50 rows with all field types", 50, result.totalRows);
      // Verify we see columns for multiple data types
      System.out.println(" Seen columns: " + result.seenColumns);
      assertAtLeast(
          "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 8: Poll Without Commit (Re-delivery)
+ // ============================
+ /**
+ * Tests at-least-once delivery with a mixed commit/no-commit pattern.
+ *
+ * Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we
+ * track committed ROWS (not events). The state machine alternates:
+ *
+ *
+ * Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next
+ * poll verifies the EXACT SAME timestamps are re-delivered, then commit.
+ * Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data.
+ *
+ *
+ * This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal
+ * commit path in an interleaved fashion.
+ */
+ private static void testPollWithoutCommit() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+
+ try {
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Write 50 rows
+ final int totalRows = 50;
+ System.out.println(" Writing " + totalRows + " rows");
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 1; i <= totalRows; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
+ }
+ Thread.sleep(3000);
+
+ // State machine: alternate between skip-commit and direct-commit.
+ int totalRowsCommitted = 0;
+ int roundNumber = 0;
+ boolean hasPending = false;
+ List<Long> pendingTimestamps = new ArrayList<>();
+ Set<Long> allCommittedTimestamps = new HashSet<>();
+ int redeliveryCount = 0;
+
+ for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(5000));
+ if (msgs.isEmpty()) {
+ Thread.sleep(1000);
+ continue;
+ }
+
+ for (SubscriptionMessage msg : msgs) {
+ // Extract ALL timestamps from this event
+ List<Long> currentTimestamps = new ArrayList<>();
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ currentTimestamps.add(ds.next().getTimestamp());
+ }
+ }
+ assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+
+ if (hasPending) {
+ // === Re-delivery round: verify EXACT same timestamps ===
+ assertTrue(
+ "Re-delivery timestamp list mismatch: expected="
+ + pendingTimestamps
+ + ", actual="
+ + currentTimestamps,
+ currentTimestamps.equals(pendingTimestamps));
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ hasPending = false;
+ redeliveryCount++;
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] Re-delivered & committed: timestamps="
+ + currentTimestamps);
+ } else {
+ // === New event round ===
+ if (totalRowsCommitted > 0) {
+ boolean overlap = false;
+ for (Long ts : currentTimestamps) {
+ if (allCommittedTimestamps.contains(ts)) {
+ overlap = true;
+ break;
+ }
+ }
+ assertTrue(
+ "After commit, should receive different data (timestamps="
+ + currentTimestamps
+ + " overlap with committed="
+ + allCommittedTimestamps
+ + ")",
+ !overlap);
+ }
+
+ if (roundNumber % 2 == 0) {
+ pendingTimestamps = new ArrayList<>(currentTimestamps);
+ hasPending = true;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (NOT committed): timestamps="
+ + currentTimestamps);
+ } else {
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (committed directly): timestamps="
+ + currentTimestamps);
+ }
+ }
+ }
+ }
+
+ assertEquals("Should have committed all rows", totalRows, totalRowsCommitted);
+ assertTrue(
+ "Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
+ redeliveryCount > 0);
+
+ // Final poll: should be empty
+ System.out.println(" Final poll: expecting no data");
+ int extraRows = 0;
+ for (int i = 0; i < 3; i++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+ for (SubscriptionMessage msg : msgs) {
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ extraRows++;
+ }
+ }
+ }
+ }
+ assertEquals("After all committed, should receive no more data", 0, extraRows);
+
+ System.out.println(
+ " At-least-once re-delivery verified: "
+ + totalRows
+ + " rows committed with "
+ + redeliveryCount
+ + " re-delivery rounds");
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ============================
+ // Test 9: Multi Consumer Group Independent Consumption
+ // ============================
+ /**
+ * Two consumer groups subscribe to the same topic. Verifies that each group independently
+ * receives ALL data (data is not partitioned/split between groups).
+ */
+ private static void testMultiConsumerGroupIndependent() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a";
+ String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a";
+ String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b";
+ String consumerId2 = "consumer_tbl_multi_" + testCounter + "_b";
+ ISubscriptionTablePullConsumer consumer1 = null;
+ ISubscriptionTablePullConsumer consumer2 = null;
+
+ try {
+ // Create database and initial data
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ // Two consumers in different groups both subscribe to the same topic
+ consumer1 = createConsumer(consumerId1, consumerGroupId1);
+ consumer1.subscribe(topicName);
+ consumer2 = createConsumer(consumerId2, consumerGroupId2);
+ consumer2.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Write 50 rows
+ System.out.println(" Writing 50 rows");
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 1; i <= 50; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
+ }
+ Thread.sleep(2000);
+
+ // Poll from group 1
+ System.out.println(" Polling from consumer group 1...");
+ PollResult result1 = pollUntilComplete(consumer1, 50, 70);
+ System.out.println(" Group 1 result: " + result1);
+
+ // Poll from group 2
+ System.out.println(" Polling from consumer group 2...");
+ PollResult result2 = pollUntilComplete(consumer2, 50, 70);
+ System.out.println(" Group 2 result: " + result2);
+
+ // Both groups should have all 50 rows
+ assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows);
+ assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows);
+ System.out.println(
+ " Independent consumption verified: group1="
+ + result1.totalRows
+ + ", group2="
+ + result2.totalRows);
+ } finally {
+ // Clean up both consumers
+ if (consumer1 != null) {
+ try {
+ consumer1.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer1.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ if (consumer2 != null) {
+ try {
+ consumer2.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer2.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName);
+ deleteDatabase(database);
+ }
+ }
+
+ // ============================
+ // Test 10: Multi Topic Subscription
+ // ============================
+ /**
+ * One consumer subscribes to two different topics with different TABLE_KEY filters. Verifies that
+ * each topic delivers only its matching data, and no cross-contamination occurs.
+ */
+ private static void testMultiTopicSubscription() throws Exception {
+ String database = nextDatabase();
+ String topicName1 = "topic_tbl_multi_" + testCounter + "_a";
+ String topicName2 = "topic_tbl_multi_" + testCounter + "_b";
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+
+ try {
+ // Create database with two tables
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ // Topic 1: covers t1 only
+ createTopicTable(topicName1, database, "t1");
+ // Topic 2: covers t2 only
+ createTopicTable(topicName2, database, "t2");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName1, topicName2);
+ Thread.sleep(3000);
+
+ // Write 30 rows to t1 and 40 rows to t2
+ System.out.println(" Writing 30 rows to t1, 40 rows to t2");
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 1; i <= 40; i++) {
+ if (i <= 30) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
+ }
+ }
+ Thread.sleep(2000);
+
+ // Poll all data — should get t1 rows (via topic1) + t2 rows (via topic2)
+ System.out.println(" Polling (expecting 30 from t1 + 40 from t2 = 70 total)...");
+ PollResult result = pollUntilComplete(consumer, 70, 80);
+ System.out.println(" Result: " + result);
+
+ assertEquals("Expected exactly 70 rows total (30 t1 + 40 t2)", 70, result.totalRows);
+ if (!result.rowsPerTable.isEmpty()) {
+ Integer t1Rows = result.rowsPerTable.get("t1");
+ Integer t2Rows = result.rowsPerTable.get("t2");
+ assertEquals("Expected 30 rows from t1", 30, t1Rows != null ? t1Rows : 0);
+ assertEquals("Expected 40 rows from t2", 40, t2Rows != null ? t2Rows : 0);
+ System.out.println(
+ " Multi-topic isolation verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows");
+ }
+ } finally {
+ // Clean up consumer, both topics, and database
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName1, topicName2);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopicTable(topicName1);
+ dropTopicTable(topicName2);
+ deleteDatabase(database);
+ }
+ }
+
+ // ============================
+ // Test 12: Cross-Partition Multi-Write
+ // ============================
+ /**
+ * Tests that cross-partition writes via all table model write methods are correctly delivered.
+ *
+ * <p>Uses timestamps spaced >1 week apart (default partition interval = 604,800,000ms) to force
+ * cross-partition distribution. Exercises three write paths:
+ *
+ * <ul>
+ *   <li>Method 1: SQL single-row INSERT (2 rows, separate partitions)
+ *   <li>Method 2: SQL multi-row INSERT (3 rows spanning 3 partitions in one statement)
+ *   <li>Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions
+ * </ul>
+ *
+ * <p>The table has 6 FIELD columns (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) plus 1 TAG. Total
+ * expected rows: 2 + 3 + 4 = 9.
+ *
+ *
+ * <p>This test verifies that when a SQL multi-row INSERT or Tablet write spans multiple time
+ * partitions (causing the plan node to be split into sub-nodes for each partition), all sub-nodes
+ * are correctly converted by the consensus subscription pipeline.
+ */
+ private static void testCrossPartitionMultiWrite() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+
+ // Gap > default time partition interval (7 days = 604,800,000ms)
+ final long GAP = 604_800_001L;
+ final String TABLE = "t1";
+ final String SCHEMA =
+ "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
+ + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
+ + "s_text TEXT FIELD";
+
+ try {
+ // Create database and table, write init row to force DataRegion creation
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, TABLE, SCHEMA);
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ System.out.println(" Writing cross-partition data via 3 methods...");
+
+ // --- Method 1: SQL single-row INSERT (2 rows, each in its own partition) ---
+ long baseTs = 1_000_000_000L;
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ long ts1 = baseTs;
+ long ts2 = baseTs + GAP;
+ System.out.println(" Method 1: SQL single-row x2 (ts=" + ts1 + ", " + ts2 + ")");
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'sql_single_1', %d)",
+ ts1));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'sql_single_2', %d)",
+ ts2));
+ }
+
+ // --- Method 2: SQL multi-row INSERT (3 rows spanning 3 different partitions) ---
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ long t1 = baseTs + GAP * 2;
+ long t2 = baseTs + GAP * 3;
+ long t3 = baseTs + GAP * 4;
+ System.out.println(
+ " Method 2: SQL multi-row x3 (ts=" + t1 + ", " + t2 + ", " + t3 + ")");
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'sql_multi_1', %d), "
+ + "('d1', 4, 400, 4.4, 4.44, false, 'sql_multi_2', %d), "
+ + "('d1', 5, 500, 5.5, 5.55, true, 'sql_multi_3', %d)",
+ t1, t2, t3));
+ }
+
+ // --- Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions ---
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+
+ List<IMeasurementSchema> schemaList = new ArrayList<>();
+ schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING));
+ schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING));
+
+ List<ColumnCategory> categories =
+ java.util.Arrays.asList(
+ ColumnCategory.TAG,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD);
+
+ Tablet tablet =
+ new Tablet(
+ TABLE,
+ IMeasurementSchema.getMeasurementNameList(schemaList),
+ IMeasurementSchema.getDataTypeList(schemaList),
+ categories,
+ 10);
+
+ for (int i = 0; i < 4; i++) {
+ int row = tablet.getRowSize();
+ long ts = baseTs + GAP * (5 + i); // partitions 5, 6, 7, 8
+ tablet.addTimestamp(row, ts);
+ tablet.addValue("tag1", row, "d1");
+ tablet.addValue("s_int32", row, 6 + i);
+ tablet.addValue("s_int64", row, (long) (600 + i * 100));
+ tablet.addValue("s_float", row, (6 + i) * 1.1f);
+ tablet.addValue("s_double", row, (6 + i) * 2.22);
+ tablet.addValue("s_bool", row, i % 2 == 0);
+ tablet.addValue("s_text", row, "tablet_" + (i + 1));
+ }
+ System.out.println(
+ " Method 3: Tablet x4 (ts=" + (baseTs + GAP * 5) + ".." + (baseTs + GAP * 8) + ")");
+ session.insert(tablet);
+ }
+
+ Thread.sleep(2000);
+
+ // Poll — expect 9 rows total (2 + 3 + 4)
+ final int expectedRows = 9;
+ System.out.println(" Polling (expecting " + expectedRows + " rows)...");
+ PollResult result = pollUntilComplete(consumer, expectedRows, 80);
+ System.out.println(" Result: " + result);
+
+ assertEquals(
+ "Expected exactly " + expectedRows + " cross-partition rows",
+ expectedRows,
+ result.totalRows);
+ // Verify we see all 6 FIELD columns plus tag
+ assertAtLeast(
+ "Expected at least 6 data columns in cross-partition result",
+ 6,
+ result.seenColumns.size());
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ============================
+ // Test 11: Flush Data Delivery
+ // ============================
+ /**
+ * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable
+ * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps
+ * entries available until committed by the subscription consumer.
+ */
+ private static void testFlushDataDelivery() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ ISubscriptionTablePullConsumer consumer = null;
+
+ try {
+ try (ITableSession session = openTableSession()) {
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
+ session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopicTable(topicName, database, ".*");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Write 50 rows, then flush before polling
+ System.out.println(" Writing 50 rows then flushing");
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 1; i <= 50; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
+ System.out.println(" Flushing...");
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ // Poll — all 50 rows should be delivered despite flush
+ System.out.println(" Polling after flush...");
+ PollResult result = pollUntilComplete(consumer, 50, 70);
+ System.out.println(" Result: " + result);
+ assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows);
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+}
diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
new file mode 100644
index 0000000000000..1ab7a910c0324
--- /dev/null
+++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
@@ -0,0 +1,1460 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb;
+
+import org.apache.iotdb.isession.ISession;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
+import org.apache.iotdb.session.Session;
+import org.apache.iotdb.session.subscription.SubscriptionTreeSession;
+import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer;
+import org.apache.iotdb.session.subscription.payload.SubscriptionMessage;
+import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet;
+
+import org.apache.tsfile.common.conf.TSFileConfig;
+import org.apache.tsfile.enums.TSDataType;
+import org.apache.tsfile.utils.Binary;
+import org.apache.tsfile.write.record.Tablet;
+import org.apache.tsfile.write.schema.IMeasurementSchema;
+import org.apache.tsfile.write.schema.MeasurementSchema;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/** TODO: move these manual tests into ITs */
+public class ConsensusSubscriptionTest {
+
+ private static final String HOST = "127.0.0.1";
+ private static final int PORT = 6667;
+ private static final String USER = "root";
+ private static final String PASSWORD = "root";
+
+ private static int testCounter = 0;
+ private static int passed = 0;
+ private static int failed = 0;
+ private static final List<String> failedTests = new ArrayList<>();
+
+ public static void main(String[] args) throws Exception {
+ System.out.println("=== Consensus-Based Subscription Test Suite ===\n");
+
+ String targetTest = args.length > 0 ? args[0] : null;
+
+ if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) {
+ runTest("testBasicDataDelivery", ConsensusSubscriptionTest::testBasicDataDelivery);
+ }
+ if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) {
+ runTest("testMultipleDataTypes", ConsensusSubscriptionTest::testMultipleDataTypes);
+ }
+ if (targetTest == null || "testDeviceLevelFiltering".equals(targetTest)) {
+ runTest("testDeviceLevelFiltering", ConsensusSubscriptionTest::testDeviceLevelFiltering);
+ }
+ if (targetTest == null || "testTimeseriesLevelFiltering".equals(targetTest)) {
+ runTest(
+ "testTimeseriesLevelFiltering", ConsensusSubscriptionTest::testTimeseriesLevelFiltering);
+ }
+ if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
+ runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion);
+ }
+ if (targetTest == null || "testMultipleDevicesAggregation".equals(targetTest)) {
+ runTest(
+ "testMultipleDevicesAggregation",
+ ConsensusSubscriptionTest::testMultipleDevicesAggregation);
+ }
+ if (targetTest == null || "testAlignedTimeseries".equals(targetTest)) {
+ runTest("testAlignedTimeseries", ConsensusSubscriptionTest::testAlignedTimeseries);
+ }
+ if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) {
+ runTest("testPollWithoutCommit", ConsensusSubscriptionTest::testPollWithoutCommit);
+ }
+ if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) {
+ runTest(
+ "testMultiConsumerGroupIndependent",
+ ConsensusSubscriptionTest::testMultiConsumerGroupIndependent);
+ }
+ if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) {
+ runTest("testMultiTopicSubscription", ConsensusSubscriptionTest::testMultiTopicSubscription);
+ }
+ if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) {
+ runTest("testFlushDataDelivery", ConsensusSubscriptionTest::testFlushDataDelivery);
+ }
+ if (targetTest == null || "testCrossPartitionAligned".equals(targetTest)) {
+ runTest("testCrossPartitionAligned", ConsensusSubscriptionTest::testCrossPartitionAligned);
+ }
+
+ // Summary
+ System.out.println("\n=== Test Suite Summary ===");
+ System.out.println("Passed: " + passed);
+ System.out.println("Failed: " + failed);
+ if (!failedTests.isEmpty()) {
+ System.out.println("Failed tests: " + failedTests);
+ }
+ System.out.println("=== Done ===");
+ }
+
+ // ============================
+ // Test Infrastructure
+ // ============================
+
+ @FunctionalInterface
+ interface TestMethod {
+ void run() throws Exception;
+ }
+
+ private static void runTest(String name, TestMethod test) {
+ System.out.println("\n" + "=================================================================");
+ System.out.println("Running: " + name);
+ System.out.println("=================================================================");
+ try {
+ test.run();
+ passed++;
+ System.out.println(">>> PASSED: " + name);
+ } catch (AssertionError e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> FAILED: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ } catch (Exception e) {
+ failed++;
+ failedTests.add(name);
+ System.out.println(">>> ERROR: " + name + " - " + e.getMessage());
+ e.printStackTrace(System.out);
+ }
+ }
+
+ private static String nextDatabase() {
+ testCounter++;
+ return "root.csub_test_" + testCounter;
+ }
+
+ private static String nextTopic() {
+ return "topic_csub_" + testCounter;
+ }
+
+ private static String nextConsumerGroup() {
+ return "cg_csub_" + testCounter;
+ }
+
+ private static String nextConsumerId() {
+ return "consumer_csub_" + testCounter;
+ }
+
+ private static ISession openSession() throws Exception {
+ ISession session =
+ new Session.Builder().host(HOST).port(PORT).username(USER).password(PASSWORD).build();
+ session.open();
+ return session;
+ }
+
+ private static void createDatabase(ISession session, String database) throws Exception {
+ try {
+ session.executeNonQueryStatement("CREATE DATABASE " + database);
+ } catch (Exception e) {
+ // ignore if already exists
+ }
+ }
+
+ private static void deleteDatabase(String database) {
+ try (ISession session = openSession()) {
+ session.executeNonQueryStatement("DELETE DATABASE " + database);
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+
+ private static void dropTopic(String topicName) {
+ try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) {
+ subSession.open();
+ subSession.dropTopic(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+
+ private static void createTopic(String topicName, String path) throws Exception {
+ try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) {
+ subSession.open();
+ try {
+ subSession.dropTopic(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+
+ Properties topicConfig = new Properties();
+ topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE);
+ topicConfig.put(
+ TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE);
+ topicConfig.put(TopicConstant.PATH_KEY, path);
+ subSession.createTopic(topicName, topicConfig);
+ System.out.println(" Created topic: " + topicName + " (path=" + path + ")");
+ }
+ }
+
+ private static SubscriptionTreePullConsumer createConsumer(
+ String consumerId, String consumerGroupId) throws Exception {
+ SubscriptionTreePullConsumer consumer =
+ new SubscriptionTreePullConsumer.Builder()
+ .host(HOST)
+ .port(PORT)
+ .consumerId(consumerId)
+ .consumerGroupId(consumerGroupId)
+ .autoCommit(false)
+ .buildPullConsumer();
+ consumer.open();
+ return consumer;
+ }
+
+ // ============================
+ // Polling & Verification
+ // ============================
+
+ /**
+ * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive
+ * empty rounds to verify no extra data arrives.
+ */
+ private static PollResult pollUntilComplete(
+ SubscriptionTreePullConsumer consumer, int expectedRows, int maxPollAttempts) {
+ return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true);
+ }
+
+ private static PollResult pollUntilComplete(
+ SubscriptionTreePullConsumer consumer,
+ int expectedRows,
+ int maxPollAttempts,
+ long pollTimeoutMs,
+ boolean commitMessages) {
+ PollResult result = new PollResult();
+ int consecutiveEmpty = 0;
+
+ for (int attempt = 1; attempt <= maxPollAttempts; attempt++) {
+ List<SubscriptionMessage> messages = consumer.poll(Duration.ofMillis(pollTimeoutMs));
+
+ if (messages.isEmpty()) {
+ consecutiveEmpty++;
+ // Normal completion: reached expected rows and verified quiescence
+ if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) {
+ System.out.println(
+ " Verified: "
+ + consecutiveEmpty
+ + " consecutive empty polls after "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Stuck: have data but cannot reach expected count
+ if (consecutiveEmpty >= 5 && result.totalRows > 0) {
+ System.out.println(
+ " Stuck: "
+ + consecutiveEmpty
+ + " consecutive empty polls at "
+ + result.totalRows
+ + " rows (expected "
+ + expectedRows
+ + ")");
+ break;
+ }
+ // Never received anything
+ if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) {
+ System.out.println(" No data received after " + consecutiveEmpty + " polls");
+ break;
+ }
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException ignored) {
+ }
+ continue;
+ }
+
+ consecutiveEmpty = 0;
+
+ for (SubscriptionMessage message : messages) {
+ for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) {
+ String device = null;
+ List<String> columnNames = dataSet.getColumnNames();
+ if (columnNames.size() > 1) {
+ String fullPath = columnNames.get(1);
+ int lastDot = fullPath.lastIndexOf('.');
+ device = lastDot > 0 ? fullPath.substring(0, lastDot) : fullPath;
+ }
+
+ while (dataSet.hasNext()) {
+ org.apache.tsfile.read.common.RowRecord record = dataSet.next();
+ result.totalRows++;
+ if (device != null) {
+ result.rowsPerDevice.merge(device, 1, Integer::sum);
+ }
+ for (int i = 1; i < columnNames.size(); i++) {
+ result.seenColumns.add(columnNames.get(i));
+ }
+ if (result.totalRows <= 5) {
+ System.out.println(
+ " Row: time="
+ + record.getTimestamp()
+ + ", values="
+ + record.getFields()
+ + ", device="
+ + device);
+ }
+ }
+ }
+ if (commitMessages) {
+ consumer.commitSync(message);
+ }
+ }
+
+ System.out.println(
+ " Poll attempt "
+ + attempt
+ + ": totalRows="
+ + result.totalRows
+ + " / expected="
+ + expectedRows);
+
+ // Stop immediately if we exceeded the expected row count
+ if (expectedRows > 0 && result.totalRows > expectedRows) {
+ System.out.println(
+ " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows);
+ break;
+ }
+ }
+
+ return result;
+ }
+
+ // ============================
+ // Cleanup
+ // ============================
+
+ /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */
+ private static void cleanup(
+ SubscriptionTreePullConsumer consumer, String topicName, String database) {
+ if (consumer != null) {
+ try {
+ consumer.unsubscribe(topicName);
+ } catch (Exception e) {
+ // ignore
+ }
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ // ignore
+ }
+ }
+ dropTopic(topicName);
+ deleteDatabase(database);
+ }
+
+ // ============================
+ // Result & Assertions
+ // ============================
+
+ static class PollResult {
+ int totalRows = 0;
+ Map<String, Integer> rowsPerDevice = new HashMap<>();
+ Set<String> seenColumns = new HashSet<>();
+
+ @Override
+ public String toString() {
+ return "PollResult{totalRows="
+ + totalRows
+ + ", rowsPerDevice="
+ + rowsPerDevice
+ + ", seenColumns="
+ + seenColumns
+ + "}";
+ }
+ }
+
+ private static void assertEquals(String msg, int expected, int actual) {
+ if (expected != actual) {
+ throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual);
+ }
+ }
+
+ private static void assertTrue(String msg, boolean condition) {
+ if (!condition) {
+ throw new AssertionError(msg);
+ }
+ }
+
+ private static void assertAtLeast(String msg, int min, int actual) {
+ if (actual < min) {
+ throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual);
+ }
+ }
+
+ // ============================
+ // Test 1: Basic Data Delivery
+ // ============================
  /**
   * Verifies the basic consensus subscription flow: write before subscribe (not received), write
   * after subscribe (received), and no extra data beyond expectation.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testBasicDataDelivery() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Step 1: Write initial data to create DataRegion
      System.out.println(" Step 1: Writing initial data (should NOT be received)");
      try (ISession session = openSession()) {
        createDatabase(session, database);
        for (int i = 0; i < 50; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)",
                  database, i, i * 10, i * 1.5));
        }
        session.executeNonQueryStatement("flush");
      }
      // NOTE(review): fixed sleeps throughout assume async propagation settles in time;
      // may need tuning on slow CI machines.
      Thread.sleep(2000);

      // Step 2: Create topic and subscribe
      System.out.println(" Step 2: Creating topic and subscribing");
      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Step 3: Write new data AFTER subscription — only these rows should be delivered.
      System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)");
      try (ISession session = openSession()) {
        for (int i = 100; i < 200; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)",
                  database, i, i * 10, i * 1.5));
        }
      }
      Thread.sleep(2000);

      // Step 4: Poll and verify exact count (also verifies no extra data)
      System.out.println(" Step 4: Polling...");
      PollResult result = pollUntilComplete(consumer, 100, 100);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 2: Multiple Data Types (Non-Aligned)
+ // ============================
  /**
   * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using
   * separate INSERT statements per type (non-aligned), and verifies all types are delivered.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testMultipleDataTypes() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Seed one row so the DataRegion exists before the topic is created.
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing data with 6 data types x 20 rows each");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 20; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_int64) VALUES (%d, %d)",
                  database, i, (long) i * 100000L));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_float) VALUES (%d, %f)", database, i, i * 1.1f));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_double) VALUES (%d, %f)", database, i, i * 2.2));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_bool) VALUES (%d, %s)",
                  database, i, i % 2 == 0 ? "true" : "false"));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling...");
      PollResult result = pollUntilComplete(consumer, 120, 120);
      System.out.println(" Result: " + result);

      // NOTE(review): 120 rows are written but only >= 20 are required here —
      // confirm this leniency is intentional (rows of one INSERT may coalesce per event).
      assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows);
      System.out.println(" Seen columns: " + result.seenColumns);
      assertTrue(
          "Expected multiple column types in result, got: " + result.seenColumns,
          result.seenColumns.size() > 1);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 3: Device-Level Filtering
+ // ============================
  /**
   * Creates a topic that only matches root.db.d1.** and verifies that data written to d2 is NOT
   * delivered.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testDeviceLevelFiltering() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Seed both devices so the region and both device schemas exist up front.
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic path restricts delivery to device d1 only.
      String filterPath = database + ".d1.**";
      createTopic(topicName, filterPath);
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to both d1 and d2 (topic filter: d1.** only)");
      try (ISession session = openSession()) {
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting only d1 data)...");
      PollResult result = pollUntilComplete(consumer, 50, 60);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 50 rows from d1 only", 50, result.totalRows);
      // Per-device checks only when the poll helper recorded device attribution.
      if (!result.rowsPerDevice.isEmpty()) {
        Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
        assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0);
        Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
        assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0);
        System.out.println(
            " Device filtering verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows");
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 4: Timeseries-Level Filtering
+ // ============================
  /**
   * Creates a topic matching root.db.d1.s1 only. Tests whether the converter filters at measurement
   * level. Lenient: if both s1 and s2 arrive, reports device-level-only filtering.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testTimeseriesLevelFiltering() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Measurement-level filter: a single timeseries, no wildcard.
      String filterPath = database + ".d1.s1";
      createTopic(topicName, filterPath);
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to d1.s1 and d1.s2 (topic filter: d1.s1 only)");
      try (ISession session = openSession()) {
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)",
                  database, i, i * 10, i * 20));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting only s1 data)...");
      PollResult result = pollUntilComplete(consumer, 50, 60);
      System.out.println(" Result: " + result);

      System.out.println(" Seen columns: " + result.seenColumns);
      // Branch on observed behavior rather than asserting one filtering granularity.
      boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2"));
      if (hasS2) {
        System.out.println(
            " INFO: Both s1 and s2 received — converter uses device-level filtering only.");
        // NOTE(review): message says "some rows" but the check requires >= 50.
        assertAtLeast("Should have received some rows", 50, result.totalRows);
      } else {
        System.out.println(" Timeseries-level filtering verified: only s1 data received");
        assertEquals("Expected exactly 50 rows from s1 only", 50, result.totalRows);
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 5: Subscribe Before Region Creation
+ // ============================
  /**
   * Subscribe BEFORE the database/region exists, then create database and write. Tests the
   * IoTConsensus.onNewPeerCreated auto-binding path.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testSubscribeBeforeRegion() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      System.out.println(" Step 1: Creating topic BEFORE database exists");
      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      System.out.println(" Step 2: Subscribing (no DataRegion exists yet)");
      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Step 3: Creating database and writing data (100 rows)");
      try (ISession session = openSession()) {
        createDatabase(session, database);
        for (int i = 0; i < 100; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
        }
      }
      // Longer wait than other tests: region creation + auto-binding must both complete.
      Thread.sleep(5000);

      System.out.println(" Step 4: Polling (auto-binding should have picked up new region)...");
      PollResult result = pollUntilComplete(consumer, 100, 100);
      System.out.println(" Result: " + result);

      // Diagnostic reporting only; the hard assertion below is deliberately weak (>= 1 row)
      // because the earliest writes may race the binding.
      if (result.totalRows >= 100) {
        System.out.println(" Auto-binding works! All " + result.totalRows + " rows received.");
      } else if (result.totalRows > 0) {
        System.out.println(
            " Partial: " + result.totalRows + "/100 rows. First writes may precede binding.");
      } else {
        System.out.println(" No data received. Check logs for auto-binding messages.");
      }
      assertAtLeast(
          "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 6: Multiple Devices Aggregation
+ // ============================
  /**
   * Writes to d1, d2, d3 and verifies all are received via a broad topic path.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testMultipleDevicesAggregation() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Seed all three devices so their schemas exist before subscribing.
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to 3 devices (d1, d2, d3), 30 rows each");
      try (ISession session = openSession()) {
        for (int i = 100; i < 130; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting 90 total from 3 devices)...");
      PollResult result = pollUntilComplete(consumer, 90, 100);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 90 rows total (30 per device)", 90, result.totalRows);
      // Per-device attribution is only checked when the poll helper recorded it.
      if (!result.rowsPerDevice.isEmpty()) {
        System.out.println(" Rows per device: " + result.rowsPerDevice);
        for (String dev : new String[] {"d1", "d2", "d3"}) {
          Integer devRows = result.rowsPerDevice.get(database + "." + dev);
          assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0);
        }
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 7: Aligned Timeseries
+ // ============================
  /**
   * Creates aligned timeseries with 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and
   * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are
   * delivered correctly.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testAlignedTimeseries() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Create aligned timeseries with multiple data types
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format(
                "CREATE ALIGNED TIMESERIES %s.d_aligned"
                    + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
                    + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
                database));
        // Write initial row to force DataRegion creation
        session.executeNonQueryStatement(
            String.format(
                "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
                    + " s_double, s_bool, s_text)"
                    + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
                database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Write 50 aligned rows, each with all 6 data types in a single INSERT
      System.out.println(" Writing 50 aligned rows with 6 data types per row");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 50; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
                      + " s_double, s_bool, s_text)"
                      + " VALUES (%d, %d, %d, %f, %f, %s, 'text_%d')",
                  database,
                  i,
                  i,
                  (long) i * 100000L,
                  i * 1.1f,
                  i * 2.2,
                  i % 2 == 0 ? "true" : "false",
                  i));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling...");
      PollResult result = pollUntilComplete(consumer, 50, 70);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 50 aligned rows", 50, result.totalRows);
      // Verify we see columns for multiple data types
      System.out.println(" Seen columns: " + result.seenColumns);
      assertAtLeast(
          "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 8: Poll Without Commit (Re-delivery)
+ // ============================
+ /**
+ * Tests at-least-once delivery with a mixed commit/no-commit pattern.
+ *
+ * Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we
+ * track committed ROWS (not events). The state machine alternates:
+ *
+ *
+ * Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next
+ * poll verifies the EXACT SAME timestamps are re-delivered, then commit.
+ * Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data.
+ *
+ *
+ * This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal
+ * commit path in an interleaved fashion.
+ */
+ private static void testPollWithoutCommit() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ try {
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Write 50 rows (may be batched into fewer events by the prefetching thread)
+ final int totalRows = 50;
+ System.out.println(" Writing " + totalRows + " rows");
+ try (ISession session = openSession()) {
+ for (int i = 1; i <= totalRows; i++) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
+ }
+ Thread.sleep(3000);
+
+ // State machine: alternate between skip-commit and direct-commit.
+ // Track committed ROWS (not events) because batching is unpredictable.
+ int totalRowsCommitted = 0;
+ int roundNumber = 0; // counts distinct events seen (used for alternation)
+ boolean hasPending = false;
+ List pendingTimestamps = new ArrayList<>(); // timestamps from the uncommitted event
+ Set allCommittedTimestamps = new HashSet<>(); // all timestamps ever committed
+ int redeliveryCount = 0;
+
+ for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+ List msgs = consumer.poll(Duration.ofMillis(5000));
+ if (msgs.isEmpty()) {
+ Thread.sleep(1000);
+ continue;
+ }
+
+ for (SubscriptionMessage msg : msgs) {
+ // Extract ALL timestamps from this event (may contain multiple rows)
+ List currentTimestamps = new ArrayList<>();
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ currentTimestamps.add(ds.next().getTimestamp());
+ }
+ }
+ assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+
+ if (hasPending) {
+ // === Re-delivery round: verify EXACT same timestamps ===
+ assertTrue(
+ "Re-delivery timestamp list mismatch: expected="
+ + pendingTimestamps
+ + ", actual="
+ + currentTimestamps,
+ currentTimestamps.equals(pendingTimestamps));
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ hasPending = false;
+ redeliveryCount++;
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] Re-delivered & committed: timestamps="
+ + currentTimestamps);
+ } else {
+ // === New event round ===
+ // After a commit, verify this is DIFFERENT data (no overlap with committed set)
+ if (totalRowsCommitted > 0) {
+ boolean overlap = false;
+ for (Long ts : currentTimestamps) {
+ if (allCommittedTimestamps.contains(ts)) {
+ overlap = true;
+ break;
+ }
+ }
+ assertTrue(
+ "After commit, should receive different data (timestamps="
+ + currentTimestamps
+ + " overlap with committed="
+ + allCommittedTimestamps
+ + ")",
+ !overlap);
+ }
+
+ // Even-numbered rounds: skip commit (test re-delivery)
+ // Odd-numbered rounds: commit directly (test normal flow)
+ if (roundNumber % 2 == 0) {
+ pendingTimestamps = new ArrayList<>(currentTimestamps);
+ hasPending = true;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (NOT committed): timestamps="
+ + currentTimestamps);
+ } else {
+ consumer.commitSync(msg);
+ totalRowsCommitted += currentTimestamps.size();
+ allCommittedTimestamps.addAll(currentTimestamps);
+ roundNumber++;
+ System.out.println(
+ " [rows="
+ + totalRowsCommitted
+ + "/"
+ + totalRows
+ + "] New event (committed directly): timestamps="
+ + currentTimestamps);
+ }
+ }
+ }
+ }
+
+ assertEquals("Should have committed all rows", totalRows, totalRowsCommitted);
+ assertTrue(
+ "Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
+ redeliveryCount > 0);
+
+ // Final poll: should be empty
+ System.out.println(" Final poll: expecting no data");
+ int extraRows = 0;
+ for (int i = 0; i < 3; i++) {
+ List msgs = consumer.poll(Duration.ofMillis(2000));
+ for (SubscriptionMessage msg : msgs) {
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ extraRows++;
+ }
+ }
+ }
+ }
+ assertEquals("After all committed, should receive no more data", 0, extraRows);
+
+ System.out.println(
+ " At-least-once re-delivery verified: "
+ + totalRows
+ + " rows committed with "
+ + redeliveryCount
+ + " re-delivery rounds");
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ // ============================
+ // Test 9: Multi Consumer Group Independent Consumption
+ // ============================
  /**
   * Two consumer groups subscribe to the same topic. Verifies that each group independently
   * receives ALL data (data is not partitioned/split between groups).
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testMultiConsumerGroupIndependent() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId1 = "cg_multi_" + testCounter + "_a";
    String consumerId1 = "consumer_multi_" + testCounter + "_a";
    String consumerGroupId2 = "cg_multi_" + testCounter + "_b";
    String consumerId2 = "consumer_multi_" + testCounter + "_b";
    SubscriptionTreePullConsumer consumer1 = null;
    SubscriptionTreePullConsumer consumer2 = null;

    try {
      // Create database and initial data
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      // Two consumers in different groups both subscribe to the same topic
      consumer1 = createConsumer(consumerId1, consumerGroupId1);
      consumer1.subscribe(topicName);
      consumer2 = createConsumer(consumerId2, consumerGroupId2);
      consumer2.subscribe(topicName);
      Thread.sleep(3000);

      // Write 50 rows
      System.out.println(" Writing 50 rows");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 50; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
        }
      }
      Thread.sleep(2000);

      // Poll from group 1
      System.out.println(" Polling from consumer group 1...");
      PollResult result1 = pollUntilComplete(consumer1, 50, 70);
      System.out.println(" Group 1 result: " + result1);

      // Poll from group 2
      System.out.println(" Polling from consumer group 2...");
      PollResult result2 = pollUntilComplete(consumer2, 50, 70);
      System.out.println(" Group 2 result: " + result2);

      // Both groups should have all 50 rows
      assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows);
      assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows);
      System.out.println(
          " Independent consumption verified: group1="
              + result1.totalRows
              + ", group2="
              + result2.totalRows);
    } finally {
      // Clean up both consumers; each step is best-effort so one failure
      // does not prevent the remaining teardown from running.
      if (consumer1 != null) {
        try {
          consumer1.unsubscribe(topicName);
        } catch (Exception e) {
          // ignore
        }
        try {
          consumer1.close();
        } catch (Exception e) {
          // ignore
        }
      }
      if (consumer2 != null) {
        try {
          consumer2.unsubscribe(topicName);
        } catch (Exception e) {
          // ignore
        }
        try {
          consumer2.close();
        } catch (Exception e) {
          // ignore
        }
      }
      dropTopic(topicName);
      deleteDatabase(database);
    }
  }
+
+ // ============================
+ // Test 10: Multi Topic Subscription
+ // ============================
  /**
   * One consumer subscribes to two different topics with different path filters. Verifies that each
   * topic delivers only its matching data, and no cross-contamination occurs.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testMultiTopicSubscription() throws Exception {
    String database = nextDatabase();
    String topicName1 = "topic_multi_" + testCounter + "_a";
    String topicName2 = "topic_multi_" + testCounter + "_b";
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Create database with two device groups
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic 1: covers d1 only
      createTopic(topicName1, database + ".d1.**");
      // Topic 2: covers d2 only
      createTopic(topicName2, database + ".d2.**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName1, topicName2);
      Thread.sleep(3000);

      // Write 30 rows to d1 and 40 rows to d2 (asymmetric counts so a mix-up is detectable)
      System.out.println(" Writing 30 rows to d1, 40 rows to d2");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 40; i++) {
          if (i <= 30) {
            session.executeNonQueryStatement(
                String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
          }
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
        }
      }
      Thread.sleep(2000);

      // Poll all data — should get d1 rows (via topic1) + d2 rows (via topic2)
      System.out.println(" Polling (expecting 30 from d1 + 40 from d2 = 70 total)...");
      PollResult result = pollUntilComplete(consumer, 70, 80);
      System.out.println(" Result: " + result);

      assertEquals("Expected exactly 70 rows total (30 d1 + 40 d2)", 70, result.totalRows);
      // Per-device attribution only when the poll helper recorded it.
      if (!result.rowsPerDevice.isEmpty()) {
        Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
        Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
        assertEquals("Expected 30 rows from d1", 30, d1Rows != null ? d1Rows : 0);
        assertEquals("Expected 40 rows from d2", 40, d2Rows != null ? d2Rows : 0);
        System.out.println(
            " Multi-topic isolation verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows");
      }
    } finally {
      // Clean up consumer, both topics, and database; each step is best-effort.
      if (consumer != null) {
        try {
          consumer.unsubscribe(topicName1, topicName2);
        } catch (Exception e) {
          // ignore
        }
        try {
          consumer.close();
        } catch (Exception e) {
          // ignore
        }
      }
      dropTopic(topicName1);
      dropTopic(topicName2);
      deleteDatabase(database);
    }
  }
+
+ // ============================
+ // Test 11: Flush Data Delivery
+ // ============================
  /**
   * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable
   * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps
   * entries available until committed by the subscription consumer.
   *
   * @throws Exception on any session/subscription failure (fails the test run)
   */
  private static void testFlushDataDelivery() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Write 50 rows, then flush before polling
      System.out.println(" Writing 50 rows then flushing");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 50; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
        }
        System.out.println(" Flushing...");
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Poll — all 50 rows should be delivered despite flush
      System.out.println(" Polling after flush...");
      PollResult result = pollUntilComplete(consumer, 50, 70);
      System.out.println(" Result: " + result);
      assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+
+ // ============================
+ // Test 12: Cross-Partition Aligned Timeseries (Multiple Write Methods)
+ // ============================
+ /**
+ * Tests cross-partition aligned timeseries with 6 data types, written via six different aligned
+ * methods. Timestamps are spaced >1 week apart to force different time partitions, exercising the
+ * WAL merge path for multi-partition inserts.
+ *
+ * Write methods (all aligned):
+ *
+ *
+ * SQL single row
+ * SQL multi-row (cross-partition)
+ * session.insertAlignedRecord (single row)
+ * session.insertAlignedRecordsOfOneDevice (cross-partition)
+ * session.insertAlignedTablet (cross-partition)
+ * session.insertAlignedTablets (cross-partition)
+ *
+ */
+ private static void testCrossPartitionAligned() throws Exception {
+ String database = nextDatabase();
+ String topicName = nextTopic();
+ String consumerGroupId = nextConsumerGroup();
+ String consumerId = nextConsumerId();
+ SubscriptionTreePullConsumer consumer = null;
+
+ // Gap slightly over 1 week (default partition interval = 604,800,000ms)
+ final long GAP = 604_800_001L;
+ final String device = database + ".d_aligned";
+
+ try {
+ // Create aligned timeseries with 6 data types
+ try (ISession session = openSession()) {
+ createDatabase(session, database);
+ session.executeNonQueryStatement(
+ String.format(
+ "CREATE ALIGNED TIMESERIES %s.d_aligned"
+ + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
+ + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
+ database));
+ // Init row to force DataRegion creation
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
+ database));
+ session.executeNonQueryStatement("flush");
+ }
+ Thread.sleep(2000);
+
+ createTopic(topicName, database + ".**");
+ Thread.sleep(1000);
+
+ consumer = createConsumer(consumerId, consumerGroupId);
+ consumer.subscribe(topicName);
+ Thread.sleep(3000);
+
+ // Shared measurement info for Session API calls
+ List measurements =
+ Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text");
+ List types =
+ Arrays.asList(
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.BOOLEAN,
+ TSDataType.TEXT);
+
+ // Shared schema for Tablet API calls
+ List schemas = new ArrayList<>();
+ schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT));
+
+ System.out.println(" Writing cross-partition aligned data via 6 methods");
+ int totalExpected = 0;
+
+ try (ISession session = openSession()) {
+
+ // --- Method 1: SQL single row ---
+ long t1 = 1;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')",
+ database, t1));
+ totalExpected += 1;
+ System.out.println(" Method 1 (SQL single row): 1 row");
+
+ // --- Method 2: SQL multi-row (cross-partition, 2 rows >1 week apart) ---
+ long t2a = 1 + GAP;
+ long t2b = 1 + 2 * GAP;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a'),"
+ + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')",
+ database, t2a, t2b));
+ totalExpected += 2;
+ System.out.println(" Method 2 (SQL multi-row, cross-partition): 2 rows");
+
+ // --- Method 3: insertAlignedRecord (single row) ---
+ long t3 = 1 + 3 * GAP;
+ List values3 = Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single");
+ session.insertAlignedRecord(device, t3, measurements, types, values3);
+ totalExpected += 1;
+ System.out.println(" Method 3 (insertAlignedRecord): 1 row");
+
+ // --- Method 4: insertAlignedRecordsOfOneDevice (cross-partition, 2 rows) ---
+ long t4a = 1 + 4 * GAP;
+ long t4b = 1 + 5 * GAP;
+ session.insertAlignedRecordsOfOneDevice(
+ device,
+ Arrays.asList(t4a, t4b),
+ Arrays.asList(measurements, measurements),
+ Arrays.asList(types, types),
+ Arrays.asList(
+ Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"),
+ Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b")));
+ totalExpected += 2;
+ System.out.println(
+ " Method 4 (insertAlignedRecordsOfOneDevice, cross-partition): 2 rows");
+
+ // --- Method 5: insertAlignedTablet (cross-partition, 2 rows) ---
+ long t5a = 1 + 6 * GAP;
+ long t5b = 1 + 7 * GAP;
+ Tablet tablet5 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a");
+ addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b");
+ session.insertAlignedTablet(tablet5);
+ totalExpected += 2;
+ System.out.println(" Method 5 (insertAlignedTablet, cross-partition): 2 rows");
+
+ // --- Method 6: insertAlignedTablets (cross-partition, 2 rows) ---
+ long t6a = 1 + 8 * GAP;
+ long t6b = 1 + 9 * GAP;
+ Tablet tablet6 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a");
+ addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b");
+ Map tabletMap = new HashMap<>();
+ tabletMap.put(device, tablet6);
+ session.insertAlignedTablets(tabletMap);
+ totalExpected += 2;
+ System.out.println(" Method 6 (insertAlignedTablets, cross-partition): 2 rows");
+ }
+
+ System.out.println(" Total expected rows: " + totalExpected);
+ Thread.sleep(2000);
+
+ System.out.println(" Polling...");
+ PollResult result = pollUntilComplete(consumer, totalExpected, 100);
+ System.out.println(" Result: " + result);
+
+ assertEquals(
+ "Expected exactly " + totalExpected + " cross-partition aligned rows",
+ totalExpected,
+ result.totalRows);
+ assertAtLeast(
+ "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
+ } finally {
+ cleanup(consumer, topicName, database);
+ }
+ }
+
+ /** Helper: populate one row of an aligned Tablet with all 6 data types. */
+ private static void addAlignedTabletRow(
+ Tablet tablet,
+ int rowIndex,
+ long timestamp,
+ int intVal,
+ long longVal,
+ float floatVal,
+ double doubleVal,
+ boolean boolVal,
+ String textVal) {
+ tablet.addTimestamp(rowIndex, timestamp);
+ tablet.addValue("s_int32", rowIndex, intVal);
+ tablet.addValue("s_int64", rowIndex, longVal);
+ tablet.addValue("s_float", rowIndex, floatVal);
+ tablet.addValue("s_double", rowIndex, doubleVal);
+ tablet.addValue("s_bool", rowIndex, boolVal);
+ tablet.addValue("s_text", rowIndex, new Binary(textVal, TSFileConfig.STRING_CHARSET));
+ }
+}
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
index cb5edd8cd91a3..6b71d5b16f79a 100644
--- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
+++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java
@@ -39,6 +39,7 @@
import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq;
import org.apache.iotdb.consensus.exception.ConsensusException;
import org.apache.iotdb.rpc.TSStatusCode;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
import org.apache.tsfile.utils.ReadWriteIOUtils;
@@ -52,6 +53,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
+import java.util.Set;
import java.util.stream.Collectors;
public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure {
@@ -66,6 +68,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP
private AlterConsumerGroupProcedure alterConsumerGroupProcedure;
private List<CreatePipeProcedureV2> createPipeProcedures = new ArrayList<>();
+ private Set<String> consensusTopicNames = new HashSet<>();
+
// TODO: remove this variable later
private final List alterTopicProcedures = new ArrayList<>(); // unused now
@@ -103,15 +107,41 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env)
alterConsumerGroupProcedure =
new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo);
- // Construct CreatePipeProcedureV2s
+ // Construct CreatePipeProcedureV2s (for non-consensus topics)
for (final String topicName : subscribeReq.getTopicNames()) {
+ final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName);
+
+ // Check if this topic should use consensus subscription: mode is live, format is Tablet
+ final String topicMode =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE);
+ final String topicFormat =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE);
+ final boolean isConsensusBasedTopic =
+ TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+ && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+
+ if (isConsensusBasedTopic) {
+ // skip pipe creation
+ consensusTopicNames.add(topicName);
+ LOGGER.info(
+ "CreateSubscriptionProcedure: topic [{}] uses consensus-based subscription "
+ + "(mode={}, format={}), skipping pipe creation",
+ topicName,
+ topicMode,
+ topicFormat);
+ continue;
+ }
+
final String pipeName =
PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId);
if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId)
// even if there existed subscription meta, if there is no corresponding pipe meta, it
// will try to create the pipe
|| !pipeTaskInfo.get().isPipeExisted(pipeName)) {
- final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName);
createPipeProcedures.add(
new CreatePipeProcedureV2(
new TCreatePipeReq()
@@ -177,20 +207,29 @@ protected void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env)
// Push consumer group meta to data nodes
alterConsumerGroupProcedure.executeFromOperateOnDataNodes(env);
- // Push pipe meta to data nodes
- final List<String> pipeNames =
- createPipeProcedures.stream()
- .map(CreatePipeProcedureV2::getPipeName)
- .collect(Collectors.toList());
- final String exceptionMessage =
- AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe(
- null, pushMultiPipeMetaToDataNodes(pipeNames, env));
- if (!exceptionMessage.isEmpty()) {
- // throw exception instead of logging warn, do not rely on metadata synchronization
- throw new SubscriptionException(
- String.format(
- "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.",
- pipeNames, subscribeReq, exceptionMessage));
+ if (!consensusTopicNames.isEmpty()) {
+ LOGGER.info(
+ "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode "
+ + "via consumer group meta push (no pipe creation needed)",
+ consensusTopicNames);
+ }
+
+ // Push pipe meta to data nodes (only for non-consensus pipe-based topics)
+ if (!createPipeProcedures.isEmpty()) {
+ final List<String> pipeNames =
+ createPipeProcedures.stream()
+ .map(CreatePipeProcedureV2::getPipeName)
+ .collect(Collectors.toList());
+ final String exceptionMessage =
+ AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe(
+ null, pushMultiPipeMetaToDataNodes(pipeNames, env));
+ if (!exceptionMessage.isEmpty()) {
+ // throw exception instead of logging warn, do not rely on metadata synchronization
+ throw new SubscriptionException(
+ String.format(
+ "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.",
+ pipeNames, subscribeReq, exceptionMessage));
+ }
}
}
@@ -297,6 +336,12 @@ public void serialize(final DataOutputStream stream) throws IOException {
} else {
ReadWriteIOUtils.write(false, stream);
}
+
+ // Serialize consensus topic names
+ ReadWriteIOUtils.write(consensusTopicNames.size(), stream);
+ for (final String consensusTopicName : consensusTopicNames) {
+ ReadWriteIOUtils.write(consensusTopicName, stream);
+ }
}
@Override
@@ -348,6 +393,14 @@ public void deserialize(final ByteBuffer byteBuffer) {
}
}
}
+
+ // Deserialize consensus topic names
+ if (byteBuffer.hasRemaining()) {
+ size = ReadWriteIOUtils.readInt(byteBuffer);
+ for (int i = 0; i < size; ++i) {
+ consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer));
+ }
+ }
}
@Override
@@ -364,7 +417,8 @@ public boolean equals(final Object o) {
&& getCycles() == that.getCycles()
&& Objects.equals(subscribeReq, that.subscribeReq)
&& Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure)
- && Objects.equals(createPipeProcedures, that.createPipeProcedures);
+ && Objects.equals(createPipeProcedures, that.createPipeProcedures)
+ && Objects.equals(consensusTopicNames, that.consensusTopicNames);
}
@Override
@@ -375,7 +429,8 @@ public int hashCode() {
getCycles(),
subscribeReq,
alterConsumerGroupProcedure,
- createPipeProcedures);
+ createPipeProcedures,
+ consensusTopicNames);
}
@TestOnly
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
index 6741a6c1e2a84..99f8ed649d852 100644
--- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
+++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java
@@ -22,6 +22,7 @@
import org.apache.iotdb.common.rpc.thrift.TSStatus;
import org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta;
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta;
+import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta;
import org.apache.iotdb.commons.utils.TestOnly;
import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan;
import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2;
@@ -36,6 +37,7 @@
import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq;
import org.apache.iotdb.consensus.exception.ConsensusException;
import org.apache.iotdb.rpc.TSStatusCode;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
import org.apache.tsfile.utils.ReadWriteIOUtils;
@@ -100,6 +102,31 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env)
for (final String topic : unsubscribeReq.getTopicNames()) {
if (topicsUnsubByGroup.contains(topic)) {
+ // Check if this topic uses consensus-based subscription (same detection as
+ // CreateSubscriptionProcedure). Consensus topics have no pipe to drop.
+ final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic);
+ final String topicMode =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE);
+ final String topicFormat =
+ topicMeta
+ .getConfig()
+ .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE);
+ final boolean isConsensusBasedTopic =
+ TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+ && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+
+ if (isConsensusBasedTopic) {
+ LOGGER.info(
+ "DropSubscriptionProcedure: topic [{}] is consensus-based (mode={}, format={}), "
+ + "skipping pipe removal",
+ topic,
+ topicMode,
+ topicFormat);
+ continue;
+ }
+
// Topic will be subscribed by no consumers in this group
dropPipeProcedures.add(
new DropPipeProcedureV2(
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
index 959191ca2d6d3..c494ae05d01b0 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
@@ -98,6 +98,13 @@ public class IoTConsensus implements IConsensus {
private final IoTConsensusRPCService service;
private final RegisterManager registerManager = new RegisterManager();
private IoTConsensusConfig config;
+
+ /**
+ * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used
+ * by the subscription system to auto-bind prefetching queues to new DataRegions.
+ */
+ public static volatile BiConsumer<ConsensusGroupId, IoTConsensusServerImpl> onNewPeerCreated;
+
private final IClientManager clientManager;
private final IClientManager syncClientManager;
private final ScheduledExecutorService backgroundTaskService;
@@ -299,6 +306,16 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers)
if (exist.get()) {
throw new ConsensusGroupAlreadyExistException(groupId);
}
+
+ // Notify subscription system about new peer creation for auto-binding
+ final BiConsumer<ConsensusGroupId, IoTConsensusServerImpl> callback = onNewPeerCreated;
+ if (callback != null) {
+ try {
+ callback.accept(groupId, stateMachineMap.get(groupId));
+ } catch (final Exception e) {
+ logger.warn("onNewPeerCreated callback failed for group {}", groupId, e);
+ }
+ }
}
@Override
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
index 567261efffffa..bb5d4aa603417 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
@@ -89,13 +89,16 @@
import java.util.PriorityQueue;
import java.util.TreeSet;
import java.util.UUID;
+import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
+import java.util.function.LongSupplier;
import java.util.regex.Pattern;
import static org.apache.iotdb.commons.utils.FileUtils.humanReadableByteCountSI;
@@ -128,6 +131,14 @@ public class IoTConsensusServerImpl {
IoTConsensusRateLimiter.getInstance();
private IndexedConsensusRequest lastConsensusRequest;
+ // Subscription queues receive IndexedConsensusRequest in real-time from write(),
+ // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush.
+ private final List<BlockingQueue<IndexedConsensusRequest>> subscriptionQueues =
+ new CopyOnWriteArrayList<>();
+ // Suppliers that report each subscription consumer's acknowledged search index.
+ // Used to pin WAL files: entries >= min(suppliers) cannot be deleted.
+ private final List<LongSupplier> subscriptionSyncIndexSuppliers = new CopyOnWriteArrayList<>();
+
public IoTConsensusServerImpl(
String storageDir,
Peer thisNode,
@@ -236,6 +247,44 @@ public TSStatus write(IConsensusRequest request) {
// in one transaction.
synchronized (searchIndex) {
logDispatcher.offer(indexedConsensusRequest);
+ // Deliver to subscription queues for real-time in-memory consumption.
+ // Offer AFTER stateMachine.write() so that InsertNode has inferred types
+ // and properly typed values (same timing as LogDispatcher).
+ final int sqCount = subscriptionQueues.size();
+ if (sqCount > 0) {
+ logger.debug(
+ "write() offering to {} subscription queue(s), "
+ + "group={}, searchIndex={}, requestType={}",
+ sqCount,
+ consensusGroupId,
+ indexedConsensusRequest.getSearchIndex(),
+ indexedConsensusRequest.getRequests().isEmpty()
+ ? "EMPTY"
+ : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName());
+ for (final BlockingQueue<IndexedConsensusRequest> sq : subscriptionQueues) {
+ final boolean offered = sq.offer(indexedConsensusRequest);
+ logger.debug(
+ "offer result={}, queueSize={}, queueRemaining={}",
+ offered,
+ sq.size(),
+ sq.remainingCapacity());
+ if (!offered) {
+ logger.warn(
+ "Subscription queue full, dropped entry searchIndex={}",
+ indexedConsensusRequest.getSearchIndex());
+ }
+ }
+ } else {
+ // Log periodically when no subscription queues are registered
+ if (indexedConsensusRequest.getSearchIndex() % 50 == 0) {
+ logger.debug(
+ "write() no subscription queues registered, "
+ + "group={}, searchIndex={}, this={}",
+ consensusGroupId,
+ indexedConsensusRequest.getSearchIndex(),
+ System.identityHashCode(this));
+ }
+ }
searchIndex.incrementAndGet();
}
// statistic the time of offering request into queue
@@ -243,10 +292,13 @@ public TSStatus write(IConsensusRequest request) {
System.nanoTime() - writeToStateMachineEndTime);
} else {
logger.debug(
- "{}: write operation failed. searchIndex: {}. Code: {}",
+ "write operation FAILED. group={}, searchIndex={}, code={}, "
+ + "subscriptionQueues={}, this={}",
thisNode.getGroupId(),
indexedConsensusRequest.getSearchIndex(),
- result.getCode());
+ result.getCode(),
+ subscriptionQueues.size(),
+ System.identityHashCode(this));
}
// statistic the time of total write process
ioTConsensusServerMetrics.recordConsensusWriteTime(
@@ -757,6 +809,47 @@ public long getSearchIndex() {
return searchIndex.get();
}
+ public ConsensusReqReader getConsensusReqReader() {
+ return consensusReqReader;
+ }
+
+ /**
+ * Registers a subscription pending queue for real-time in-memory data delivery. When {@link
+ * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered
+ * subscription queues, enabling subscription consumers to receive data without waiting for WAL
+ * flush.
+ *
+ * @param queue the blocking queue to receive IndexedConsensusRequest entries
+ * @param syncIndexSupplier supplies the subscription consumer's current acknowledged search
+ * index, used by WAL pinning to prevent deletion of unacknowledged entries
+ */
+ public void registerSubscriptionQueue(
+ final BlockingQueue<IndexedConsensusRequest> queue, final LongSupplier syncIndexSupplier) {
+ subscriptionQueues.add(queue);
+ subscriptionSyncIndexSuppliers.add(syncIndexSupplier);
+ // Immediately re-evaluate the safe delete index to protect WAL for this subscriber
+ checkAndUpdateSafeDeletedSearchIndex();
+ logger.info(
+ "Registered subscription queue for group {}, "
+ + "total subscription queues: {}, currentSearchIndex={}, this={}",
+ consensusGroupId,
+ subscriptionQueues.size(),
+ searchIndex.get(),
+ System.identityHashCode(this));
+ }
+
+ public void unregisterSubscriptionQueue(
+ final BlockingQueue<IndexedConsensusRequest> queue, final LongSupplier syncIndexSupplier) {
+ subscriptionQueues.remove(queue);
+ subscriptionSyncIndexSuppliers.remove(syncIndexSupplier);
+ // Re-evaluate: with fewer subscribers, more WAL may be deletable
+ checkAndUpdateSafeDeletedSearchIndex();
+ logger.info(
+ "Unregistered subscription queue for group {}, remaining subscription queues: {}",
+ consensusGroupId,
+ subscriptionQueues.size());
+ }
+
public long getSyncLag() {
long minSyncIndex = getMinSyncIndex();
return getSearchIndex() - minSyncIndex;
@@ -879,10 +972,25 @@ void checkAndUpdateSafeDeletedSearchIndex() {
if (configuration.isEmpty()) {
logger.error(
"Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time.");
- } else if (configuration.size() == 1) {
+ return;
+ }
+
+ // Compute the minimum search index that subscription consumers still need.
+ // WAL entries at or after this index must be preserved.
+ long minSubscriptionIndex = Long.MAX_VALUE;
+ for (final LongSupplier supplier : subscriptionSyncIndexSuppliers) {
+ minSubscriptionIndex = Math.min(minSubscriptionIndex, supplier.getAsLong());
+ }
+
+ if (configuration.size() == 1 && subscriptionSyncIndexSuppliers.isEmpty()) {
+ // Single replica, no subscription consumers => delete all WAL freely
consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE);
} else {
- consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex());
+ // min(replication progress, subscription progress) — preserve WAL for both
+ final long replicationIndex =
+ configuration.size() > 1 ? getMinFlushedSyncIndex() : Long.MAX_VALUE;
+ consensusReqReader.setSafelyDeletedSearchIndex(
+ Math.min(replicationIndex, minSubscriptionIndex));
}
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
index 510f8559bc147..220ad3e449951 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
@@ -19,7 +19,11 @@
package org.apache.iotdb.db.subscription.agent;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker;
import org.apache.iotdb.db.subscription.broker.SubscriptionBroker;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager;
import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager;
import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask;
@@ -30,6 +34,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -43,7 +49,12 @@ public class SubscriptionBrokerAgent {
private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class);
- private final Map<String, SubscriptionBroker> consumerGroupIdToSubscriptionBroker =
+ /** Pipe-based subscription brokers, one per consumer group. */
+ private final Map<String, SubscriptionBroker> consumerGroupIdToPipeBroker =
+ new ConcurrentHashMap<>();
+
+ /** Consensus-based subscription brokers, one per consumer group. */
+ private final Map<String, ConsensusSubscriptionBroker> consumerGroupIdToConsensusBroker =
new ConcurrentHashMap<>();
private final Cache prefetchingQueueCount =
@@ -54,17 +65,54 @@ public class SubscriptionBrokerAgent {
public List<SubscriptionEvent> poll(
final ConsumerConfig consumerConfig, final Set<String> topicNames, final long maxBytes) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final List<SubscriptionEvent> allEvents = new ArrayList<>();
+ long remainingBytes = maxBytes;
+
+ // Poll from pipe-based broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.nonNull(pipeBroker)) {
+ final List<SubscriptionEvent> pipeEvents =
+ pipeBroker.poll(consumerId, topicNames, remainingBytes);
+ allEvents.addAll(pipeEvents);
+ for (final SubscriptionEvent event : pipeEvents) {
+ try {
+ remainingBytes -= event.getCurrentResponseSize();
+ } catch (final IOException ignored) {
+ // best effort
+ }
+ }
+ }
+
+ // Poll from consensus-based broker
+ if (remainingBytes > 0) {
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker)) {
+ LOGGER.debug(
+ "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], "
+ + "topicNames={}, remainingBytes={}",
+ consumerGroupId,
+ topicNames,
+ remainingBytes);
+ allEvents.addAll(consensusBroker.poll(consumerId, topicNames, remainingBytes));
+ } else {
+ LOGGER.debug(
+ "SubscriptionBrokerAgent: no consensus broker for consumer group [{}]",
+ consumerGroupId);
+ }
+ }
+
+ if (allEvents.isEmpty()
+ && Objects.isNull(pipeBroker)
+ && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) {
final String errorMessage =
- String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- // TODO: currently we fetch messages from all topics
- final String consumerId = consumerConfig.getConsumerId();
- return broker.poll(consumerId, topicNames, maxBytes);
+
+ return allEvents;
}
public List pollTsFile(
@@ -72,16 +120,18 @@ public List pollTsFile(
final SubscriptionCommitContext commitContext,
final long writingOffset) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // TsFile polling can only be called by pipe-based subscriptions
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
final String errorMessage =
String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ "Subscription: pipe broker bound to consumer group [%s] does not exist",
+ consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
final String consumerId = consumerConfig.getConsumerId();
- return broker.pollTsFile(consumerId, commitContext, writingOffset);
+ return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset);
}
public List pollTablets(
@@ -89,16 +139,26 @@ public List pollTablets(
final SubscriptionCommitContext commitContext,
final int offset) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final String topicName = commitContext.getTopicName();
+
+ // Try consensus-based broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.pollTablets(consumerId, commitContext, offset);
+ }
+
+ // Fall back to pipe-based broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
final String errorMessage =
String.format(
"Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- final String consumerId = consumerConfig.getConsumerId();
- return broker.pollTablets(consumerId, commitContext, offset);
+ return pipeBroker.pollTablets(consumerId, commitContext, offset);
}
/**
@@ -109,46 +169,98 @@ public List commit(
final List<SubscriptionCommitContext> commitContexts,
final boolean nack) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String consumerId = consumerConfig.getConsumerId();
+ final List<SubscriptionCommitContext> allSuccessful = new ArrayList<>();
+
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+
+ if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) {
final String errorMessage =
- String.format(
- "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId);
+ String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId);
LOGGER.warn(errorMessage);
throw new SubscriptionException(errorMessage);
}
- final String consumerId = consumerConfig.getConsumerId();
- return broker.commit(consumerId, commitContexts, nack);
+
+ // Partition commit contexts by which broker owns the topic.
+ final List<SubscriptionCommitContext> pipeContexts = new ArrayList<>();
+ final List<SubscriptionCommitContext> consensusContexts = new ArrayList<>();
+ for (final SubscriptionCommitContext ctx : commitContexts) {
+ final String topicName = ctx.getTopicName();
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ consensusContexts.add(ctx);
+ } else {
+ pipeContexts.add(ctx);
+ }
+ }
+
+ if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) {
+ allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack));
+ }
+ if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) {
+ allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack));
+ }
+
+ return allSuccessful;
}
public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
final String consumerGroupId = commitContext.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ final String topicName = commitContext.getTopicName();
+
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.isCommitContextOutdated(commitContext);
+ }
+
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
return true;
}
- return broker.isCommitContextOutdated(commitContext);
+ return pipeBroker.isCommitContextOutdated(commitContext);
}
public List<String> fetchTopicNamesToUnsubscribe(
final ConsumerConfig consumerConfig, final Set<String> topicNames) {
final String consumerGroupId = consumerConfig.getConsumerGroupId();
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+
+ // Consensus-based subscription topics are unbounded streams, so they do not trigger
+ // auto-unsubscribe.
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ final Set<String> pipeOnlyTopicNames;
+ if (Objects.nonNull(consensusBroker)) {
+ pipeOnlyTopicNames = new java.util.HashSet<>(topicNames);
+ pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue);
+ } else {
+ pipeOnlyTopicNames = topicNames;
+ }
+
+ if (pipeOnlyTopicNames.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
return Collections.emptyList();
}
- return broker.fetchTopicNamesToUnsubscribe(topicNames);
+ return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames);
}
/////////////////////////////// broker ///////////////////////////////
public boolean isBrokerExist(final String consumerGroupId) {
- return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId);
+ return consumerGroupIdToPipeBroker.containsKey(consumerGroupId)
+ || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId);
}
public void createBrokerIfNotExist(final String consumerGroupId) {
- consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new);
- LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId);
+ consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new);
+ LOGGER.info("Subscription: create pipe broker bound to consumer group [{}]", consumerGroupId);
}
/**
@@ -156,26 +268,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) {
*/
public boolean dropBroker(final String consumerGroupId) {
final AtomicBoolean dropped = new AtomicBoolean(false);
- consumerGroupIdToSubscriptionBroker.compute(
+
+ // Drop pipe broker
+ consumerGroupIdToPipeBroker.compute(
consumerGroupId,
(id, broker) -> {
if (Objects.isNull(broker)) {
+ dropped.set(true);
+ return null;
+ }
+ if (!broker.isEmpty()) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] does not exist",
+ "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping",
consumerGroupId);
- dropped.set(true);
+ return broker;
+ }
+ dropped.set(true);
+ LOGGER.info(
+ "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId);
+ return null;
+ });
+
+ // Drop consensus broker
+ consumerGroupIdToConsensusBroker.compute(
+ consumerGroupId,
+ (id, broker) -> {
+ if (Objects.isNull(broker)) {
return null;
}
if (!broker.isEmpty()) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] is not empty when dropping",
+ "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping",
consumerGroupId);
return broker;
}
dropped.set(true);
- LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId);
- return null; // remove this entry
+ LOGGER.info(
+ "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId);
+ return null;
});
+
return dropped.get();
}
@@ -183,15 +315,14 @@ public boolean dropBroker(final String consumerGroupId) {
public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) {
final String consumerGroupId = subtask.getConsumerGroupId();
- consumerGroupIdToSubscriptionBroker
+ consumerGroupIdToPipeBroker
.compute(
consumerGroupId,
(id, broker) -> {
if (Objects.isNull(broker)) {
LOGGER.info(
- "Subscription: broker bound to consumer group [{}] does not exist, create new for binding prefetching queue",
+ "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue",
consumerGroupId);
- // TODO: consider more robust metadata semantics
return new SubscriptionBroker(consumerGroupId);
}
return broker;
@@ -200,41 +331,105 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) {
prefetchingQueueCount.invalidate();
}
- public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
+ public void bindConsensusPrefetchingQueue(
+ final String consumerGroupId,
+ final String topicName,
+ final String consensusGroupId,
+ final IoTConsensusServerImpl serverImpl,
+ final ConsensusLogToTabletConverter converter,
+ final ConsensusSubscriptionCommitManager commitManager,
+ final long startSearchIndex) {
+ consumerGroupIdToConsensusBroker
+ .compute(
+ consumerGroupId,
+ (id, broker) -> {
+ if (Objects.isNull(broker)) {
+ LOGGER.info(
+ "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue",
+ consumerGroupId);
+ return new ConsensusSubscriptionBroker(consumerGroupId);
+ }
+ return broker;
+ })
+ .bindConsensusPrefetchingQueue(
+ topicName, consensusGroupId, serverImpl, converter, commitManager, startSearchIndex);
+ prefetchingQueueCount.invalidate();
+ }
+
+ public void unbindConsensusPrefetchingQueue(
+ final String consumerGroupId, final String topicName) {
+ final ConsensusSubscriptionBroker broker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
if (Objects.isNull(broker)) {
LOGGER.warn(
- "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
+ "Subscription: consensus broker bound to consumer group [{}] does not exist",
+ consumerGroupId);
return;
}
- broker.updateCompletedTopicNames(topicName);
+ broker.unbindConsensusPrefetchingQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ }
+
+ public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) {
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
+ LOGGER.warn(
+ "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId);
+ return;
+ }
+ pipeBroker.updateCompletedTopicNames(topicName);
}
public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ consensusBroker.removeQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ return;
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return;
}
- broker.unbindPrefetchingQueue(topicName);
+ pipeBroker.unbindPrefetchingQueue(topicName);
prefetchingQueueCount.invalidate();
}
public void removePrefetchingQueue(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ consensusBroker.removeQueue(topicName);
+ prefetchingQueueCount.invalidate();
+ return;
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return;
}
- broker.removePrefetchingQueue(topicName);
+ pipeBroker.removePrefetchingQueue(topicName);
prefetchingQueueCount.invalidate();
}
public boolean executePrefetch(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.executePrefetch(topicName);
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
SubscriptionDataNodeResourceManager.log()
.schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName)
.ifPresent(
@@ -244,17 +439,24 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN
consumerGroupId));
return false;
}
- return broker.executePrefetch(topicName);
+ return pipeBroker.executePrefetch(topicName);
}
public int getPipeEventCount(final String consumerGroupId, final String topicName) {
- final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId);
- if (Objects.isNull(broker)) {
+ // Try consensus broker first
+ final ConsensusSubscriptionBroker consensusBroker =
+ consumerGroupIdToConsensusBroker.get(consumerGroupId);
+ if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ return consensusBroker.getEventCount(topicName);
+ }
+ // Fall back to pipe broker
+ final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
+ if (Objects.isNull(pipeBroker)) {
LOGGER.warn(
"Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId);
return 0;
}
- return broker.getPipeEventCount(topicName);
+ return pipeBroker.getPipeEventCount(topicName);
}
public int getPrefetchingQueueCount() {
@@ -262,9 +464,15 @@ public int getPrefetchingQueueCount() {
}
private int getPrefetchingQueueCountInternal() {
- return consumerGroupIdToSubscriptionBroker.values().stream()
- .map(SubscriptionBroker::getPrefetchingQueueCount)
- .reduce(0, Integer::sum);
+ int count =
+ consumerGroupIdToPipeBroker.values().stream()
+ .map(SubscriptionBroker::getPrefetchingQueueCount)
+ .reduce(0, Integer::sum);
+ count +=
+ consumerGroupIdToConsensusBroker.values().stream()
+ .map(ConsensusSubscriptionBroker::getQueueCount)
+ .reduce(0, Integer::sum);
+ return count;
}
/////////////////////////////// Cache ///////////////////////////////
@@ -272,14 +480,15 @@ private int getPrefetchingQueueCountInternal() {
/**
* A simple generic cache that computes and stores a value on demand.
*
- * Note that since the get() and invalidate() methods are not modified with synchronized, the
- * value obtained may not be entirely accurate.
+ * <p>
+ * Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The
+ * {@code get()} method uses a local snapshot of {@code valid} to avoid double-read reordering.
+ * Concurrent recomputation by multiple threads is benign (idempotent supplier).
*
* @param <T> the type of the cached value
*/
private static class Cache<T> {
- private T value;
+ private volatile T value;
private volatile boolean valid = false;
private final Supplier<T> supplier;
@@ -304,8 +513,10 @@ private void invalidate() {
*/
private T get() {
if (!valid) {
- value = supplier.get();
+ final T computed = supplier.get();
+ value = computed;
valid = true;
+ return computed;
}
return value;
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
index fee23cf6af4cb..9c54497b6f468 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java
@@ -21,6 +21,7 @@
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta;
import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler;
import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage;
import org.apache.iotdb.rpc.subscription.exception.SubscriptionException;
@@ -132,11 +133,34 @@ private void handleSingleConsumerGroupMetaChangesInternal(
for (final String topicName : topicsUnsubByGroup) {
SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName);
}
+ // Tear down consensus-based subscriptions for unsubscribed topics
+ if (!topicsUnsubByGroup.isEmpty()) {
+ ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions(
+ consumerGroupId, topicsUnsubByGroup);
+ }
+
+ // Detect newly subscribed topics (present in new meta but not in old meta)
+ final Set<String> newlySubscribedTopics =
+ ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator);
+
+ LOGGER.info(
+ "Subscription: consumer group [{}] meta change detected, "
+ + "topicsUnsubByGroup={}, newlySubscribedTopics={}",
+ consumerGroupId,
+ topicsUnsubByGroup,
+ newlySubscribedTopics);
// TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the
// changes in its fields.
consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId);
consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator);
+
+ // Set up consensus-based subscription for newly subscribed live-mode topics.
+ // This must happen after the meta is updated so that the broker can find the topic config.
+ if (!newlySubscribedTopics.isEmpty()) {
+ ConsensusSubscriptionSetupHandler.handleNewSubscriptions(
+ consumerGroupId, newlySubscribedTopics);
+ }
}
public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges(
@@ -222,4 +246,24 @@ public Set getTopicNamesSubscribedByConsumer(
releaseReadLock();
}
}
+
+ /**
+ * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by
+ * consensus subscription auto-binding when a new DataRegion is created.
+ */
+ public java.util.Map<String, Set<String>> getAllSubscriptions() {
+ acquireReadLock();
+ try {
+ final java.util.Map<String, Set<String>> result = new java.util.HashMap<>();
+ for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) {
+ final Set<String> topics = meta.getSubscribedTopicNames();
+ if (!topics.isEmpty()) {
+ result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics));
+ }
+ }
+ return result;
+ } finally {
+ releaseReadLock();
+ }
+ }
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
new file mode 100644
index 0000000000000..84d89ef9a8f39
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker;
+
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager;
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+/**
+ * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance
+ * manages consensus prefetching queues for a single consumer group.
+ */
+public class ConsensusSubscriptionBroker implements ISubscriptionBroker {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class);
+
+ private final String brokerId; // consumer group id
+
+ /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */
+ private final Map<String, List<ConsensusPrefetchingQueue>> topicNameToConsensusPrefetchingQueues;
+
+ /** Shared commit ID generators per topic. */
+ private final Map<String, AtomicLong> topicNameToCommitIdGenerator;
+
+ public ConsensusSubscriptionBroker(final String brokerId) {
+ this.brokerId = brokerId;
+ this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>();
+ this.topicNameToCommitIdGenerator = new ConcurrentHashMap<>();
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return topicNameToConsensusPrefetchingQueues.isEmpty();
+ }
+
+ @Override
+ public boolean hasQueue(final String topicName) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ return Objects.nonNull(queues)
+ && !queues.isEmpty()
+ && queues.stream().anyMatch(q -> !q.isClosed());
+ }
+
+ //////////////////////////// poll ////////////////////////////
+
+ @Override
+ public List<SubscriptionEvent> poll(
+ final String consumerId, final Set<String> topicNames, final long maxBytes) {
+ LOGGER.debug(
+ "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, "
+ + "queueCount={}, maxBytes={}",
+ brokerId,
+ consumerId,
+ topicNames,
+ topicNameToConsensusPrefetchingQueues.size(),
+ maxBytes);
+
+ final List<SubscriptionEvent> eventsToPoll = new ArrayList<>();
+ final List<SubscriptionEvent> eventsToNack = new ArrayList<>();
+ long totalSize = 0;
+
+ for (final String topicName : topicNames) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ continue;
+ }
+
+ // Poll from all region queues for this topic
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+
+ final SubscriptionEvent event = consensusQueue.poll(consumerId);
+ if (Objects.isNull(event)) {
+ continue;
+ }
+
+ final long currentSize;
+ try {
+ currentSize = event.getCurrentResponseSize();
+ } catch (final IOException e) {
+ eventsToNack.add(event);
+ continue;
+ }
+
+ eventsToPoll.add(event);
+ totalSize += currentSize;
+
+ if (totalSize + currentSize > maxBytes) {
+ break;
+ }
+ }
+
+ if (totalSize > maxBytes) {
+ break;
+ }
+ }
+
+ // Nack any events that had errors
+ if (!eventsToNack.isEmpty()) {
+ commit(
+ consumerId,
+ eventsToNack.stream()
+ .map(SubscriptionEvent::getCommitContext)
+ .collect(Collectors.toList()),
+ true);
+ }
+
+ LOGGER.debug(
+ "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}",
+ brokerId,
+ consumerId,
+ eventsToPoll.size(),
+ eventsToNack.size());
+
+ return eventsToPoll;
+ }
+
+ @Override
+ public List<SubscriptionEvent> pollTablets(
+ final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
+ final String topicName = commitContext.getTopicName();
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ // Try each region queue until one returns a match
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+ final SubscriptionEvent event = consensusQueue.pollTablets(consumerId, commitContext, offset);
+ if (Objects.nonNull(event)) {
+ return Collections.singletonList(event);
+ }
+ }
+ return Collections.emptyList();
+ }
+
+ //////////////////////////// commit ////////////////////////////
+
+ @Override
+ public List<SubscriptionCommitContext> commit(
+ final String consumerId,
+ final List<SubscriptionCommitContext> commitContexts,
+ final boolean nack) {
+ final List<SubscriptionCommitContext> successfulCommitContexts = new ArrayList<>();
+ for (final SubscriptionCommitContext commitContext : commitContexts) {
+ final String topicName = commitContext.getTopicName();
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ LOGGER.warn(
+ "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit",
+ brokerId,
+ topicName);
+ continue;
+ }
+
+ // Try each region queue for this topic (the event belongs to exactly one region).
+ // Don't warn per-queue miss — only warn if NO queue handled the commit.
+ boolean handled = false;
+ for (final ConsensusPrefetchingQueue consensusQueue : queues) {
+ if (consensusQueue.isClosed()) {
+ continue;
+ }
+ final boolean success;
+ if (!nack) {
+ success = consensusQueue.ackSilent(consumerId, commitContext);
+ } else {
+ success = consensusQueue.nackSilent(consumerId, commitContext);
+ }
+ if (success) {
+ successfulCommitContexts.add(commitContext);
+ handled = true;
+ break; // committed in the right queue, no need to try others
+ }
+ }
+ if (!handled) {
+ LOGGER.warn(
+ "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]",
+ brokerId,
+ commitContext,
+ queues.size(),
+ topicName);
+ }
+ }
+ return successfulCommitContexts;
+ }
+
+ @Override
+ public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
+ final String topicName = commitContext.getTopicName();
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return true;
+ }
+ // Any queue that considers it NOT outdated means it's not outdated
+ for (final ConsensusPrefetchingQueue q : queues) {
+ if (!q.isCommitContextOutdated(commitContext)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ //////////////////////////// prefetching ////////////////////////////
+
+ @Override
+ public boolean executePrefetch(final String topicName) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ return false;
+ }
+ boolean anyPrefetched = false;
+ for (final ConsensusPrefetchingQueue q : queues) {
+ if (!q.isClosed() && q.executePrefetch()) {
+ anyPrefetched = true;
+ }
+ }
+ return anyPrefetched;
+ }
+
+ @Override
+ public int getEventCount(final String topicName) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues)) {
+ return 0;
+ }
+ return queues.stream().mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount).sum();
+ }
+
+ @Override
+ public int getQueueCount() {
+ return topicNameToConsensusPrefetchingQueues.size();
+ }
+
+ //////////////////////////// queue management ////////////////////////////
+
+ public void bindConsensusPrefetchingQueue(
+ final String topicName,
+ final String consensusGroupId,
+ final IoTConsensusServerImpl serverImpl,
+ final ConsensusLogToTabletConverter converter,
+ final ConsensusSubscriptionCommitManager commitManager,
+ final long startSearchIndex) {
+ // Get or create the list of queues for this topic
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.computeIfAbsent(
+ topicName, k -> new CopyOnWriteArrayList<>());
+
+ // Check for duplicate region binding
+ for (final ConsensusPrefetchingQueue existing : queues) {
+ if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) {
+ LOGGER.info(
+ "Subscription: consensus prefetching queue for topic [{}], region [{}] "
+ + "in consumer group [{}] already exists, skipping",
+ topicName,
+ consensusGroupId,
+ brokerId);
+ return;
+ }
+ }
+
+ // Get or create the shared commit ID generator for this topic
+ final AtomicLong sharedCommitIdGenerator =
+ topicNameToCommitIdGenerator.computeIfAbsent(topicName, k -> new AtomicLong(0));
+
+ final ConsensusPrefetchingQueue consensusQueue =
+ new ConsensusPrefetchingQueue(
+ brokerId,
+ topicName,
+ consensusGroupId,
+ serverImpl,
+ converter,
+ commitManager,
+ startSearchIndex,
+ sharedCommitIdGenerator);
+ queues.add(consensusQueue);
+ LOGGER.info(
+ "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], "
+ + "consensusGroupId={}, startSearchIndex={}, totalRegionQueues={}",
+ topicName,
+ brokerId,
+ consensusGroupId,
+ startSearchIndex,
+ queues.size());
+ }
+
+ public void unbindConsensusPrefetchingQueue(final String topicName) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.isNull(queues) || queues.isEmpty()) {
+ LOGGER.warn(
+ "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist",
+ topicName,
+ brokerId);
+ return;
+ }
+
+ for (final ConsensusPrefetchingQueue q : queues) {
+ q.close();
+ }
+ topicNameToConsensusPrefetchingQueues.remove(topicName);
+ topicNameToCommitIdGenerator.remove(topicName);
+ LOGGER.info(
+ "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]",
+ queues.size(),
+ topicName,
+ brokerId);
+ }
+
+ @Override
+ public void removeQueue(final String topicName) {
+ final List<ConsensusPrefetchingQueue> queues =
+ topicNameToConsensusPrefetchingQueues.get(topicName);
+ if (Objects.nonNull(queues) && !queues.isEmpty()) {
+ LOGGER.info(
+ "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing",
+ topicName,
+ brokerId);
+ unbindConsensusPrefetchingQueue(topicName);
+ }
+ }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java
new file mode 100644
index 0000000000000..aaa88a5f84777
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker;
+
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+
+import java.util.List;
+import java.util.Set;
+
+public interface ISubscriptionBroker {
+
+ List<SubscriptionEvent> poll(String consumerId, Set<String> topicNames, long maxBytes);
+
+ List<SubscriptionEvent> pollTablets(
+ String consumerId, SubscriptionCommitContext commitContext, int offset);
+
+ List<SubscriptionCommitContext> commit(
+ String consumerId, List<SubscriptionCommitContext> commitContexts, boolean nack);
+
+ boolean isCommitContextOutdated(SubscriptionCommitContext commitContext);
+
+ boolean executePrefetch(String topicName);
+
+ int getEventCount(String topicName);
+
+ int getQueueCount();
+
+ void removeQueue(String topicName);
+
+ boolean isEmpty();
+
+ boolean hasQueue(String topicName);
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
index cc03f7261419b..8f9d05324e905 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java
@@ -56,7 +56,7 @@
import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID;
-public class SubscriptionBroker {
+public class SubscriptionBroker implements ISubscriptionBroker {
private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBroker.class);
@@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) {
.build(consumerId -> new SubscriptionStates());
}
+ @Override
public boolean isEmpty() {
return topicNameToPrefetchingQueue.isEmpty()
&& completedTopicNames.isEmpty()
&& topicNameToCommitIdGenerator.isEmpty();
}
+ @Override
+ public boolean hasQueue(final String topicName) {
+ final SubscriptionPrefetchingQueue prefetchingQueue =
+ topicNameToPrefetchingQueue.get(topicName);
+ return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed();
+ }
+
//////////////////////////// provided for SubscriptionBrokerAgent ////////////////////////////
+ @Override
public List<SubscriptionEvent> poll(
final String consumerId, final Set<String> topicNames, final long maxBytes) {
final List eventsToPoll = new ArrayList<>();
@@ -112,9 +121,10 @@ public List poll(
// Iterate over each sorted topic name and poll the corresponding events
int remainingTopicSize = sortedTopicNames.size();
for (final String topicName : sortedTopicNames) {
+ remainingTopicSize -= 1;
+ // Check pipe-based queue
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
- remainingTopicSize -= 1;
// Recheck
if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) {
@@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames(
final List eventsToPoll /* output parameter */) {
final Set<String> candidateTopicNames = new HashSet<>();
for (final String topicName : topicNames) {
+ // Check pipe-based queue
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
// If there is no prefetching queue for the topic, check if it's completed
@@ -271,6 +282,7 @@ public List pollTsFile(
return Collections.emptyList();
}
+ @Override
public List<SubscriptionEvent> pollTablets(
final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
final String topicName = commitContext.getTopicName();
@@ -312,6 +324,7 @@ public List pollTablets(
/**
* @return list of successful commit contexts
*/
+ @Override
public List<SubscriptionCommitContext> commit(
final String consumerId,
final List<SubscriptionCommitContext> commitContexts,
@@ -348,6 +361,7 @@ public List commit(
return successfulCommitContexts;
}
+ @Override
public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
final String topicName = commitContext.getTopicName();
final SubscriptionPrefetchingQueue prefetchingQueue =
@@ -457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) {
brokerId);
}
+ @Override
+ public void removeQueue(final String topicName) {
+ removePrefetchingQueue(topicName);
+ }
+
public void removePrefetchingQueue(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) {
topicNameToCommitIdGenerator.remove(topicName);
}
+ @Override
public boolean executePrefetch(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) {
: prefetchingQueue.executePrefetchV2();
}
+ @Override
+ public int getEventCount(final String topicName) {
+ return getPipeEventCount(topicName);
+ }
+
public int getPipeEventCount(final String topicName) {
final SubscriptionPrefetchingQueue prefetchingQueue =
topicNameToPrefetchingQueue.get(topicName);
@@ -525,6 +550,11 @@ public int getPipeEventCount(final String topicName) {
return prefetchingQueue.getPipeEventCount();
}
+ @Override
+ public int getQueueCount() {
+ return getPrefetchingQueueCount();
+ }
+
public int getPrefetchingQueueCount() {
return topicNameToPrefetchingQueue.size();
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
new file mode 100644
index 0000000000000..fbde6cee8c2fe
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
@@ -0,0 +1,487 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode;
+
+import org.apache.tsfile.enums.TSDataType;
+import org.apache.tsfile.file.metadata.IDeviceID;
+import org.apache.tsfile.utils.Binary;
+import org.apache.tsfile.utils.BitMap;
+import org.apache.tsfile.write.record.Tablet;
+import org.apache.tsfile.write.schema.IMeasurementSchema;
+import org.apache.tsfile.write.schema.MeasurementSchema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. */
+public class ConsensusLogToTabletConverter {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class);
+
+  private final TreePattern treePattern;
+  private final TablePattern tablePattern;
+
+  /**
+   * The actual database name of the DataRegion this converter processes (table-model format without
+   * "root." prefix). Null for tree-model topics.
+   */
+  private final String databaseName;
+
+  public ConsensusLogToTabletConverter(
+      final TreePattern treePattern, final TablePattern tablePattern, final String databaseName) {
+    this.treePattern = treePattern;
+    this.tablePattern = tablePattern;
+    this.databaseName = databaseName;
+  }
+
+  public String getDatabaseName() {
+    return databaseName;
+  }
+
+  /** Best-effort device id rendering for log messages; never throws. */
+  static String safeDeviceIdForLog(final InsertNode node) {
+    try {
+      final Object deviceId = node.getDeviceID();
+      return deviceId != null ? deviceId.toString() : "null";
+    } catch (final Exception e) {
+      return "N/A(" + node.getType() + ")";
+    }
+  }
+
+  /**
+   * Converts the given InsertNode into zero or more Tablets, applying tree- or table-model pattern
+   * filtering as configured.
+   *
+   * @param insertNode the WAL insert node; may be null
+   * @return tablets carrying the matching data, or an empty list for null input, unsupported node
+   *     types, or fully-filtered data
+   */
+  public List<Tablet> convert(final InsertNode insertNode) {
+    if (Objects.isNull(insertNode)) {
+      return Collections.emptyList();
+    }
+
+    final PlanNodeType nodeType = insertNode.getType();
+    if (nodeType == null) {
+      LOGGER.warn("InsertNode type is null, skipping conversion");
+      return Collections.emptyList();
+    }
+
+    LOGGER.debug(
+        "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}",
+        nodeType,
+        safeDeviceIdForLog(insertNode));
+
+    switch (nodeType) {
+      case INSERT_ROW:
+        return convertInsertRowNode((InsertRowNode) insertNode);
+      case INSERT_TABLET:
+        return convertInsertTabletNode((InsertTabletNode) insertNode);
+      case INSERT_ROWS:
+        return convertInsertRowsNode((InsertRowsNode) insertNode);
+      case INSERT_ROWS_OF_ONE_DEVICE:
+        return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode);
+      case INSERT_MULTI_TABLET:
+        return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode);
+      case RELATIONAL_INSERT_ROW:
+        return convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode);
+      case RELATIONAL_INSERT_TABLET:
+        return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode);
+      case RELATIONAL_INSERT_ROWS:
+        return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode);
+      default:
+        LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType);
+        return Collections.emptyList();
+    }
+  }
+
+  // ======================== Tree Model Conversion ========================
+
+  private List<Tablet> convertInsertRowNode(final InsertRowNode node) {
+    final IDeviceID deviceId = node.getDeviceID();
+
+    // Device-level path filtering
+    if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) {
+      return Collections.emptyList();
+    }
+
+    final long time = node.getTime();
+
+    // Determine which columns match the pattern
+    final String[] measurements = node.getMeasurements();
+    final TSDataType[] dataTypes = node.getDataTypes();
+    final Object[] values = node.getValues();
+    final List<Integer> matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements);
+
+    if (matchedColumnIndices.isEmpty()) {
+      return Collections.emptyList();
+    }
+
+    // Build Tablet with matched columns
+    final int columnCount = matchedColumnIndices.size();
+    final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+    for (final int colIdx : matchedColumnIndices) {
+      schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx]));
+    }
+
+    final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */);
+    tablet.addTimestamp(0, time);
+
+    for (int i = 0; i < columnCount; i++) {
+      final int originalColIdx = matchedColumnIndices.get(i);
+      final Object value = values[originalColIdx];
+      if (value == null) {
+        if (tablet.getBitMaps() == null) {
+          tablet.initBitMaps();
+        }
+        tablet.getBitMaps()[i].mark(0);
+      } else {
+        addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value);
+      }
+    }
+    tablet.setRowSize(1);
+
+    return Collections.singletonList(tablet);
+  }
+
+  private List<Tablet> convertInsertTabletNode(final InsertTabletNode node) {
+    final IDeviceID deviceId = node.getDeviceID();
+
+    // Device-level path filtering
+    if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) {
+      return Collections.emptyList();
+    }
+
+    final String[] measurements = node.getMeasurements();
+    final TSDataType[] dataTypes = node.getDataTypes();
+    final long[] times = node.getTimes();
+    final Object[] columns = node.getColumns();
+    final BitMap[] bitMaps = node.getBitMaps();
+    final int rowCount = node.getRowCount();
+
+    // Column filtering
+    final List<Integer> matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements);
+    if (matchedColumnIndices.isEmpty()) {
+      return Collections.emptyList();
+    }
+
+    // Build Tablet with all rows
+    final int columnCount = matchedColumnIndices.size();
+    final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+    for (final int colIdx : matchedColumnIndices) {
+      schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx]));
+    }
+
+    final Tablet tablet = new Tablet(deviceId.toString(), schemas, rowCount);
+
+    for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) {
+      tablet.addTimestamp(rowIdx, times[rowIdx]);
+
+      for (int colIdx = 0; colIdx < columnCount; colIdx++) {
+        final int originalColIdx = matchedColumnIndices.get(colIdx);
+        final boolean isNull =
+            (bitMaps != null
+                && bitMaps[originalColIdx] != null
+                && bitMaps[originalColIdx].isMarked(rowIdx));
+
+        if (isNull) {
+          if (tablet.getBitMaps() == null) {
+            tablet.initBitMaps();
+          }
+          tablet.getBitMaps()[colIdx].mark(rowIdx);
+        } else {
+          copyColumnValue(
+              tablet, rowIdx, colIdx, dataTypes[originalColIdx], columns[originalColIdx], rowIdx);
+        }
+      }
+    }
+    tablet.setRowSize(rowCount);
+
+    return Collections.singletonList(tablet);
+  }
+
+  private List<Tablet> convertInsertRowsNode(final InsertRowsNode node) {
+    final List<Tablet> tablets = new ArrayList<>();
+    for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+      // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden,
+      // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode
+      // children. Dispatch correctly by checking the actual child type.
+      if (rowNode instanceof RelationalInsertRowNode) {
+        tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode));
+      } else {
+        tablets.addAll(convertInsertRowNode(rowNode));
+      }
+    }
+    return tablets;
+  }
+
+  private List<Tablet> convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) {
+    final List<Tablet> tablets = new ArrayList<>();
+    for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+      tablets.addAll(convertInsertRowNode(rowNode));
+    }
+    return tablets;
+  }
+
+  private List<Tablet> convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) {
+    final List<Tablet> tablets = new ArrayList<>();
+    for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) {
+      tablets.addAll(convertInsertTabletNode(tabletNode));
+    }
+    return tablets;
+  }
+
+  // ======================== Table Model Conversion ========================
+
+  private List<Tablet> convertRelationalInsertRowNode(final RelationalInsertRowNode node) {
+    final String tableName = node.getTableName();
+
+    // Table-level pattern filtering
+    if (tablePattern != null) {
+      if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) {
+        return Collections.emptyList();
+      }
+      if (tableName != null && !tablePattern.matchesTable(tableName)) {
+        return Collections.emptyList();
+      }
+    }
+
+    final long time = node.getTime();
+    final String[] measurements = node.getMeasurements();
+    final TSDataType[] dataTypes = node.getDataTypes();
+    final Object[] values = node.getValues();
+
+    final int columnCount = measurements.length;
+    final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+    for (int i = 0; i < columnCount; i++) {
+      schemas.add(new MeasurementSchema(measurements[i], dataTypes[i]));
+    }
+
+    final Tablet tablet = new Tablet(tableName != null ? tableName : "", schemas, 1);
+    tablet.addTimestamp(0, time);
+
+    for (int i = 0; i < columnCount; i++) {
+      final Object value = values[i];
+      if (value == null) {
+        if (tablet.getBitMaps() == null) {
+          tablet.initBitMaps();
+        }
+        tablet.getBitMaps()[i].mark(0);
+      } else {
+        addValueToTablet(tablet, 0, i, dataTypes[i], value);
+      }
+    }
+    tablet.setRowSize(1);
+
+    return Collections.singletonList(tablet);
+  }
+
+  private List<Tablet> convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) {
+    final String tableName = node.getTableName();
+
+    // Table-level pattern filtering
+    if (tablePattern != null) {
+      if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) {
+        return Collections.emptyList();
+      }
+      if (tableName != null && !tablePattern.matchesTable(tableName)) {
+        return Collections.emptyList();
+      }
+    }
+
+    final String[] measurements = node.getMeasurements();
+    final TSDataType[] dataTypes = node.getDataTypes();
+    final long[] times = node.getTimes();
+    final Object[] columns = node.getColumns();
+    final BitMap[] bitMaps = node.getBitMaps();
+    final int rowCount = node.getRowCount();
+
+    final int columnCount = measurements.length;
+    final List<IMeasurementSchema> schemas = new ArrayList<>(columnCount);
+    for (int i = 0; i < columnCount; i++) {
+      schemas.add(new MeasurementSchema(measurements[i], dataTypes[i]));
+    }
+
+    final Tablet tablet = new Tablet(tableName != null ? tableName : "", schemas, rowCount);
+
+    for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) {
+      tablet.addTimestamp(rowIdx, times[rowIdx]);
+
+      for (int colIdx = 0; colIdx < columnCount; colIdx++) {
+        final boolean isNull =
+            (bitMaps != null && bitMaps[colIdx] != null && bitMaps[colIdx].isMarked(rowIdx));
+
+        if (isNull) {
+          if (tablet.getBitMaps() == null) {
+            tablet.initBitMaps();
+          }
+          tablet.getBitMaps()[colIdx].mark(rowIdx);
+        } else {
+          copyColumnValue(tablet, rowIdx, colIdx, dataTypes[colIdx], columns[colIdx], rowIdx);
+        }
+      }
+    }
+    tablet.setRowSize(rowCount);
+
+    return Collections.singletonList(tablet);
+  }
+
+  private List<Tablet> convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) {
+    final List<Tablet> tablets = new ArrayList<>();
+    for (final InsertRowNode rowNode : node.getInsertRowNodeList()) {
+      tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode));
+    }
+    return tablets;
+  }
+
+  // ======================== Helper Methods ========================
+
+  /**
+   * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all
+   * column indices are returned. Null measurement slots are always skipped.
+   */
+  private List<Integer> getMatchedTreeColumnIndices(
+      final IDeviceID deviceId, final String[] measurements) {
+    if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) {
+      // All columns match
+      final List<Integer> allIndices = new ArrayList<>(measurements.length);
+      for (int i = 0; i < measurements.length; i++) {
+        if (measurements[i] != null) {
+          allIndices.add(i);
+        }
+      }
+      return allIndices;
+    }
+
+    final List<Integer> matchedIndices = new ArrayList<>();
+    for (int i = 0; i < measurements.length; i++) {
+      if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) {
+        matchedIndices.add(i);
+      }
+    }
+    return matchedIndices;
+  }
+
+  /**
+   * Adds a single value to the tablet at the specified position.
+   *
+   * <p>IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which
+   * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly
+   * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call
+   * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is
+   * NOT null.
+   */
+  private void addValueToTablet(
+      final Tablet tablet,
+      final int rowIndex,
+      final int columnIndex,
+      final TSDataType dataType,
+      final Object value) {
+    switch (dataType) {
+      case BOOLEAN:
+        ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value;
+        break;
+      case INT32:
+      case DATE:
+        ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value;
+        break;
+      case INT64:
+      case TIMESTAMP:
+        ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value;
+        break;
+      case FLOAT:
+        ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value;
+        break;
+      case DOUBLE:
+        ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value;
+        break;
+      case TEXT:
+      case BLOB:
+      case STRING:
+        ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value;
+        break;
+      default:
+        LOGGER.warn("Unsupported data type: {}", dataType);
+        return;
+    }
+    // Unmark the bitmap position to indicate this value is NOT null.
+    // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null.
+    final BitMap[] bitMaps = tablet.getBitMaps();
+    if (bitMaps != null && bitMaps[columnIndex] != null) {
+      bitMaps[columnIndex].unmark(rowIndex);
+    }
+  }
+
+  /** Copies a single column value from the source column array to the tablet. */
+  private void copyColumnValue(
+      final Tablet tablet,
+      final int targetRowIndex,
+      final int targetColumnIndex,
+      final TSDataType dataType,
+      final Object sourceColumn,
+      final int sourceRowIndex) {
+    switch (dataType) {
+      case BOOLEAN:
+        ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((boolean[]) sourceColumn)[sourceRowIndex];
+        break;
+      case INT32:
+      case DATE:
+        ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((int[]) sourceColumn)[sourceRowIndex];
+        break;
+      case INT64:
+      case TIMESTAMP:
+        ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((long[]) sourceColumn)[sourceRowIndex];
+        break;
+      case FLOAT:
+        ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((float[]) sourceColumn)[sourceRowIndex];
+        break;
+      case DOUBLE:
+        ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((double[]) sourceColumn)[sourceRowIndex];
+        break;
+      case TEXT:
+      case BLOB:
+      case STRING:
+        ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] =
+            ((Binary[]) sourceColumn)[sourceRowIndex];
+        break;
+      default:
+        LOGGER.warn("Unsupported data type for copy: {}", dataType);
+        return;
+    }
+    // Unmark the bitmap position to indicate this value is NOT null.
+    final BitMap[] bitMaps = tablet.getBitMaps();
+    if (bitMaps != null && bitMaps[targetColumnIndex] != null) {
+      bitMaps[targetColumnIndex].unmark(targetRowIndex);
+    }
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
new file mode 100644
index 0000000000000..28743d1aae73c
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
@@ -0,0 +1,1179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.subscription.config.SubscriptionConfig;
+import org.apache.iotdb.consensus.common.request.IConsensusRequest;
+import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest;
+import org.apache.iotdb.consensus.common.request.IoTConsensusRequest;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.consensus.iot.log.ConsensusReqReader;
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode;
+import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode;
+import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry;
+import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
+import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
+import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType;
+import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload;
+
+import org.apache.tsfile.utils.Pair;
+import org.apache.tsfile.write.record.Tablet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.PriorityBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.LongSupplier;
+
+import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID;
+
+/**
+ * A prefetching queue that reads data from IoTConsensus using a hybrid approach:
+ *
+ * <ul>
+ *   <li><b>In-memory pending queue</b>: Registered with {@link IoTConsensusServerImpl}, receives
+ *       {@link IndexedConsensusRequest} in real-time from the write path (same mechanism as
+ *       LogDispatcher). This avoids waiting for WAL flush to disk.
+ *   <li><b>WAL fallback</b>: Uses {@link ConsensusReqReader.ReqIterator} to read from WAL files for
+ *       gap-filling (pending queue overflow) or catch-up scenarios.
+ *   <li><b>WAL pinning</b>: Supplies the earliest outstanding (uncommitted) search index to {@link
+ *       IoTConsensusServerImpl}, preventing WAL deletion of entries not yet consumed by the
+ *       subscription.
+ * </ul>
+ *
+ * <p>A background prefetch thread continuously drains the pending queue, converts InsertNode
+ * entries to Tablets via {@link ConsensusLogToTabletConverter}, and enqueues {@link
+ * SubscriptionEvent} objects into the prefetchingQueue for consumer polling.
+ *
+ * <p>This design mirrors LogDispatcher's dual-path (pendingEntries + WAL reader) but targets
+ * subscription delivery instead of replication.
+ *
+ * <p>Thread safety: Uses a fair {@link ReentrantReadWriteLock} to ensure mutual exclusion between
+ * cleanup and other operations (poll, ack, nack), consistent with the existing prefetching queue
+ * design.
+ */
+public class ConsensusPrefetchingQueue {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class);
+
+ private final String brokerId; // consumer group id
+ private final String topicName;
+ private final String consensusGroupId;
+
+ private final IoTConsensusServerImpl serverImpl;
+
+ private final ConsensusReqReader consensusReqReader;
+
+ private volatile ConsensusReqReader.ReqIterator reqIterator;
+
+ /**
+ * In-memory pending queue registered with {@link IoTConsensusServerImpl#write}. Receives
+ * IndexedConsensusRequest in real-time without waiting for WAL flush. Capacity is bounded to
+ * apply back-pressure; overflows are filled from WAL.
+ */
+  private final BlockingQueue<IndexedConsensusRequest> pendingEntries;
+
+ private static final int PENDING_QUEUE_CAPACITY = 4096;
+
+ private final ConsensusLogToTabletConverter converter;
+
+ private final ConsensusSubscriptionCommitManager commitManager;
+
+ /**
+ * Cached LongSupplier instance for WAL pinning registration. Must be the SAME object reference
+ * for both registerSubscriptionQueue and unregisterSubscriptionQueue, because
+ * CopyOnWriteArrayList.remove() uses equals() which defaults to reference equality for lambdas.
+ * Using this::method would create a new lambda instance each time, causing remove() to fail and
+ * WAL to be pinned indefinitely.
+ */
+ private final LongSupplier walPinSupplier;
+
+ /** Commit ID generator, monotonically increasing within this queue's lifetime. */
+ private final AtomicLong commitIdGenerator;
+
+ /** Records the initial commit ID for outdated event detection. */
+ private final long initialCommitId;
+
+ private final AtomicLong nextExpectedSearchIndex;
+
+  private final PriorityBlockingQueue<SubscriptionEvent> prefetchingQueue;
+
+ /**
+ * Tracks in-flight events that have been polled but not yet committed. Key: (consumerId,
+ * commitContext) -> event.
+ */
+  private final Map<Pair<String, SubscriptionCommitContext>, SubscriptionEvent> inFlightEvents;
+
+ /**
+ * Tracks outstanding (uncommitted) events for WAL pinning. Maps commitId to the startSearchIndex
+ * of that event batch. The earliest entry's value is supplied to IoTConsensusServerImpl to pin
+ * WAL files from deletion.
+ */
+  private final ConcurrentSkipListMap<Long, Long> outstandingCommitIdToStartIndex;
+
+ private static final int MAX_TABLETS_PER_EVENT = 64;
+
+ private static final int MAX_WAL_ENTRIES_PER_PREFETCH = 128;
+
+ private static final int MAX_PREFETCHING_QUEUE_SIZE = 256;
+
+ private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
+
+ private volatile boolean isClosed = false;
+
+ /**
+ * Background thread that drains pendingEntries and fills prefetchingQueue. TODO: manage thread
+ * count
+ */
+ private final Thread prefetchThread;
+
+  /**
+   * Creates the queue, registers the in-memory pending queue and WAL-pin supplier with the
+   * consensus server, and starts the background prefetch thread.
+   *
+   * @param brokerId consumer group id
+   * @param topicName subscribed topic name
+   * @param consensusGroupId id of the consensus group (data region) this queue reads from
+   * @param serverImpl consensus server this queue registers with
+   * @param converter converts InsertNode WAL entries to Tablets
+   * @param commitManager manages subscription commit contexts
+   * @param startSearchIndex first search index to read (WAL iterator start position)
+   * @param sharedCommitIdGenerator commit id generator shared so ids stay monotonic across queues
+   */
+  // NOTE(review): the prefetch thread is started inside the constructor, so `this` escapes to
+  // another thread before construction completes — confirm all fields the loop reads are
+  // assigned before the thread start (they are, in source order) and the class stays final.
+  public ConsensusPrefetchingQueue(
+      final String brokerId,
+      final String topicName,
+      final String consensusGroupId,
+      final IoTConsensusServerImpl serverImpl,
+      final ConsensusLogToTabletConverter converter,
+      final ConsensusSubscriptionCommitManager commitManager,
+      final long startSearchIndex,
+      final AtomicLong sharedCommitIdGenerator) {
+    this.brokerId = brokerId;
+    this.topicName = topicName;
+    this.consensusGroupId = consensusGroupId;
+    this.serverImpl = serverImpl;
+    this.consensusReqReader = serverImpl.getConsensusReqReader();
+    this.converter = converter;
+    this.commitManager = commitManager;
+
+    // Snapshot the generator's current value so outdated commit contexts can be detected later.
+    this.commitIdGenerator = sharedCommitIdGenerator;
+    this.initialCommitId = commitIdGenerator.get();
+    this.nextExpectedSearchIndex = new AtomicLong(startSearchIndex);
+    this.reqIterator = consensusReqReader.getReqIterator(startSearchIndex);
+
+    this.prefetchingQueue = new PriorityBlockingQueue<>();
+    this.inFlightEvents = new ConcurrentHashMap<>();
+    this.outstandingCommitIdToStartIndex = new ConcurrentSkipListMap<>();
+
+    // Create and register the in-memory pending queue with IoTConsensusServerImpl.
+    // IMPORTANT: walPinSupplier is stored as a field (not a method reference) to ensure the
+    // same object reference is used for both register and unregister.
+    this.pendingEntries = new ArrayBlockingQueue<>(PENDING_QUEUE_CAPACITY);
+    this.walPinSupplier = this::getEarliestOutstandingSearchIndex;
+    serverImpl.registerSubscriptionQueue(pendingEntries, walPinSupplier);
+
+    // Start background prefetch thread (daemon: must not block JVM shutdown).
+    this.prefetchThread =
+        new Thread(this::prefetchLoop, "ConsensusPrefetch-" + brokerId + "-" + topicName);
+    this.prefetchThread.setDaemon(true);
+    this.prefetchThread.start();
+
+    LOGGER.info(
+        "ConsensusPrefetchingQueue created: brokerId={}, topicName={}, consensusGroupId={}, "
+            + "startSearchIndex={}",
+        brokerId,
+        topicName,
+        consensusGroupId,
+        startSearchIndex);
+  }
+
+  /**
+   * Returns the earliest outstanding (uncommitted) search index for WAL pinning. If there are no
+   * outstanding events, returns the next expected search index (nothing to pin beyond what we've
+   * already processed).
+   */
+  private long getEarliestOutstandingSearchIndex() {
+    // firstEntry() returns a snapshot entry; safe under concurrent mutation of the skip-list map.
+    final Map.Entry<Long, Long> first = outstandingCommitIdToStartIndex.firstEntry();
+    if (first != null) {
+      return first.getValue();
+    }
+    return nextExpectedSearchIndex.get();
+  }
+
+  // ======================== Lock Operations ========================
+
+  // Fair read/write lock discipline: poll/prefetch paths take the read lock (they may run
+  // concurrently); cleanup/close paths take the write lock for exclusive access.
+
+  private void acquireReadLock() {
+    lock.readLock().lock();
+  }
+
+  private void releaseReadLock() {
+    lock.readLock().unlock();
+  }
+
+  private void acquireWriteLock() {
+    lock.writeLock().lock();
+  }
+
+  private void releaseWriteLock() {
+    lock.writeLock().unlock();
+  }
+
+ // ======================== Poll ========================
+
+  /**
+   * Polls one event for the given consumer, or returns null when the queue is closed or empty.
+   * Holds the read lock so poll can run concurrently with other read-path operations but not
+   * with cleanup.
+   */
+  public SubscriptionEvent poll(final String consumerId) {
+    acquireReadLock();
+    try {
+      return isClosed ? null : pollInternal(consumerId);
+    } finally {
+      releaseReadLock();
+    }
+  }
+
+  /**
+   * Serves one pollable event to {@code consumerId}. First recycles that consumer's uncommitted
+   * in-flight events back into the queue, then scans at most the current queue size entries:
+   * committed events are dropped (broken invariant), non-pollable events are nack'ed, and the
+   * first pollable event is recorded as in-flight and returned. Returns null when nothing is
+   * available or the thread is interrupted while waiting.
+   */
+  private SubscriptionEvent pollInternal(final String consumerId) {
+    // Recycle any uncommitted in-flight events for this consumer before serving new data.
+    final int recycled = recycleInFlightEventsForConsumer(consumerId);
+    if (recycled > 0) {
+      LOGGER.debug(
+          "ConsensusPrefetchingQueue {}: recycled {} uncommitted in-flight events for "
+              + "consumer {} back to prefetching queue",
+          this,
+          recycled,
+          consumerId);
+    }
+
+    // NOTE(review): PriorityBlockingQueue.size() returns int; the long here is harmless but
+    // unnecessary. The snapshot bounds the scan so we do not loop forever on recycled events.
+    final long size = prefetchingQueue.size();
+    if (size == 0) {
+      LOGGER.debug(
+          "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, "
+              + "pendingEntriesSize={}, nextExpected={}, isClosed={}, threadAlive={}",
+          this,
+          consumerId,
+          pendingEntries.size(),
+          nextExpectedSearchIndex.get(),
+          isClosed,
+          prefetchThread.isAlive());
+      return null;
+    }
+
+    LOGGER.debug(
+        "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}",
+        this,
+        size,
+        consumerId);
+    long count = 0;
+
+    SubscriptionEvent event;
+    try {
+      // Bounded scan: examine at most `size` events, blocking briefly on each poll.
+      while (count++ < size
+          && Objects.nonNull(
+              event =
+                  prefetchingQueue.poll(
+                      SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(),
+                      TimeUnit.MILLISECONDS))) {
+        if (event.isCommitted()) {
+          // Committed events must never linger in the queue; drop silently after warning.
+          LOGGER.warn(
+              "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it",
+              this,
+              event);
+          continue;
+        }
+
+        if (!event.pollable()) {
+          // Non-pollable events are returned to circulation via nack rather than dropped.
+          LOGGER.warn(
+              "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it",
+              this,
+              event);
+          event.nack();
+          continue;
+        }
+
+        // Mark as polled before updating inFlightEvents
+        event.recordLastPolledTimestamp();
+        inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event);
+        event.recordLastPolledConsumerId(consumerId);
+        return event;
+      }
+    } catch (final InterruptedException e) {
+      // Restore the interrupt flag so callers can observe the interruption.
+      Thread.currentThread().interrupt();
+      LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e);
+    }
+
+    return null;
+  }
+
+  /**
+   * Returns the in-flight event previously polled by {@code consumerId} under
+   * {@code commitContext}, used to fetch further tablet batches of a multi-response event.
+   * When no matching in-flight event exists, responds with an "outdated" error if the commit
+   * context predates this queue's lifetime, otherwise with a generic error response.
+   * ({@code isCommitContextOutdated}/{@code generate*ErrorResponse} are defined elsewhere in
+   * this class.)
+   */
+  public SubscriptionEvent pollTablets(
+      final String consumerId, final SubscriptionCommitContext commitContext, final int offset) {
+    acquireReadLock();
+    try {
+      if (isClosed) {
+        return null;
+      }
+      final SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext));
+      if (Objects.isNull(event)) {
+        if (isCommitContextOutdated(commitContext)) {
+          return generateOutdatedErrorResponse();
+        }
+        return generateErrorResponse(
+            String.format(
+                "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s",
+                this, consumerId, commitContext));
+      }
+      return event;
+    } finally {
+      releaseReadLock();
+    }
+  }
+
+ // ======================== Background Prefetch ========================
+
+  /**
+   * Invoked by the broker's prefetch scheduler: recycles pollable in-flight events back into the
+   * prefetching queue and reports whether the queue currently has events available to serve.
+   * Actual WAL/pending-queue draining happens on the dedicated prefetch thread, not here.
+   *
+   * @return true if the prefetching queue is non-empty after recycling
+   */
+  public boolean executePrefetch() {
+    acquireReadLock();
+    try {
+      if (isClosed) {
+        return false;
+      }
+      // Recycle pollable events from inFlightEvents back to prefetchingQueue
+      recycleInFlightEvents();
+      return !prefetchingQueue.isEmpty();
+    } finally {
+      releaseReadLock();
+    }
+  }
+
+ private static final long PENDING_DRAIN_TIMEOUT_MS = 200;
+
+ private static final long WAL_WAIT_TIMEOUT_SECONDS = 2;
+
+  /**
+   * Background prefetch loop. Continuously drains from pendingEntries (in-memory, real-time),
+   * detects gaps and fills from WAL reader, converts to Tablets, and enqueues SubscriptionEvents.
+   *
+   * <p>Applies back-pressure by sleeping while the prefetching queue is full, and survives
+   * per-iteration failures (including {@link Error}) with a short pause so a single bad entry
+   * cannot kill the thread. Exits on close or interruption.
+   */
+  private void prefetchLoop() {
+    LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread started", this);
+    try {
+      while (!isClosed && !Thread.currentThread().isInterrupted()) {
+        try {
+          // Back-pressure: wait if prefetchingQueue is full
+          if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) {
+            Thread.sleep(50);
+            continue;
+          }
+
+          // Try to drain from pending entries (in-memory, fast path)
+          final List<IndexedConsensusRequest> batch = new ArrayList<>();
+          // Block briefly for first entry
+          final IndexedConsensusRequest first =
+              pendingEntries.poll(PENDING_DRAIN_TIMEOUT_MS, TimeUnit.MILLISECONDS);
+          if (first != null) {
+            batch.add(first);
+            // Drain more non-blocking
+            int drained = 0;
+            IndexedConsensusRequest next;
+            while (drained < MAX_WAL_ENTRIES_PER_PREFETCH - 1
+                && (next = pendingEntries.poll()) != null) {
+              batch.add(next);
+              drained++;
+            }
+          }
+
+          if (!batch.isEmpty()) {
+            LOGGER.debug(
+                "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, "
+                    + "first searchIndex={}, last searchIndex={}, nextExpected={}, "
+                    + "prefetchingQueueSize={}",
+                this,
+                batch.size(),
+                batch.get(0).getSearchIndex(),
+                batch.get(batch.size() - 1).getSearchIndex(),
+                nextExpectedSearchIndex.get(),
+                prefetchingQueue.size());
+            processBatchFromPending(batch);
+          } else {
+            // Pending queue was empty - try catch-up from WAL for any gaps
+            // (entries may have been dropped due to pending queue overflow)
+            tryCatchUpFromWAL();
+          }
+        } catch (final InterruptedException e) {
+          Thread.currentThread().interrupt();
+          break;
+        } catch (final Throwable t) {
+          LOGGER.error(
+              "ConsensusPrefetchingQueue {}: CRITICAL error in prefetch loop "
+                  + "(type={}, message={})",
+              this,
+              t.getClass().getName(),
+              t.getMessage(),
+              t);
+          if (t instanceof Error) {
+            LOGGER.error(
+                "ConsensusPrefetchingQueue {}: caught Error in prefetch loop, "
+                    + "will attempt to continue",
+                this);
+          }
+          try {
+            Thread.sleep(100);
+          } catch (final InterruptedException ie) {
+            Thread.currentThread().interrupt();
+            break;
+          }
+        }
+      }
+    } catch (final Throwable fatal) {
+      LOGGER.error(
+          "ConsensusPrefetchingQueue {}: FATAL uncaught throwable escaped prefetch loop "
+              + "(type={}, message={})",
+          this,
+          fatal.getClass().getName(),
+          fatal.getMessage(),
+          fatal);
+    }
+    LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread stopped", this);
+  }
+
+ private void processBatchFromPending(final List batch) {
+ final List batchedTablets = new ArrayList<>();
+ long batchStartSearchIndex = nextExpectedSearchIndex.get();
+ long batchEndSearchIndex = batchStartSearchIndex;
+ int processedCount = 0;
+ int skippedCount = 0;
+ int nullDeserCount = 0;
+ int emptyConvertCount = 0;
+
+ for (final IndexedConsensusRequest request : batch) {
+ final long searchIndex = request.getSearchIndex();
+
+ // Detect gap: if searchIndex > nextExpected, entries were dropped from pending queue.
+ // Fill the gap from WAL.
+ final long expected = nextExpectedSearchIndex.get();
+ if (searchIndex > expected) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: gap detected, expected={}, got={}. "
+ + "Filling {} entries from WAL.",
+ this,
+ expected,
+ searchIndex,
+ searchIndex - expected);
+ fillGapFromWAL(expected, searchIndex, batchedTablets);
+ }
+
+ if (searchIndex < nextExpectedSearchIndex.get()) {
+ // Already processed (e.g., gap fill covered this entry), skip
+ skippedCount++;
+ continue;
+ }
+
+ // Process this entry
+ final InsertNode insertNode = deserializeToInsertNode(request);
+ if (insertNode != null) {
+ final List tablets = converter.convert(insertNode);
+ if (!tablets.isEmpty()) {
+ batchedTablets.addAll(tablets);
+ batchEndSearchIndex = searchIndex;
+ processedCount++;
+ } else {
+ emptyConvertCount++;
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: converter returned empty tablets for "
+ + "searchIndex={}, insertNodeType={}, deviceId={}",
+ this,
+ searchIndex,
+ insertNode.getType(),
+ ConsensusLogToTabletConverter.safeDeviceIdForLog(insertNode));
+ }
+ } else {
+ nullDeserCount++;
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: deserializeToInsertNode returned null for "
+ + "searchIndex={}, requestType={}",
+ this,
+ searchIndex,
+ request.getRequests().isEmpty()
+ ? "EMPTY"
+ : request.getRequests().get(0).getClass().getSimpleName());
+ }
+ nextExpectedSearchIndex.set(searchIndex + 1);
+
+ // Flush batch if large enough
+ if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) {
+ createAndEnqueueEvent(
+ new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex);
+ batchedTablets.clear();
+ // Reset start index for the next sub-batch so that
+ // outstandingCommitIdToStartIndex records the correct WAL pin position
+ batchStartSearchIndex = nextExpectedSearchIndex.get();
+ }
+ }
+
+ // Update WAL reader position to stay in sync
+ syncReqIteratorPosition();
+
+ // Flush remaining tablets
+ if (!batchedTablets.isEmpty()) {
+ createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex);
+ }
+
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: batch processing complete, "
+ + "batchSize={}, processed={}, skipped={}, nullDeser={}, emptyConvert={}, "
+ + "tabletsCreated={}, nextExpected={}, prefetchQueueSize={}",
+ this,
+ batch.size(),
+ processedCount,
+ skippedCount,
+ nullDeserCount,
+ emptyConvertCount,
+ batchedTablets.size(),
+ nextExpectedSearchIndex.get(),
+ prefetchingQueue.size());
+ }
+
+ /**
+ * Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected
+ * between nextExpectedSearchIndex and an incoming entry's searchIndex.
+ */
+ private void fillGapFromWAL(
+ final long fromIndex, final long toIndex, final List batchedTablets) {
+ // Re-position WAL reader to the gap start
+ reqIterator = consensusReqReader.getReqIterator(fromIndex);
+
+ while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) {
+ try {
+ final IndexedConsensusRequest walEntry = reqIterator.next();
+ final long walIndex = walEntry.getSearchIndex();
+ if (walIndex < nextExpectedSearchIndex.get()) {
+ continue; // already processed
+ }
+
+ final InsertNode insertNode = deserializeToInsertNode(walEntry);
+ if (insertNode != null) {
+ final List tablets = converter.convert(insertNode);
+ batchedTablets.addAll(tablets);
+ }
+ nextExpectedSearchIndex.set(walIndex + 1);
+ } catch (final Exception e) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}",
+ this,
+ nextExpectedSearchIndex.get(),
+ e);
+ break;
+ }
+ }
+
+ // If WAL doesn't have the gap entries yet (still in memory buffer), wait briefly
+ if (nextExpectedSearchIndex.get() < toIndex) {
+ try {
+ reqIterator.waitForNextReady(WAL_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+ while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) {
+ final IndexedConsensusRequest walEntry = reqIterator.next();
+ final long walIndex = walEntry.getSearchIndex();
+ if (walIndex < nextExpectedSearchIndex.get()) {
+ continue;
+ }
+ final InsertNode insertNode = deserializeToInsertNode(walEntry);
+ if (insertNode != null) {
+ final List tablets = converter.convert(insertNode);
+ batchedTablets.addAll(tablets);
+ }
+ nextExpectedSearchIndex.set(walIndex + 1);
+ }
+ } catch (final InterruptedException e) {
+ Thread.currentThread().interrupt();
+ } catch (final TimeoutException e) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: timeout waiting for WAL gap fill [{}, {})",
+ this,
+ nextExpectedSearchIndex.get(),
+ toIndex);
+ }
+ }
+ }
+
+ /**
+ * Try catch-up from WAL when the pending queue was empty. This handles cold-start or scenarios
+ * where the subscription started after data was already written.
+ */
+ private void tryCatchUpFromWAL() {
+ // Re-position WAL reader
+ syncReqIteratorPosition();
+
+ if (!reqIterator.hasNext()) {
+ // No data on disk either - nothing to do
+ return;
+ }
+
+ final List batchedTablets = new ArrayList<>();
+ long batchStartSearchIndex = nextExpectedSearchIndex.get();
+ long batchEndSearchIndex = batchStartSearchIndex;
+ int entriesRead = 0;
+
+ while (entriesRead < MAX_WAL_ENTRIES_PER_PREFETCH
+ && reqIterator.hasNext()
+ && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) {
+ try {
+ final IndexedConsensusRequest walEntry = reqIterator.next();
+ final long walIndex = walEntry.getSearchIndex();
+ entriesRead++;
+
+ if (walIndex < nextExpectedSearchIndex.get()) {
+ continue;
+ }
+
+ final InsertNode insertNode = deserializeToInsertNode(walEntry);
+ if (insertNode != null) {
+ final List tablets = converter.convert(insertNode);
+ if (!tablets.isEmpty()) {
+ batchedTablets.addAll(tablets);
+ batchEndSearchIndex = walIndex;
+ }
+ }
+ nextExpectedSearchIndex.set(walIndex + 1);
+
+ if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) {
+ createAndEnqueueEvent(
+ new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex);
+ batchedTablets.clear();
+ // Reset start index for the next sub-batch
+ batchStartSearchIndex = nextExpectedSearchIndex.get();
+ }
+ } catch (final Exception e) {
+ LOGGER.warn("ConsensusPrefetchingQueue {}: error reading WAL for catch-up", this, e);
+ break;
+ }
+ }
+
+ if (!batchedTablets.isEmpty()) {
+ createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex);
+ }
+
+ if (entriesRead > 0) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: WAL catch-up read {} entries, "
+ + "nextExpectedSearchIndex={}",
+ this,
+ entriesRead,
+ nextExpectedSearchIndex.get());
+ }
+ }
+
+ /**
+ * Re-positions the WAL reader to the current nextExpectedSearchIndex. Called before reading from
+ * WAL to ensure the iterator is in sync with tracking position.
+ */
+ private void syncReqIteratorPosition() {
+ reqIterator = consensusReqReader.getReqIterator(nextExpectedSearchIndex.get());
+ }
+
+ /**
+ * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an
+ * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers),
+ * and a single logical write may be split across multiple fragments (SearchNode). This method
+ * handles both cases.
+ *
+ * The deserialization follows the same pattern as {@code
+ * DataRegionStateMachine.grabPlanNode()}.
+ */
+ private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) {
+ final List searchNodes = new ArrayList<>();
+ PlanNode nonSearchNode = null;
+
+ for (final IConsensusRequest req : indexedRequest.getRequests()) {
+ PlanNode planNode;
+ try {
+ if (req instanceof IoTConsensusRequest) {
+ // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer)
+ planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer());
+ } else if (req instanceof InsertNode) {
+ // In-memory entries (not yet flushed to WAL file) may already be PlanNode
+ planNode = (PlanNode) req;
+ } else {
+ // ByteBufferConsensusRequest or unknown
+ planNode = PlanNodeType.deserialize(req.serializeToByteBuffer());
+ }
+ } catch (final Exception e) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest "
+ + "(type={}) in searchIndex={}: {}",
+ this,
+ req.getClass().getSimpleName(),
+ indexedRequest.getSearchIndex(),
+ e.getMessage(),
+ e);
+ continue;
+ }
+
+ if (planNode instanceof SearchNode) {
+ ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex());
+ searchNodes.add((SearchNode) planNode);
+ } else {
+ nonSearchNode = planNode;
+ }
+ }
+
+ // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode)
+ if (!searchNodes.isEmpty()) {
+ final PlanNode merged = searchNodes.get(0).merge(searchNodes);
+ if (merged instanceof InsertNode) {
+ final InsertNode mergedInsert = (InsertNode) merged;
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, "
+ + "type={}, deviceId={}, searchNodeCount={}",
+ this,
+ indexedRequest.getSearchIndex(),
+ mergedInsert.getType(),
+ ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert),
+ searchNodes.size());
+
+ return mergedInsert;
+ }
+ }
+
+ if (nonSearchNode != null) {
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}",
+ this,
+ indexedRequest.getSearchIndex(),
+ nonSearchNode.getClass().getSimpleName());
+ }
+
+ return null;
+ }
+
+ private void createAndEnqueueEvent(
+ final List tablets, final long startSearchIndex, final long endSearchIndex) {
+ if (tablets.isEmpty()) {
+ return;
+ }
+
+ final long commitId = commitIdGenerator.getAndIncrement();
+
+ // Record the mapping from commitId to the end searchIndex
+ // so that when the client commits, we know which WAL position has been consumed
+ commitManager.recordCommitMapping(
+ brokerId, topicName, consensusGroupId, commitId, endSearchIndex);
+
+ // Track outstanding event for WAL pinning
+ outstandingCommitIdToStartIndex.put(commitId, startSearchIndex);
+
+ final SubscriptionCommitContext commitContext =
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ commitId);
+
+ // nextOffset <= 0 means all tablets delivered in single batch
+ // -tablets.size() indicates total count
+ // Use Map> constructor with actual database name for table model;
+ final TabletsPayload payload =
+ new TabletsPayload(
+ Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size());
+
+ final SubscriptionEvent event =
+ new SubscriptionEvent(
+ SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext);
+
+ prefetchingQueue.add(event);
+
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, "
+ + "searchIndex range [{}, {}], commitId={}, prefetchQueueSize={}",
+ this,
+ tablets.size(),
+ startSearchIndex,
+ endSearchIndex,
+ commitId,
+ prefetchingQueue.size());
+ }
+
+ // ======================== Commit (Ack/Nack) ========================
+
+ public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) {
+ acquireReadLock();
+ try {
+ return !isClosed && ackInternal(consumerId, commitContext);
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ private boolean ackInternal(
+ final String consumerId, final SubscriptionCommitContext commitContext) {
+ final AtomicBoolean acked = new AtomicBoolean(false);
+ final long commitId = commitContext.getCommitId();
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack",
+ this,
+ commitContext);
+ return null;
+ }
+
+ if (ev.isCommitted()) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext);
+ ev.cleanUp(false);
+ return null;
+ }
+
+ ev.ack();
+ ev.recordCommittedTimestamp();
+ acked.set(true);
+
+ ev.cleanUp(false);
+ return null;
+ });
+
+ if (acked.get()) {
+ commitManager.commit(brokerId, topicName, consensusGroupId, commitId);
+ outstandingCommitIdToStartIndex.remove(commitId);
+ }
+
+ return acked.get();
+ }
+
+ public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) {
+ acquireReadLock();
+ try {
+ return !isClosed && nackInternal(consumerId, commitContext);
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ /**
+ * Silent version of ack: returns false without logging if the commit context is not found. Used
+ * in multi-region iteration where only one queue owns the event.
+ */
+ public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) {
+ acquireReadLock();
+ try {
+ if (isClosed) {
+ return false;
+ }
+ final AtomicBoolean acked = new AtomicBoolean(false);
+ final long commitId = commitContext.getCommitId();
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ ev.ack();
+ ev.recordCommittedTimestamp();
+ acked.set(true);
+ ev.cleanUp(false);
+ return null;
+ });
+ if (acked.get()) {
+ commitManager.commit(brokerId, topicName, consensusGroupId, commitId);
+ outstandingCommitIdToStartIndex.remove(commitId);
+ }
+ return acked.get();
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ /**
+ * Silent version of nack: returns false without logging if the commit context is not found. Used
+ * in multi-region iteration where only one queue owns the event.
+ */
+ public boolean nackSilent(
+ final String consumerId, final SubscriptionCommitContext commitContext) {
+ acquireReadLock();
+ try {
+ if (isClosed) {
+ return false;
+ }
+ final AtomicBoolean nacked = new AtomicBoolean(false);
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ ev.nack();
+ nacked.set(true);
+ prefetchingQueue.add(ev);
+ return null;
+ });
+ return nacked.get();
+ } finally {
+ releaseReadLock();
+ }
+ }
+
+ private boolean nackInternal(
+ final String consumerId, final SubscriptionCommitContext commitContext) {
+ final AtomicBoolean nacked = new AtomicBoolean(false);
+ inFlightEvents.compute(
+ new Pair<>(consumerId, commitContext),
+ (key, ev) -> {
+ if (Objects.isNull(ev)) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack",
+ this,
+ commitContext);
+ return null;
+ }
+
+ ev.nack();
+ nacked.set(true);
+ prefetchingQueue.add(ev);
+ return null;
+ });
+
+ return nacked.get();
+ }
+
+ // ======================== Recycle ========================
+
+ /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue. */
+ private void recycleInFlightEvents() {
+ for (final Pair key :
+ new ArrayList<>(inFlightEvents.keySet())) {
+ inFlightEvents.compute(
+ key,
+ (k, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ if (ev.pollable()) {
+ ev.nack();
+ prefetchingQueue.add(ev);
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue",
+ this,
+ ev);
+ return null;
+ }
+ return ev;
+ });
+ }
+ }
+
+ /**
+ * Maximum number of nack cycles before an in-flight event is kept in place rather than
+ * re-enqueued. Prevents infinite re-delivery loops when a consumer repeatedly polls without
+ * committing. Beyond this threshold, the event stays in inFlightEvents and will eventually be
+ * recycled by the timeout-based {@link #recycleInFlightEvents()} when it becomes pollable.
+ */
+ private static final long MAX_CONSUMER_RECYCLE_NACK_COUNT = 10;
+
+ /**
+ * Recycles uncommitted in-flight events belonging to the given consumer back to the prefetching
+ * queue. This provides at-least-once delivery: when a consumer polls again without committing,
+ * the previously delivered events are nacked and re-queued for re-delivery.
+ *
+ * Events that have been nacked more than {@link #MAX_CONSUMER_RECYCLE_NACK_COUNT} times are
+ * left in-flight to avoid infinite re-delivery loops. They will be cleaned up by the periodic
+ * timeout-based recycler instead.
+ *
+ * @return the number of events recycled
+ */
+ private int recycleInFlightEventsForConsumer(final String consumerId) {
+ final AtomicInteger count = new AtomicInteger(0);
+ for (final Pair key :
+ new ArrayList<>(inFlightEvents.keySet())) {
+ if (!key.getLeft().equals(consumerId)) {
+ continue;
+ }
+ inFlightEvents.compute(
+ key,
+ (k, ev) -> {
+ if (Objects.isNull(ev)) {
+ return null;
+ }
+ if (ev.isCommitted()) {
+ ev.cleanUp(false);
+ return null;
+ }
+ // If the event has been nacked too many times, leave it and let the timeout recycler
+ // handle it.
+ if (ev.getNackCount() >= MAX_CONSUMER_RECYCLE_NACK_COUNT) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: event {} for consumer {} exceeded max nack "
+ + "count ({}), skipping recycle to prevent infinite loop",
+ this,
+ ev,
+ consumerId,
+ MAX_CONSUMER_RECYCLE_NACK_COUNT);
+ return ev; // keep in inFlightEvents
+ }
+ ev.nack();
+ prefetchingQueue.add(ev);
+ count.incrementAndGet();
+ LOGGER.debug(
+ "ConsensusPrefetchingQueue {}: recycled uncommitted event {} for consumer {} "
+ + "back to prefetching queue",
+ this,
+ ev,
+ consumerId);
+ return null;
+ });
+ }
+ return count.get();
+ }
+
+ // ======================== Cleanup ========================
+
+ public void cleanUp() {
+ acquireWriteLock();
+ try {
+ prefetchingQueue.forEach(event -> event.cleanUp(true));
+ prefetchingQueue.clear();
+
+ inFlightEvents.values().forEach(event -> event.cleanUp(true));
+ inFlightEvents.clear();
+ } finally {
+ releaseWriteLock();
+ }
+ }
+
+ public void close() {
+ markClosed();
+ // Stop background prefetch thread
+ prefetchThread.interrupt();
+ try {
+ prefetchThread.join(5000);
+ } catch (final InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL).
+ serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier);
+ cleanUp();
+ // Persist progress before closing
+ commitManager.persistAll();
+ }
+
+ private SubscriptionEvent generateErrorResponse(final String errorMessage) {
+ return new SubscriptionEvent(
+ SubscriptionPollResponseType.ERROR.getType(),
+ new ErrorPayload(errorMessage, false),
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ INVALID_COMMIT_ID));
+ }
+
+ private SubscriptionEvent generateOutdatedErrorResponse() {
+ return new SubscriptionEvent(
+ SubscriptionPollResponseType.ERROR.getType(),
+ ErrorPayload.OUTDATED_ERROR_PAYLOAD,
+ new SubscriptionCommitContext(
+ IoTDBDescriptor.getInstance().getConfig().getDataNodeId(),
+ PipeDataNodeAgent.runtime().getRebootTimes(),
+ topicName,
+ brokerId,
+ INVALID_COMMIT_ID));
+ }
+
+ public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
+ return PipeDataNodeAgent.runtime().getRebootTimes() > commitContext.getRebootTimes()
+ || initialCommitId > commitContext.getCommitId();
+ }
+
  // ======================== Status ========================

  /** @return whether this queue has been marked closed */
  public boolean isClosed() {
    return isClosed;
  }

  /** Marks this queue closed; subsequent ack/nack/prefetch calls become no-ops. */
  public void markClosed() {
    isClosed = true;
  }

  /** @return unique identifier of this queue, composed of brokerId and topicName */
  public String getPrefetchingQueueId() {
    return brokerId + "_" + topicName;
  }

  /** @return number of delivered-but-uncommitted (in-flight) events */
  public long getSubscriptionUncommittedEventCount() {
    return inFlightEvents.size();
  }

  /** @return the next commitId that will be assigned by createAndEnqueueEvent */
  public long getCurrentCommitId() {
    return commitIdGenerator.get();
  }

  /** @return number of events currently waiting in the prefetching queue */
  public int getPrefetchedEventCount() {
    return prefetchingQueue.size();
  }

  /** @return the next WAL searchIndex the prefetcher expects to process */
  public long getCurrentReadSearchIndex() {
    return nextExpectedSearchIndex.get();
  }

  public String getBrokerId() {
    return brokerId;
  }

  public String getTopicName() {
    return topicName;
  }

  public String getConsensusGroupId() {
    return consensusGroupId;
  }
+
+ // ======================== Stringify ========================
+
+ public Map coreReportMessage() {
+ final Map result = new HashMap<>();
+ result.put("brokerId", brokerId);
+ result.put("topicName", topicName);
+ result.put("consensusGroupId", consensusGroupId);
+ result.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get()));
+ result.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size()));
+ result.put("inFlightEventsSize", String.valueOf(inFlightEvents.size()));
+ result.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size()));
+ result.put("pendingEntriesSize", String.valueOf(pendingEntries.size()));
+ result.put("commitIdGenerator", commitIdGenerator.toString());
+ result.put("isClosed", String.valueOf(isClosed));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "ConsensusPrefetchingQueue" + coreReportMessage();
+ }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
new file mode 100644
index 0000000000000..4096394ad6a33
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
+
/**
 * Manages commit state for consensus-based subscriptions.
 *
 * <p>This manager tracks which events have been committed by consumers and maps commit IDs back to
 * WAL search indices. It maintains the progress for each (consumerGroup, topic, region) triple and
 * supports persistence and recovery.
 *
 * <p>Progress is tracked per-region because searchIndex is region-local — each DataRegion
 * has its own independent WAL with its own searchIndex namespace. Using a single state per topic
 * would cause TreeSet deduplication bugs when different regions emit the same searchIndex value.
 *
 * <p>Key responsibilities:
 *
 * <ul>
 *   <li>Track the mapping from commitId to searchIndex
 *   <li>Handle commit/ack from consumers
 *   <li>Persist and recover progress state
 * </ul>
 */
+public class ConsensusSubscriptionCommitManager {
+
+ private static final Logger LOGGER =
+ LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class);
+
+ private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_";
+ private static final String PROGRESS_FILE_SUFFIX = ".dat";
+
+ /** Key: "consumerGroupId_topicName_regionId" -> progress tracking state */
+ private final Map commitStates =
+ new ConcurrentHashMap<>();
+
+ private final String persistDir;
+
+ private ConsensusSubscriptionCommitManager() {
+ this.persistDir =
+ IoTDBDescriptor.getInstance().getConfig().getSystemDir()
+ + File.separator
+ + "subscription"
+ + File.separator
+ + "consensus_progress";
+ final File dir = new File(persistDir);
+ if (!dir.exists()) {
+ dir.mkdirs();
+ }
+ }
+
+ /**
+ * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ * @param regionId the consensus group / data region ID string
+ * @return the commit state
+ */
+ public ConsensusSubscriptionCommitState getOrCreateState(
+ final String consumerGroupId, final String topicName, final String regionId) {
+ final String key = generateKey(consumerGroupId, topicName, regionId);
+ return commitStates.computeIfAbsent(
+ key,
+ k -> {
+ // Try to recover from persisted state
+ final ConsensusSubscriptionCommitState recovered = tryRecover(key);
+ if (recovered != null) {
+ return recovered;
+ }
+ return new ConsensusSubscriptionCommitState(new SubscriptionConsensusProgress(0L, 0L));
+ });
+ }
+
+ /**
+ * Records commitId to searchIndex mapping for later commit handling.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ * @param regionId the consensus group / data region ID string
+ * @param commitId the assigned commit ID
+ * @param searchIndex the WAL search index corresponding to this event
+ */
+ public void recordCommitMapping(
+ final String consumerGroupId,
+ final String topicName,
+ final String regionId,
+ final long commitId,
+ final long searchIndex) {
+ final ConsensusSubscriptionCommitState state =
+ getOrCreateState(consumerGroupId, topicName, regionId);
+ state.recordMapping(commitId, searchIndex);
+ }
+
+ /**
+ * Handles commit (ack) for an event. Updates the progress and potentially advances the committed
+ * search index.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ * @param regionId the consensus group / data region ID string
+ * @param commitId the committed event's commit ID
+ * @return true if commit handled successfully
+ */
+ public boolean commit(
+ final String consumerGroupId,
+ final String topicName,
+ final String regionId,
+ final long commitId) {
+ final String key = generateKey(consumerGroupId, topicName, regionId);
+ final ConsensusSubscriptionCommitState state = commitStates.get(key);
+ if (state == null) {
+ LOGGER.warn(
+ "ConsensusSubscriptionCommitManager: Cannot commit for unknown state, "
+ + "consumerGroupId={}, topicName={}, regionId={}, commitId={}",
+ consumerGroupId,
+ topicName,
+ regionId,
+ commitId);
+ return false;
+ }
+ final boolean success = state.commit(commitId);
+ if (success) {
+ // Periodically persist progress
+ persistProgressIfNeeded(key, state);
+ }
+ return success;
+ }
+
+ /**
+ * Gets the current committed search index for a specific region's state.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ * @param regionId the consensus group / data region ID string
+ * @return the committed search index, or -1 if no state exists
+ */
+ public long getCommittedSearchIndex(
+ final String consumerGroupId, final String topicName, final String regionId) {
+ final String key = generateKey(consumerGroupId, topicName, regionId);
+ final ConsensusSubscriptionCommitState state = commitStates.get(key);
+ if (state == null) {
+ return -1;
+ }
+ return state.getCommittedSearchIndex();
+ }
+
+ /**
+ * Removes state for a specific (consumerGroup, topic, region) triple.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ * @param regionId the consensus group / data region ID string
+ */
+ public void removeState(
+ final String consumerGroupId, final String topicName, final String regionId) {
+ final String key = generateKey(consumerGroupId, topicName, regionId);
+ commitStates.remove(key);
+ // Clean up persisted file
+ final File file = getProgressFile(key);
+ if (file.exists()) {
+ file.delete();
+ }
+ }
+
+ /**
+ * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during
+ * subscription teardown when the individual regionIds may not be readily available.
+ *
+ * @param consumerGroupId the consumer group ID
+ * @param topicName the topic name
+ */
+ public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) {
+ final String prefix = consumerGroupId + "_" + topicName + "_";
+ final Iterator> it =
+ commitStates.entrySet().iterator();
+ while (it.hasNext()) {
+ final Map.Entry entry = it.next();
+ if (entry.getKey().startsWith(prefix)) {
+ it.remove();
+ final File file = getProgressFile(entry.getKey());
+ if (file.exists()) {
+ file.delete();
+ }
+ }
+ }
+ }
+
+ /** Persists all states. Should be called during graceful shutdown. */
+ public void persistAll() {
+ for (final Map.Entry entry :
+ commitStates.entrySet()) {
+ persistProgress(entry.getKey(), entry.getValue());
+ }
+ }
+
+ // ======================== Helper Methods ========================
+
+ private String generateKey(
+ final String consumerGroupId, final String topicName, final String regionId) {
+ return consumerGroupId + "_" + topicName + "_" + regionId;
+ }
+
+ private File getProgressFile(final String key) {
+ return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX);
+ }
+
+ private ConsensusSubscriptionCommitState tryRecover(final String key) {
+ final File file = getProgressFile(key);
+ if (!file.exists()) {
+ return null;
+ }
+ try (final FileInputStream fis = new FileInputStream(file)) {
+ final byte[] bytes = new byte[(int) file.length()];
+ fis.read(bytes);
+ final ByteBuffer buffer = ByteBuffer.wrap(bytes);
+ return ConsensusSubscriptionCommitState.deserialize(buffer);
+ } catch (final IOException e) {
+ LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e);
+ return null;
+ }
+ }
+
+  /**
+   * Persists the state only on every 100th commit to bound disk IO. Progress between persists can
+   * be lost on crash; recovery then restarts from the last persisted searchIndex, which preserves
+   * at-least-once (but not exactly-once) delivery.
+   */
+  private void persistProgressIfNeeded(
+      final String key, final ConsensusSubscriptionCommitState state) {
+    // Persist every 100 commits to reduce disk IO
+    if (state.getProgress().getCommitIndex() % 100 == 0) {
+      persistProgress(key, state);
+    }
+  }
+
+  /** Serializes the state to its progress file; failures are logged, never thrown. */
+  private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) {
+    final File file = getProgressFile(key);
+    // Write to a temporary file first and rename into place, so that a crash mid-write
+    // never leaves a truncated/corrupted progress file behind.
+    final File tmpFile = new File(file.getParentFile(), file.getName() + ".tmp");
+    try (final FileOutputStream fos = new FileOutputStream(tmpFile);
+        final DataOutputStream dos = new DataOutputStream(fos)) {
+      state.serialize(dos);
+      dos.flush();
+    } catch (final IOException e) {
+      LOGGER.warn("Failed to persist consensus subscription progress to {}", tmpFile, e);
+      return;
+    }
+    // File.renameTo does not overwrite an existing target on all platforms.
+    if (file.exists() && !file.delete()) {
+      LOGGER.warn("Failed to delete stale consensus subscription progress file {}", file);
+    }
+    if (!tmpFile.renameTo(file)) {
+      LOGGER.warn("Failed to rename {} to {}", tmpFile, file);
+    }
+  }
+
+ // ======================== Inner State Class ========================
+
+  /**
+   * Tracks commit state for a single (consumerGroup, topic, region) triple. Maintains the mapping
+   * from commitId to searchIndex and tracks committed progress within one region's WAL.
+   */
+  public static class ConsensusSubscriptionCommitState {
+
+    /** Threshold for warning about outstanding (uncommitted) search indices accumulation. */
+    private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000;
+
+    private final SubscriptionConsensusProgress progress;
+
+    /**
+     * Maps commitId -> searchIndex. Records which WAL search index corresponds to each committed
+     * event. Entries are removed once committed.
+     */
+    private final Map<Long, Long> commitIdToSearchIndex = new ConcurrentHashMap<>();
+
+    /**
+     * Tracks the safe recovery position: the highest search index where all prior dispatched events
+     * have been committed. Only advances contiguously — never jumps over uncommitted gaps.
+     */
+    private volatile long committedSearchIndex;
+
+    /**
+     * Tracks the maximum search index among all committed events (may be ahead of
+     * committedSearchIndex when out-of-order commits exist). Used to update committedSearchIndex
+     * once all outstanding events are committed. Guarded by {@code synchronized (this)}.
+     */
+    private long maxCommittedSearchIndex;
+
+    /**
+     * Tracks search indices of dispatched but not-yet-committed events. Used to prevent
+     * committedSearchIndex from jumping over uncommitted gaps. On commit, the frontier advances to
+     * min(outstanding) - 1 (or maxCommittedSearchIndex if empty). Guarded by
+     * {@code synchronized (this)}.
+     *
+     * <p>Since state is per-region, searchIndex values within this set are guaranteed unique (they
+     * come from a single region's monotonically increasing WAL searchIndex).
+     */
+    private final TreeSet<Long> outstandingSearchIndices = new TreeSet<>();
+
+    public ConsensusSubscriptionCommitState(final SubscriptionConsensusProgress progress) {
+      this.progress = progress;
+      this.committedSearchIndex = progress.getSearchIndex();
+      this.maxCommittedSearchIndex = progress.getSearchIndex();
+    }
+
+    public SubscriptionConsensusProgress getProgress() {
+      return progress;
+    }
+
+    public long getCommittedSearchIndex() {
+      return committedSearchIndex;
+    }
+
+    /** Records the WAL position of a newly dispatched event so its later commit can be tracked. */
+    public void recordMapping(final long commitId, final long searchIndex) {
+      commitIdToSearchIndex.put(commitId, searchIndex);
+      synchronized (this) {
+        outstandingSearchIndices.add(searchIndex);
+        final int size = outstandingSearchIndices.size();
+        // Warn once per threshold multiple (size % threshold == 1) rather than on every call.
+        if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) {
+          LOGGER.warn(
+              "ConsensusSubscriptionCommitState: outstandingSearchIndices size ({}) exceeds "
+                  + "threshold ({}), consumers may not be committing. committedSearchIndex={}, "
+                  + "maxCommittedSearchIndex={}, commitIdToSearchIndex size={}",
+              size,
+              OUTSTANDING_SIZE_WARN_THRESHOLD,
+              committedSearchIndex,
+              maxCommittedSearchIndex,
+              commitIdToSearchIndex.size());
+        }
+      }
+    }
+
+    /**
+     * Commits the specified event and advances the committed search index contiguously.
+     *
+     * <p>The committed search index only advances to a position where all prior dispatched events
+     * have been committed. This prevents the recovery position from jumping over uncommitted gaps,
+     * ensuring at-least-once delivery even after crash recovery.
+     *
+     * @param commitId the commit ID to commit
+     * @return true if successfully committed
+     */
+    public boolean commit(final long commitId) {
+      final Long searchIndex = commitIdToSearchIndex.remove(commitId);
+      if (searchIndex == null) {
+        LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId);
+        return false;
+      }
+
+      // Advance committed search index contiguously (gap-aware)
+      synchronized (this) {
+        // Moved inside the lock: commitIndex++ in SubscriptionConsensusProgress is not
+        // atomic, so concurrent commits could previously lose increments.
+        progress.incrementCommitIndex();
+
+        outstandingSearchIndices.remove(searchIndex);
+        if (searchIndex > maxCommittedSearchIndex) {
+          maxCommittedSearchIndex = searchIndex;
+        }
+
+        if (outstandingSearchIndices.isEmpty()) {
+          // All dispatched events have been committed — advance to the max
+          committedSearchIndex = maxCommittedSearchIndex;
+        } else {
+          // Advance to just below the earliest uncommitted event (never go backward)
+          committedSearchIndex =
+              Math.max(committedSearchIndex, outstandingSearchIndices.first() - 1);
+        }
+        progress.setSearchIndex(committedSearchIndex);
+      }
+
+      return true;
+    }
+
+    /** Serializes the progress plus the committed search index for crash recovery. */
+    public void serialize(final DataOutputStream stream) throws IOException {
+      progress.serialize(stream);
+      stream.writeLong(committedSearchIndex);
+    }
+
+    /** Mirror of {@link #serialize}: restores progress and the committed search index. */
+    public static ConsensusSubscriptionCommitState deserialize(final ByteBuffer buffer) {
+      final SubscriptionConsensusProgress progress =
+          SubscriptionConsensusProgress.deserialize(buffer);
+      final ConsensusSubscriptionCommitState state = new ConsensusSubscriptionCommitState(progress);
+      state.committedSearchIndex = buffer.getLong();
+      state.maxCommittedSearchIndex = state.committedSearchIndex;
+      return state;
+    }
+  }
+
+ // ======================== Singleton ========================
+
+  /** Initialization-on-demand holder: INSTANCE is created lazily on first class load. */
+  private static class Holder {
+    private static final ConsensusSubscriptionCommitManager INSTANCE =
+        new ConsensusSubscriptionCommitManager();
+  }
+
+  /** Returns the process-wide singleton instance. */
+  public static ConsensusSubscriptionCommitManager getInstance() {
+    return Holder.INSTANCE;
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
new file mode 100644
index 0000000000000..b138dbceef1a2
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
@@ -0,0 +1,422 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.iotdb.commons.consensus.ConsensusGroupId;
+import org.apache.iotdb.commons.consensus.DataRegionId;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern;
+import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern;
+import org.apache.iotdb.consensus.IConsensus;
+import org.apache.iotdb.consensus.iot.IoTConsensus;
+import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl;
+import org.apache.iotdb.db.conf.IoTDBConfig;
+import org.apache.iotdb.db.conf.IoTDBDescriptor;
+import org.apache.iotdb.db.consensus.DataRegionConsensusImpl;
+import org.apache.iotdb.db.storageengine.StorageEngine;
+import org.apache.iotdb.db.storageengine.dataregion.DataRegion;
+import org.apache.iotdb.db.subscription.agent.SubscriptionAgent;
+import org.apache.iotdb.rpc.subscription.config.TopicConfig;
+import org.apache.iotdb.rpc.subscription.config.TopicConstant;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Handles the setup and teardown of consensus-based subscription queues on DataNode. When a
+ * real-time subscription is detected, this handler finds the local IoTConsensus data regions,
+ * creates the appropriate converter, and binds prefetching queues to the subscription broker.
+ */
+public class ConsensusSubscriptionSetupHandler {
+
+  private static final Logger LOGGER =
+      LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class);
+
+  // NOTE(review): IOTDB_CONFIG is not referenced anywhere in this class — confirm whether it
+  // (and the IoTDBConfig/IoTDBDescriptor imports) can be removed.
+  private static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig();
+
+  private ConsensusSubscriptionSetupHandler() {
+    // utility class — static methods only, no instances
+  }
+
+  /**
+   * Ensures that the IoTConsensus new-peer callback is set, so that when a new DataRegion is
+   * created, all active consensus subscriptions are automatically bound to the new region.
+   *
+   * <p>Synchronized so that concurrent callers cannot both observe a null callback and race to
+   * install it (the original check-then-set was not atomic).
+   */
+  public static synchronized void ensureNewRegionListenerRegistered() {
+    if (IoTConsensus.onNewPeerCreated != null) {
+      return;
+    }
+    IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated;
+    LOGGER.info(
+        "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding");
+  }
+
+  /**
+   * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries
+   * existing subscription metadata to find all active consensus subscriptions and binds prefetching
+   * queues to the new region.
+   */
+  private static void onNewRegionCreated(
+      final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) {
+    if (!(groupId instanceof DataRegionId)) {
+      return;
+    }
+
+    // Query existing metadata keepers for all active subscriptions
+    final Map<String, Set<String>> allSubscriptions =
+        SubscriptionAgent.consumer().getAllSubscriptions();
+    if (allSubscriptions.isEmpty()) {
+      return;
+    }
+
+    final ConsensusSubscriptionCommitManager commitManager =
+        ConsensusSubscriptionCommitManager.getInstance();
+    // Start just past the region's current WAL search index
+    final long startSearchIndex = serverImpl.getSearchIndex() + 1;
+
+    LOGGER.info(
+        "New DataRegion {} created, checking {} consumer group(s) for auto-binding, "
+            + "startSearchIndex={}",
+        groupId,
+        allSubscriptions.size(),
+        startSearchIndex);
+
+    for (final Map.Entry<String, Set<String>> groupEntry : allSubscriptions.entrySet()) {
+      final String consumerGroupId = groupEntry.getKey();
+      for (final String topicName : groupEntry.getValue()) {
+        if (!isConsensusBasedTopic(topicName)) {
+          continue;
+        }
+        try {
+          final Map<String, TopicConfig> topicConfigs =
+              SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName));
+          final TopicConfig topicConfig = topicConfigs.get(topicName);
+          if (topicConfig == null) {
+            continue;
+          }
+
+          // Resolve the new DataRegion's actual database name
+          final DataRegion dataRegion =
+              StorageEngine.getInstance().getDataRegion((DataRegionId) groupId);
+          if (dataRegion == null) {
+            continue;
+          }
+          final String dbRaw = dataRegion.getDatabaseName();
+          // Strip the tree-model "root." prefix to get the table-model database name
+          final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw;
+
+          // For table topics, skip if this region's database doesn't match the topic filter
+          if (topicConfig.isTableTopic()) {
+            final String topicDb =
+                topicConfig.getStringOrDefault(
+                    TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE);
+            if (topicDb != null
+                && !topicDb.isEmpty()
+                && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDb)
+                && !topicDb.equalsIgnoreCase(dbTableModel)) {
+              continue;
+            }
+          }
+
+          final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
+          final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
+
+          LOGGER.info(
+              "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} (database={})",
+              topicName,
+              consumerGroupId,
+              groupId,
+              dbTableModel);
+
+          SubscriptionAgent.broker()
+              .bindConsensusPrefetchingQueue(
+                  consumerGroupId,
+                  topicName,
+                  groupId.toString(),
+                  serverImpl,
+                  converter,
+                  commitManager,
+                  startSearchIndex);
+        } catch (final Exception e) {
+          LOGGER.error(
+              "Failed to auto-bind topic [{}] in group [{}] to new region {}",
+              topicName,
+              consumerGroupId,
+              groupId,
+              e);
+        }
+      }
+    }
+  }
+
+  /**
+   * A topic is consensus-based when its mode is live and its format is not the tsfile-handler
+   * format. Defaults to {@code false} when topic metadata lookup fails.
+   */
+  public static boolean isConsensusBasedTopic(final String topicName) {
+    try {
+      final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName);
+      final String topicFormat = SubscriptionAgent.topic().getTopicFormat(topicName);
+      final boolean result =
+          TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode)
+              && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat);
+      // Downgraded from info to debug: this check is invoked per topic in setup loops and
+      // would otherwise generate log noise on every pass.
+      LOGGER.debug(
+          "isConsensusBasedTopic check for topic [{}]: mode={}, format={}, result={}",
+          topicName,
+          topicMode,
+          topicFormat,
+          result);
+      return result;
+    } catch (final Exception e) {
+      LOGGER.warn(
+          "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e);
+      return false;
+    }
+  }
+
+  /**
+   * Sets up consensus-based subscription queues for every consensus-based topic in the given set.
+   * No-ops (with a warning) when the data-region consensus implementation is not IoTConsensus.
+   *
+   * @param consumerGroupId the consumer group to bind queues for
+   * @param topicNames candidate topic names; non-consensus topics are skipped
+   */
+  public static void setupConsensusSubscriptions(
+      final String consumerGroupId, final Set<String> topicNames) {
+    final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance();
+    if (!(dataRegionConsensus instanceof IoTConsensus)) {
+      LOGGER.warn(
+          "Data region consensus is not IoTConsensus (actual: {}), "
+              + "cannot set up consensus-based subscription for consumer group [{}]",
+          dataRegionConsensus.getClass().getSimpleName(),
+          consumerGroupId);
+      return;
+    }
+
+    // Ensure the new-region listener is registered (idempotent)
+    ensureNewRegionListenerRegistered();
+
+    final IoTConsensus ioTConsensus = (IoTConsensus) dataRegionConsensus;
+    final ConsensusSubscriptionCommitManager commitManager =
+        ConsensusSubscriptionCommitManager.getInstance();
+
+    LOGGER.info(
+        "Setting up consensus subscriptions for consumer group [{}], topics={}, "
+            + "total consensus groups={}",
+        consumerGroupId,
+        topicNames,
+        ioTConsensus.getAllConsensusGroupIds().size());
+
+    for (final String topicName : topicNames) {
+      if (!isConsensusBasedTopic(topicName)) {
+        continue;
+      }
+
+      try {
+        setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager);
+      } catch (final Exception e) {
+        // One failing topic must not prevent the remaining topics from being set up.
+        LOGGER.error(
+            "Failed to set up consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId,
+            e);
+      }
+    }
+  }
+
+  /**
+   * Set up consensus queue for a single topic. Discovers all local data region consensus groups and
+   * binds a ConsensusReqReader-based prefetching queue to every matching region.
+   *
+   * <p>For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY}
+   * filter are bound. For tree-model topics, all local data regions are bound. Additionally, the
+   * {@link #onNewRegionCreated} callback ensures that regions created after this method runs are
+   * also automatically bound.
+   */
+  private static void setupConsensusQueueForTopic(
+      final String consumerGroupId,
+      final String topicName,
+      final IoTConsensus ioTConsensus,
+      final ConsensusSubscriptionCommitManager commitManager) {
+
+    // Get topic config for building the converter
+    final Map<String, TopicConfig> topicConfigs =
+        SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName));
+    final TopicConfig topicConfig = topicConfigs.get(topicName);
+    if (topicConfig == null) {
+      LOGGER.warn(
+          "Topic config not found for topic [{}], cannot set up consensus queue", topicName);
+      return;
+    }
+
+    // Build the converter based on topic config (path pattern, time range, tree/table model)
+    LOGGER.info(
+        "Setting up consensus queue for topic [{}]: isTableTopic={}, config={}",
+        topicName,
+        topicConfig.isTableTopic(),
+        topicConfig.getAttribute());
+
+    // For table topics, extract the database filter from topic config
+    final String topicDatabaseFilter =
+        topicConfig.isTableTopic()
+            ? topicConfig.getStringOrDefault(
+                TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE)
+            : null;
+
+    final List<ConsensusGroupId> allGroupIds = ioTConsensus.getAllConsensusGroupIds();
+    LOGGER.info(
+        "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}",
+        allGroupIds.size(),
+        topicName,
+        consumerGroupId,
+        allGroupIds);
+    boolean bound = false;
+
+    for (final ConsensusGroupId groupId : allGroupIds) {
+      if (!(groupId instanceof DataRegionId)) {
+        continue;
+      }
+
+      final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId);
+      if (serverImpl == null) {
+        continue;
+      }
+
+      // Resolve the DataRegion's actual database name
+      final DataRegion dataRegion =
+          StorageEngine.getInstance().getDataRegion((DataRegionId) groupId);
+      if (dataRegion == null) {
+        continue;
+      }
+      final String dbRaw = dataRegion.getDatabaseName();
+      // Strip the tree-model "root." prefix to get the table-model database name
+      final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw;
+
+      if (topicDatabaseFilter != null
+          && !topicDatabaseFilter.isEmpty()
+          && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDatabaseFilter)
+          && !topicDatabaseFilter.equalsIgnoreCase(dbTableModel)) {
+        LOGGER.info(
+            "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})",
+            groupId,
+            dbTableModel,
+            topicName,
+            topicDatabaseFilter);
+        continue;
+      }
+
+      final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
+      final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
+
+      // Start just past the region's current WAL search index
+      final long startSearchIndex = serverImpl.getSearchIndex() + 1;
+
+      LOGGER.info(
+          "Binding consensus prefetching queue for topic [{}] in consumer group [{}] "
+              + "to data region consensus group [{}] (database={}), startSearchIndex={}",
+          topicName,
+          consumerGroupId,
+          groupId,
+          dbTableModel,
+          startSearchIndex);
+
+      SubscriptionAgent.broker()
+          .bindConsensusPrefetchingQueue(
+              consumerGroupId,
+              topicName,
+              groupId.toString(),
+              serverImpl,
+              converter,
+              commitManager,
+              startSearchIndex);
+
+      bound = true;
+    }
+
+    if (!bound) {
+      LOGGER.warn(
+          "No local IoTConsensus data region found for topic [{}] in consumer group [{}]. "
+              + "Consensus subscription will be set up when a matching data region becomes available.",
+          topicName,
+          consumerGroupId);
+    }
+  }
+
+  /**
+   * Builds the log-to-tablet converter for a topic. Table-model topics filter by database and
+   * table name; tree-model topics filter by an explicit PATTERN or, failing that, by PATH.
+   */
+  private static ConsensusLogToTabletConverter buildConverter(
+      final TopicConfig topicConfig, final String actualDatabaseName) {
+    if (topicConfig.isTableTopic()) {
+      // Table model: database + table name pattern
+      final String databaseFilter =
+          topicConfig.getStringOrDefault(
+              TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE);
+      final String tableFilter =
+          topicConfig.getStringOrDefault(
+              TopicConstant.TABLE_KEY, TopicConstant.TABLE_DEFAULT_VALUE);
+      return new ConsensusLogToTabletConverter(
+          null, new TablePattern(true, databaseFilter, tableFilter), actualDatabaseName);
+    }
+
+    // Tree model: an explicit PATTERN takes precedence over PATH
+    final TreePattern treePattern;
+    if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) {
+      treePattern =
+          new PrefixTreePattern(topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY));
+    } else {
+      treePattern =
+          new IoTDBTreePattern(
+              topicConfig.getStringOrDefault(
+                  TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE));
+    }
+    return new ConsensusLogToTabletConverter(treePattern, null, actualDatabaseName);
+  }
+
+  /**
+   * Tears down consensus subscription queues and persisted commit state for every topic in the
+   * given set. A failure on one topic is logged and does not abort teardown of the rest.
+   */
+  public static void teardownConsensusSubscriptions(
+      final String consumerGroupId, final Set<String> topicNames) {
+    for (final String topicName : topicNames) {
+      try {
+        SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName);
+
+        // Clean up commit state for all regions of this topic
+        ConsensusSubscriptionCommitManager.getInstance()
+            .removeAllStatesForTopic(consumerGroupId, topicName);
+
+        LOGGER.info(
+            "Tore down consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId);
+      } catch (final Exception e) {
+        LOGGER.warn(
+            "Failed to tear down consensus subscription for topic [{}] in consumer group [{}]",
+            topicName,
+            consumerGroupId,
+            e);
+      }
+    }
+  }
+
+  /**
+   * Entry point invoked when a consumer group gains new topic subscriptions; delegates to
+   * {@link #setupConsensusSubscriptions}. Null/empty input is a no-op.
+   */
+  public static void handleNewSubscriptions(
+      final String consumerGroupId, final Set<String> newTopicNames) {
+    if (newTopicNames == null || newTopicNames.isEmpty()) {
+      return;
+    }
+
+    LOGGER.info(
+        "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}",
+        consumerGroupId,
+        newTopicNames);
+
+    setupConsensusSubscriptions(consumerGroupId, newTopicNames);
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
new file mode 100644
index 0000000000000..0bd526e8dbaa0
--- /dev/null
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.db.subscription.broker.consensus;
+
+import org.apache.tsfile.utils.ReadWriteIOUtils;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Objects;
+
+/**
+ * Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region)
+ * combination.
+ *
+ * <p>Since searchIndex is region-local (each DataRegion has its own independent WAL and
+ * searchIndex namespace), progress is tracked per-region:
+ *
+ * <ul>
+ *   <li><b>searchIndex</b>: the committed WAL search index — the highest position where all prior
+ *       dispatched events have been acknowledged. Used as the recovery start point after crash.
+ *   <li><b>commitIndex</b>: monotonically increasing count of committed events. Used for
+ *       persistence throttling and diagnostics.
+ * </ul>
+ *
+ * <p>NOTE(review): fields are plain non-volatile longs mutated through setters; mutation appears
+ * to be guarded externally (ConsensusSubscriptionCommitState synchronizes around its updates) —
+ * confirm there are no unsynchronized readers.
+ */
+public class SubscriptionConsensusProgress {
+
+  // Committed WAL search index (recovery start point)
+  private long searchIndex;
+
+  // Monotonically increasing count of committed events
+  private long commitIndex;
+
+  public SubscriptionConsensusProgress() {
+    this(0L, 0L);
+  }
+
+  public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) {
+    this.searchIndex = searchIndex;
+    this.commitIndex = commitIndex;
+  }
+
+  public long getSearchIndex() {
+    return searchIndex;
+  }
+
+  public void setSearchIndex(final long searchIndex) {
+    this.searchIndex = searchIndex;
+  }
+
+  public long getCommitIndex() {
+    return commitIndex;
+  }
+
+  public void setCommitIndex(final long commitIndex) {
+    this.commitIndex = commitIndex;
+  }
+
+  /** Not atomic — callers must synchronize externally when commits can race. */
+  public void incrementCommitIndex() {
+    this.commitIndex++;
+  }
+
+  /** Writes searchIndex then commitIndex; must stay in sync with {@link #deserialize}. */
+  public void serialize(final DataOutputStream stream) throws IOException {
+    ReadWriteIOUtils.write(searchIndex, stream);
+    ReadWriteIOUtils.write(commitIndex, stream);
+  }
+
+  public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) {
+    final long searchIndex = ReadWriteIOUtils.readLong(buffer);
+    final long commitIndex = ReadWriteIOUtils.readLong(buffer);
+    return new SubscriptionConsensusProgress(searchIndex, commitIndex);
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o;
+    return searchIndex == that.searchIndex && commitIndex == that.commitIndex;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(searchIndex, commitIndex);
+  }
+
+  @Override
+  public String toString() {
+    return "SubscriptionConsensusProgress{"
+        + "searchIndex="
+        + searchIndex
+        + ", commitIndex="
+        + commitIndex
+        + '}';
+  }
+}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
index dfadee5908fa5..9ede61fbffe74 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java
@@ -248,6 +248,11 @@ public void nack() {
}
}
+ /** Returns the current nack count for this event. */
+ public long getNackCount() {
+ return nackCount.get();
+ }
+
public void recordLastPolledConsumerId(final String consumerId) {
lastPolledConsumerId = consumerId;
}
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
index c7e7fea8d12f8..9e9c898e3c064 100644
--- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java
@@ -30,7 +30,7 @@ public class SubscriptionConfig {
private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig();
public boolean getSubscriptionEnabled() {
- return false;
+ return true; // TODO: make it configurable after subscription is stable
}
public float getSubscriptionCacheMemoryUsagePercentage() {
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
index 4393ef8a6cf61..9f66b48210bc2 100644
--- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java
@@ -115,6 +115,26 @@ private boolean shouldRecordSubscriptionCreationTime() {
return unsubscribedTopicNames;
}
+  /**
+   * Computes the topics present in {@code updatedMeta} but absent from {@code currentMeta} for the
+   * same consumer group. Returns an empty set when the two metas describe different groups (the
+   * group id or creation time differs).
+   */
+  public static Set<String> getTopicsNewlySubByGroup(
+      final ConsumerGroupMeta currentMeta, final ConsumerGroupMeta updatedMeta) {
+    if (!Objects.equals(currentMeta.consumerGroupId, updatedMeta.consumerGroupId)
+        || !Objects.equals(currentMeta.creationTime, updatedMeta.creationTime)) {
+      return Collections.emptySet();
+    }
+
+    final Set<String> newlySubscribedTopicNames = new HashSet<>();
+    updatedMeta
+        .topicNameToSubscribedConsumerIdSet
+        .keySet()
+        .forEach(
+            topicName -> {
+              if (!currentMeta.topicNameToSubscribedConsumerIdSet.containsKey(topicName)) {
+                newlySubscribedTopicNames.add(topicName);
+              }
+            });
+    return newlySubscribedTopicNames;
+  }
+
/////////////////////////////// consumer ///////////////////////////////
public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta)
@@ -171,6 +191,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) {
////////////////////////// subscription //////////////////////////
+  /** Get all topic names subscribed by this consumer group, as an unmodifiable view. */
+  public Set<String> getSubscribedTopicNames() {
+    return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet());
+  }
+
/**
* Get the consumers subscribing the given topic in this group.
*
From 36e3491dbce10884c570bef2fa7bc902aff938a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=A9=AC=E5=AD=90=E5=9D=A4?=
<55695098+DanielWang2035@users.noreply.github.com>
Date: Tue, 3 Mar 2026 18:59:10 +0800
Subject: [PATCH 2/2] fix some issues
---
.../iotdb/ConsensusSubscriptionTableTest.java | 985 +++++++--------
.../iotdb/ConsensusSubscriptionTest.java | 1062 +++++++----------
.../iotdb/consensus/iot/IoTConsensus.java | 19 +
.../consensus/iot/IoTConsensusServerImpl.java | 2 +-
.../iot/logdispatcher/LogDispatcher.java | 12 +-
.../agent/SubscriptionBrokerAgent.java | 18 +-
.../broker/ConsensusSubscriptionBroker.java | 29 +-
.../ConsensusLogToTabletConverter.java | 135 ++-
.../consensus/ConsensusPrefetchingQueue.java | 122 +-
.../ConsensusSubscriptionCommitManager.java | 29 +-
.../ConsensusSubscriptionSetupHandler.java | 70 +-
.../SubscriptionConsensusProgress.java | 32 +-
12 files changed, 1221 insertions(+), 1294 deletions(-)
diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java
index 6c1da0199f663..ade06c96e6f8d 100644
--- a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java
+++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java
@@ -44,6 +44,10 @@
import java.util.Map;
import java.util.Properties;
import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
/** TODO: Move these manual tests into ITs */
public class ConsensusSubscriptionTableTest {
@@ -63,50 +67,32 @@ public static void main(String[] args) throws Exception {
String targetTest = args.length > 0 ? args[0] : null;
- if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) {
- runTest("testBasicDataDelivery", ConsensusSubscriptionTableTest::testBasicDataDelivery);
+ if (targetTest == null || "testBasicFlow".equals(targetTest)) {
+ runTest("testBasicFlow", ConsensusSubscriptionTableTest::testBasicFlow);
}
- if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) {
- runTest("testMultipleDataTypes", ConsensusSubscriptionTableTest::testMultipleDataTypes);
+ if (targetTest == null || "testDataTypes".equals(targetTest)) {
+ runTest("testDataTypes", ConsensusSubscriptionTableTest::testDataTypes);
}
- if (targetTest == null || "testTableLevelFiltering".equals(targetTest)) {
- runTest("testTableLevelFiltering", ConsensusSubscriptionTableTest::testTableLevelFiltering);
- }
- if (targetTest == null || "testDatabaseLevelFiltering".equals(targetTest)) {
- runTest(
- "testDatabaseLevelFiltering", ConsensusSubscriptionTableTest::testDatabaseLevelFiltering);
+ if (targetTest == null || "testPathFiltering".equals(targetTest)) {
+ runTest("testPathFiltering", ConsensusSubscriptionTableTest::testPathFiltering);
}
if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
runTest(
"testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion);
}
- if (targetTest == null || "testMultipleTablesAggregation".equals(targetTest)) {
- runTest(
- "testMultipleTablesAggregation",
- ConsensusSubscriptionTableTest::testMultipleTablesAggregation);
- }
- if (targetTest == null || "testMultiColumnTypes".equals(targetTest)) {
- runTest("testMultiColumnTypes", ConsensusSubscriptionTableTest::testMultiColumnTypes);
+ if (targetTest == null || "testRedelivery".equals(targetTest)) {
+ runTest("testRedelivery", ConsensusSubscriptionTableTest::testRedelivery);
}
- if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) {
- runTest("testPollWithoutCommit", ConsensusSubscriptionTableTest::testPollWithoutCommit);
+ if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) {
+ runTest("testMultiEntityIsolation", ConsensusSubscriptionTableTest::testMultiEntityIsolation);
}
- if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) {
+ if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) {
runTest(
- "testMultiConsumerGroupIndependent",
- ConsensusSubscriptionTableTest::testMultiConsumerGroupIndependent);
+ "testBurstWriteGapRecovery", ConsensusSubscriptionTableTest::testBurstWriteGapRecovery);
}
- if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) {
+ if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) {
runTest(
- "testMultiTopicSubscription", ConsensusSubscriptionTableTest::testMultiTopicSubscription);
- }
- if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) {
- runTest("testFlushDataDelivery", ConsensusSubscriptionTableTest::testFlushDataDelivery);
- }
- if (targetTest == null || "testCrossPartitionMultiWrite".equals(targetTest)) {
- runTest(
- "testCrossPartitionMultiWrite",
- ConsensusSubscriptionTableTest::testCrossPartitionMultiWrite);
+ "testCommitAfterUnsubscribe", ConsensusSubscriptionTableTest::testCommitAfterUnsubscribe);
}
// Summary
@@ -459,14 +445,20 @@ private static void assertAtLeast(String msg, int min, int actual) {
}
}
- // ============================
- // Test 1: Basic Data Delivery
- // ============================
+ // ======================================================================
+ // Test 1: Basic Flow (merged: BasicDataDelivery + MultiTables + Flush)
+ // ======================================================================
/**
- * Verifies the basic consensus subscription flow with table model: write before subscribe (not
- * received), write after subscribe (received), and no extra data beyond expectation.
+ * Verifies:
+ *
+ * <ul>
+ * <li>Data written BEFORE subscribe is NOT received
+ * <li>Multiple tables (t1, t2, t3) written AFTER subscribe are all received
+ * <li>Flush does not cause data loss (WAL pinning keeps entries available)
+ * <li>Exact row count matches expectation
+ * </ul>
*/
- private static void testBasicDataDelivery() throws Exception {
+ private static void testBasicFlow() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -474,18 +466,19 @@ private static void testBasicDataDelivery() throws Exception {
ISubscriptionTablePullConsumer consumer = null;
try {
- // Step 1: Write initial data to create DataRegion
+ // Step 1: Write initial data to create DataRegion (should NOT be received)
System.out.println(" Step 1: Writing initial data (should NOT be received)");
try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(
- session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD, s2 DOUBLE FIELD");
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
+ session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)");
for (int i = 0; i < 50; i++) {
session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)",
- i * 10, i * 1.5, i));
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
}
+ session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -499,44 +492,60 @@ private static void testBasicDataDelivery() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- // Step 3: Write new data AFTER subscription
- System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)");
+ // Step 3: Write to 3 tables (30 rows each = 90 total), then flush
+ System.out.println(" Step 3: Writing 30 rows x 3 tables AFTER subscribe, then flush");
try (ITableSession session = openTableSession()) {
session.executeNonQueryStatement("USE " + database);
- for (int i = 100; i < 200; i++) {
+ for (int i = 100; i < 130; i++) {
session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)",
- i * 10, i * 1.5, i));
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
}
+ System.out.println(" Flushing...");
+ session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Step 4: Poll and verify exact count
+ // Step 4: Poll and verify
System.out.println(" Step 4: Polling...");
- PollResult result = pollUntilComplete(consumer, 100, 100);
+ PollResult result = pollUntilComplete(consumer, 90, 100);
System.out.println(" Result: " + result);
- assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows);
+ assertEquals("Expected exactly 90 rows (30 per table)", 90, result.totalRows);
+ if (!result.rowsPerTable.isEmpty()) {
+ System.out.println(" Rows per table: " + result.rowsPerTable);
+ for (String tbl : new String[] {"t1", "t2", "t3"}) {
+ Integer tblRows = result.rowsPerTable.get(tbl);
+ assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0);
+ }
+ }
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 2: Multiple Data Types
- // ============================
+ // ======================================================================
+ // Test 2: Data Types (merged: MultipleDataTypes + MultiColumnTypes + CrossPartition)
+ // ======================================================================
/**
- * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using
- * separate INSERT statements per type (one field per INSERT), and verifies all types are
- * delivered.
+ * Verifies:
+ *
+ * <ul>
+ * <li>Non-aligned: 6 data types via separate INSERTs
+ * <li>All-column: 6 fields in a single INSERT
+ * <li>Cross-partition: timestamps >1 week apart via SQL, Tablet methods
+ * </ul>
*/
- private static void testMultipleDataTypes() throws Exception {
+ private static void testDataTypes() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
String consumerId = nextConsumerId();
ISubscriptionTablePullConsumer consumer = null;
+ final long GAP = 604_800_001L; // slightly over 1 week
try {
try (ITableSession session = openTableSession()) {
@@ -548,9 +557,10 @@ private static void testMultipleDataTypes() throws Exception {
+ "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
+ "s_text TEXT FIELD");
session.executeNonQueryStatement("USE " + database);
- // Write initial row to create DataRegion
+ // Init row to force DataRegion creation
session.executeNonQueryStatement(
- "INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', 0, 0)");
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -562,9 +572,12 @@ private static void testMultipleDataTypes() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing data with 6 data types x 20 rows each");
+ int totalExpected = 0;
try (ITableSession session = openTableSession()) {
session.executeNonQueryStatement("USE " + database);
+
+ // --- Part A: 6 data types x 20 rows, separate INSERTs ---
+ System.out.println(" Part A: 6 data types x 20 rows (separate INSERTs)");
for (int i = 1; i <= 20; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i));
@@ -586,94 +599,115 @@ private static void testMultipleDataTypes() throws Exception {
String.format(
"INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i));
}
- }
- Thread.sleep(2000);
+ totalExpected += 120; // 6 types x 20 rows
- System.out.println(" Polling...");
- PollResult result = pollUntilComplete(consumer, 120, 120);
- System.out.println(" Result: " + result);
+ // --- Part B: All-column rows (50 rows) ---
+ System.out.println(" Part B: 50 all-column rows");
+ for (int i = 21; i <= 70; i++) {
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)"
+ + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)",
+ i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i));
+ }
+ totalExpected += 50;
- assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows);
- System.out.println(" Seen columns: " + result.seenColumns);
- assertTrue(
- "Expected multiple column types in result, got: " + result.seenColumns,
- result.seenColumns.size() > 1);
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
+ // --- Part C: Cross-partition writes ---
+ System.out.println(" Part C: Cross-partition (SQL single, multi, Tablet)");
+ long baseTs = 1_000_000_000L;
- // ============================
- // Test 3: Table-Level Filtering
- // ============================
- /**
- * Creates a topic that only matches table "t1" via TABLE_KEY. Verifies that data written to t2 is
- * NOT delivered.
- */
- private static void testTableLevelFiltering() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- ISubscriptionTablePullConsumer consumer = null;
+ // SQL single-row x2
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'xp_single_1', %d)",
+ baseTs));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'xp_single_2', %d)",
+ baseTs + GAP));
+ totalExpected += 2;
- try {
- try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
- session.executeNonQueryStatement("USE " + database);
- session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
- session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
+ // SQL multi-row x3
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
+ + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'xp_multi_1', %d), "
+ + "('d1', 4, 400, 4.4, 4.44, false, 'xp_multi_2', %d), "
+ + "('d1', 5, 500, 5.5, 5.55, true, 'xp_multi_3', %d)",
+ baseTs + GAP * 2, baseTs + GAP * 3, baseTs + GAP * 4));
+ totalExpected += 3;
- // Topic matches only table t1
- createTopicTable(topicName, database, "t1");
- Thread.sleep(1000);
+ // Tablet x4
+ List<IMeasurementSchema> schemaList = new ArrayList<>();
+ schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING));
+ schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING));
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
+ List<ColumnCategory> categories =
+ java.util.Arrays.asList(
+ ColumnCategory.TAG,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD,
+ ColumnCategory.FIELD);
- System.out.println(" Writing to both t1 and t2 (topic filter: t1 only)");
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- for (int i = 100; i < 150; i++) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
- session.executeNonQueryStatement(
- String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
+ Tablet tablet =
+ new Tablet(
+ "t1",
+ IMeasurementSchema.getMeasurementNameList(schemaList),
+ IMeasurementSchema.getDataTypeList(schemaList),
+ categories,
+ 10);
+ for (int i = 0; i < 4; i++) {
+ int row = tablet.getRowSize();
+ long ts = baseTs + GAP * (5 + i);
+ tablet.addTimestamp(row, ts);
+ tablet.addValue("tag1", row, "d1");
+ tablet.addValue("s_int32", row, 6 + i);
+ tablet.addValue("s_int64", row, (long) (600 + i * 100));
+ tablet.addValue("s_float", row, (6 + i) * 1.1f);
+ tablet.addValue("s_double", row, (6 + i) * 2.22);
+ tablet.addValue("s_bool", row, i % 2 == 0);
+ tablet.addValue("s_text", row, "xp_tablet_" + (i + 1));
}
+ session.insert(tablet);
+ totalExpected += 4;
}
+
+ System.out.println(" Total expected rows: " + totalExpected);
Thread.sleep(2000);
- System.out.println(" Polling (expecting only t1 data)...");
- PollResult result = pollUntilComplete(consumer, 50, 60);
+ PollResult result = pollUntilComplete(consumer, totalExpected, 200);
System.out.println(" Result: " + result);
- assertEquals("Expected exactly 50 rows from t1 only", 50, result.totalRows);
- if (!result.rowsPerTable.isEmpty()) {
- Integer t2Rows = result.rowsPerTable.get("t2");
- assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0);
- Integer t1Rows = result.rowsPerTable.get("t1");
- assertAtLeast("Expected t1 rows", 1, t1Rows != null ? t1Rows : 0);
- System.out.println(
- " Table filtering verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows");
- }
+ assertAtLeast(
+ "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows);
+ assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size());
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 4: Database-Level Filtering
- // ============================
+ // ======================================================================
+ // Test 3: Path Filtering (merged: TableLevel + DatabaseLevel)
+ // ======================================================================
/**
- * Creates a topic that only matches database db1 via DATABASE_KEY. Verifies that data written to
- * db2 is NOT delivered.
+ * Verifies:
+ *
+ * <ul>
+ * <li>Table-level: topic on table=t1 does NOT deliver t2 data
+ * <li>Database-level: topic on db1 does NOT deliver db2 data
+ * </ul>
*/
- private static void testDatabaseLevelFiltering() throws Exception {
+ private static void testPathFiltering() throws Exception {
String database1 = nextDatabase();
String database2 = database1 + "_other";
String topicName = nextTopic();
@@ -683,77 +717,68 @@ private static void testDatabaseLevelFiltering() throws Exception {
try {
try (ITableSession session = openTableSession()) {
+ // db1 with t1 and t2
createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
- createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database1);
+ session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ // db2 with t1
+ createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database2);
session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Topic matches only database1
- createTopicTable(topicName, database1, ".*");
+ // Topic: only db1, only table t1
+ createTopicTable(topicName, database1, "t1");
Thread.sleep(1000);
consumer = createConsumer(consumerId, consumerGroupId);
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(
- " Writing to both "
- + database1
- + " and "
- + database2
- + " (topic filter: "
- + database1
- + " only)");
+ System.out.println(" Writing to db1.t1, db1.t2, db2.t1 (topic filter: db1.t1 only)");
try (ITableSession session = openTableSession()) {
session.executeNonQueryStatement("USE " + database1);
for (int i = 100; i < 150; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
}
session.executeNonQueryStatement("USE " + database2);
for (int i = 100; i < 150; i++) {
session.executeNonQueryStatement(
- String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
}
}
Thread.sleep(2000);
- System.out.println(" Polling (expecting only " + database1 + " data)...");
+ System.out.println(" Polling (expecting only db1.t1 data = 50 rows)...");
PollResult result = pollUntilComplete(consumer, 50, 60);
System.out.println(" Result: " + result);
- assertEquals("Expected exactly 50 rows from " + database1 + " only", 50, result.totalRows);
+ assertEquals("Expected exactly 50 rows from db1.t1 only", 50, result.totalRows);
+ if (!result.rowsPerTable.isEmpty()) {
+ Integer t2Rows = result.rowsPerTable.get("t2");
+ assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0);
+ System.out.println(" Table filtering verified: t1 only");
+ }
if (!result.rowsPerDatabase.isEmpty()) {
Integer db2Rows = result.rowsPerDatabase.get(database2);
- assertTrue(
- "Expected NO rows from " + database2 + ", but got " + db2Rows,
- db2Rows == null || db2Rows == 0);
- Integer db1Rows = result.rowsPerDatabase.get(database1);
- assertAtLeast("Expected " + database1 + " rows", 1, db1Rows != null ? db1Rows : 0);
- System.out.println(
- " Database filtering verified: "
- + database1
- + "="
- + db1Rows
- + " rows, "
- + database2
- + "="
- + db2Rows
- + " rows");
+ assertTrue("Expected NO rows from " + database2, db2Rows == null || db2Rows == 0);
+ System.out.println(" Database filtering verified: " + database1 + " only");
}
} finally {
cleanup(consumer, topicName, database1, database2);
}
}
- // ============================
- // Test 5: Subscribe Before Region Creation
- // ============================
+ // ======================================================================
+ // Test 4: Subscribe Before Region Creation (kept as-is)
+ // ======================================================================
/**
* Subscribe BEFORE the database/region exists, then create database and write. Tests the
* IoTConsensus.onNewPeerCreated auto-binding path with table model.
@@ -786,7 +811,7 @@ private static void testSubscribeBeforeRegion() throws Exception {
}
Thread.sleep(5000);
- System.out.println(" Step 4: Polling (auto-binding should have picked up new region)...");
+ System.out.println(" Step 4: Polling...");
PollResult result = pollUntilComplete(consumer, 100, 100);
System.out.println(" Result: " + result);
@@ -805,11 +830,11 @@ private static void testSubscribeBeforeRegion() throws Exception {
}
}
- // ============================
- // Test 6: Multiple Tables Aggregation
- // ============================
- /** Writes to t1, t2, t3 and verifies all are received via a broad topic TABLE_KEY. */
- private static void testMultipleTablesAggregation() throws Exception {
+ // ======================================================================
+ // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit)
+ // ======================================================================
+ /** Tests at-least-once delivery with a mixed commit/no-commit pattern. */
+ private static void testRedelivery() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -820,11 +845,7 @@ private static void testMultipleTablesAggregation() throws Exception {
try (ITableSession session = openTableSession()) {
createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database);
- session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
- session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)");
session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -836,148 +857,6 @@ private static void testMultipleTablesAggregation() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing to 3 tables (t1, t2, t3), 30 rows each");
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- for (int i = 100; i < 130; i++) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
- session.executeNonQueryStatement(
- String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
- session.executeNonQueryStatement(
- String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
- }
- }
- Thread.sleep(2000);
-
- System.out.println(" Polling (expecting 90 total from 3 tables)...");
- PollResult result = pollUntilComplete(consumer, 90, 100);
- System.out.println(" Result: " + result);
-
- assertEquals("Expected exactly 90 rows total (30 per table)", 90, result.totalRows);
- if (!result.rowsPerTable.isEmpty()) {
- System.out.println(" Rows per table: " + result.rowsPerTable);
- for (String tbl : new String[] {"t1", "t2", "t3"}) {
- Integer tblRows = result.rowsPerTable.get(tbl);
- assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0);
- }
- }
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
-
- // ============================
- // Test 7: Multi Column Types (Table Model Equivalent of Aligned Timeseries)
- // ============================
- /**
- * Creates a table with 6 different FIELD types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and
- * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are
- * delivered correctly. This is the table model equivalent of the aligned timeseries test.
- */
- private static void testMultiColumnTypes() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- ISubscriptionTablePullConsumer consumer = null;
-
- try {
- // Create table with multiple field types
- try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(
- session,
- database,
- "t1",
- "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
- + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
- + "s_text TEXT FIELD");
- session.executeNonQueryStatement("USE " + database);
- // Write initial row to force DataRegion creation
- session.executeNonQueryStatement(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
- + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
-
- createTopicTable(topicName, database, ".*");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
-
- // Write 50 rows, each with all 6 data types in a single INSERT
- System.out.println(" Writing 50 rows with 6 data types per row");
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- for (int i = 1; i <= 50; i++) {
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)"
- + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)",
- i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i));
- }
- }
- Thread.sleep(2000);
-
- System.out.println(" Polling...");
- PollResult result = pollUntilComplete(consumer, 50, 70);
- System.out.println(" Result: " + result);
-
- assertEquals("Expected exactly 50 rows with all field types", 50, result.totalRows);
- // Verify we see columns for multiple data types
- System.out.println(" Seen columns: " + result.seenColumns);
- assertAtLeast(
- "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
-
- // ============================
- // Test 8: Poll Without Commit (Re-delivery)
- // ============================
- /**
- * Tests at-least-once delivery with a mixed commit/no-commit pattern.
- *
- * Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we
- * track committed ROWS (not events). The state machine alternates:
- *
- *
- * Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next
- * poll verifies the EXACT SAME timestamps are re-delivered, then commit.
- * Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data.
- *
- *
- * This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal
- * commit path in an interleaved fashion.
- */
- private static void testPollWithoutCommit() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- ISubscriptionTablePullConsumer consumer = null;
-
- try {
- try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
- session.executeNonQueryStatement("USE " + database);
- session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
-
- createTopicTable(topicName, database, ".*");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
-
- // Write 50 rows
final int totalRows = 50;
System.out.println(" Writing " + totalRows + " rows");
try (ITableSession session = openTableSession()) {
@@ -989,7 +868,6 @@ private static void testPollWithoutCommit() throws Exception {
}
Thread.sleep(3000);
- // State machine: alternate between skip-commit and direct-commit.
int totalRowsCommitted = 0;
int roundNumber = 0;
boolean hasPending = false;
@@ -1005,7 +883,6 @@ private static void testPollWithoutCommit() throws Exception {
}
for (SubscriptionMessage msg : msgs) {
- // Extract ALL timestamps from this event
List<Long> currentTimestamps = new ArrayList<>();
for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
while (ds.hasNext()) {
@@ -1015,7 +892,6 @@ private static void testPollWithoutCommit() throws Exception {
assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
if (hasPending) {
- // === Re-delivery round: verify EXACT same timestamps ===
assertTrue(
"Re-delivery timestamp list mismatch: expected="
+ pendingTimestamps
@@ -1036,7 +912,6 @@ private static void testPollWithoutCommit() throws Exception {
+ "] Re-delivered & committed: timestamps="
+ currentTimestamps);
} else {
- // === New event round ===
if (totalRowsCommitted > 0) {
boolean overlap = false;
for (Long ts : currentTimestamps) {
@@ -1046,12 +921,7 @@ private static void testPollWithoutCommit() throws Exception {
}
}
assertTrue(
- "After commit, should receive different data (timestamps="
- + currentTimestamps
- + " overlap with committed="
- + allCommittedTimestamps
- + ")",
- !overlap);
+ "After commit, should receive different data (overlap detected)", !overlap);
}
if (roundNumber % 2 == 0) {
@@ -1086,7 +956,6 @@ private static void testPollWithoutCommit() throws Exception {
"Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
redeliveryCount > 0);
- // Final poll: should be empty
System.out.println(" Final poll: expecting no data");
int extraRows = 0;
for (int i = 0; i < 3; i++) {
@@ -1101,7 +970,6 @@ private static void testPollWithoutCommit() throws Exception {
}
}
assertEquals("After all committed, should receive no more data", 0, extraRows);
-
System.out.println(
" At-least-once re-delivery verified: "
+ totalRows
@@ -1113,16 +981,22 @@ private static void testPollWithoutCommit() throws Exception {
}
}
- // ============================
- // Test 9: Multi Consumer Group Independent Consumption
- // ============================
+ // ======================================================================
+ // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic)
+ // ======================================================================
/**
- * Two consumer groups subscribe to the same topic. Verifies that each group independently
- * receives ALL data (data is not partitioned/split between groups).
+ * Verifies:
+ *
+ * <ul>
+ * <li>Two consumer groups on same topic: each group gets ALL data independently
+ * <li>One consumer subscribes to two topics with different TABLE_KEY filters: each topic
+ * delivers only matching data
+ * </ul>
*/
- private static void testMultiConsumerGroupIndependent() throws Exception {
+ private static void testMultiEntityIsolation() throws Exception {
String database = nextDatabase();
- String topicName = nextTopic();
+ String topicName1 = "topic_tbl_multi_" + testCounter + "_a";
+ String topicName2 = "topic_tbl_multi_" + testCounter + "_b";
String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a";
String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a";
String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b";
@@ -1131,163 +1005,94 @@ private static void testMultiConsumerGroupIndependent() throws Exception {
ISubscriptionTablePullConsumer consumer2 = null;
try {
- // Create database and initial data
+ // Setup: database with t1 and t2
try (ITableSession session = openTableSession()) {
createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database);
+ session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
+ session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- createTopicTable(topicName, database, ".*");
+ // Topic 1: covers t1 only, Topic 2: covers t2 only
+ createTopicTable(topicName1, database, "t1");
+ createTopicTable(topicName2, database, "t2");
Thread.sleep(1000);
- // Two consumers in different groups both subscribe to the same topic
+ // Consumer 1 (group A): subscribes to BOTH topics
consumer1 = createConsumer(consumerId1, consumerGroupId1);
- consumer1.subscribe(topicName);
+ consumer1.subscribe(topicName1, topicName2);
+ // Consumer 2 (group B): subscribes to BOTH topics
consumer2 = createConsumer(consumerId2, consumerGroupId2);
- consumer2.subscribe(topicName);
+ consumer2.subscribe(topicName1, topicName2);
Thread.sleep(3000);
- // Write 50 rows
- System.out.println(" Writing 50 rows");
+ // Write 30 rows to t1, 40 rows to t2
+ System.out.println(" Writing 30 rows to t1, 40 rows to t2");
try (ITableSession session = openTableSession()) {
session.executeNonQueryStatement("USE " + database);
- for (int i = 1; i <= 50; i++) {
+ for (int i = 1; i <= 40; i++) {
+ if (i <= 30) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ }
session.executeNonQueryStatement(
- String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
+ String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
}
}
Thread.sleep(2000);
- // Poll from group 1
- System.out.println(" Polling from consumer group 1...");
- PollResult result1 = pollUntilComplete(consumer1, 50, 70);
+ // Part A: Both groups should get 70 rows independently
+ System.out.println(" Part A: Multi-group isolation");
+ System.out.println(" Polling from group 1...");
+ PollResult result1 = pollUntilComplete(consumer1, 70, 80);
System.out.println(" Group 1 result: " + result1);
- // Poll from group 2
- System.out.println(" Polling from consumer group 2...");
- PollResult result2 = pollUntilComplete(consumer2, 50, 70);
+ System.out.println(" Polling from group 2...");
+ PollResult result2 = pollUntilComplete(consumer2, 70, 80);
System.out.println(" Group 2 result: " + result2);
- // Both groups should have all 50 rows
- assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows);
- assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows);
+ assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows);
+ assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows);
+
+ // Part B: Verify per-topic table isolation
+ if (!result1.rowsPerTable.isEmpty()) {
+ Integer t1Rows = result1.rowsPerTable.get("t1");
+ Integer t2Rows = result1.rowsPerTable.get("t2");
+ assertEquals("Expected 30 rows from t1 (topic1)", 30, t1Rows != null ? t1Rows : 0);
+ assertEquals("Expected 40 rows from t2 (topic2)", 40, t2Rows != null ? t2Rows : 0);
+ System.out.println(" Multi-topic isolation verified: t1=" + t1Rows + ", t2=" + t2Rows);
+ }
System.out.println(
- " Independent consumption verified: group1="
+ " Multi-group isolation verified: group1="
+ result1.totalRows
+ ", group2="
+ result2.totalRows);
} finally {
- // Clean up both consumers
if (consumer1 != null) {
try {
- consumer1.unsubscribe(topicName);
+ consumer1.unsubscribe(topicName1, topicName2);
} catch (Exception e) {
- // ignore
+ /* ignore */
}
try {
consumer1.close();
} catch (Exception e) {
- // ignore
+ /* ignore */
}
}
if (consumer2 != null) {
try {
- consumer2.unsubscribe(topicName);
+ consumer2.unsubscribe(topicName1, topicName2);
} catch (Exception e) {
- // ignore
+ /* ignore */
}
try {
consumer2.close();
} catch (Exception e) {
- // ignore
- }
- }
- dropTopicTable(topicName);
- deleteDatabase(database);
- }
- }
-
- // ============================
- // Test 10: Multi Topic Subscription
- // ============================
- /**
- * One consumer subscribes to two different topics with different TABLE_KEY filters. Verifies that
- * each topic delivers only its matching data, and no cross-contamination occurs.
- */
- private static void testMultiTopicSubscription() throws Exception {
- String database = nextDatabase();
- String topicName1 = "topic_tbl_multi_" + testCounter + "_a";
- String topicName2 = "topic_tbl_multi_" + testCounter + "_b";
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- ISubscriptionTablePullConsumer consumer = null;
-
- try {
- // Create database with two tables
- try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
- session.executeNonQueryStatement("USE " + database);
- session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
- session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
-
- // Topic 1: covers t1 only
- createTopicTable(topicName1, database, "t1");
- // Topic 2: covers t2 only
- createTopicTable(topicName2, database, "t2");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName1, topicName2);
- Thread.sleep(3000);
-
- // Write 30 rows to t1 and 40 rows to t2
- System.out.println(" Writing 30 rows to t1, 40 rows to t2");
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- for (int i = 1; i <= 40; i++) {
- if (i <= 30) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
- }
- session.executeNonQueryStatement(
- String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
- }
- }
- Thread.sleep(2000);
-
- // Poll all data — should get t1 rows (via topic1) + t2 rows (via topic2)
- System.out.println(" Polling (expecting 30 from t1 + 40 from t2 = 70 total)...");
- PollResult result = pollUntilComplete(consumer, 70, 80);
- System.out.println(" Result: " + result);
-
- assertEquals("Expected exactly 70 rows total (30 t1 + 40 t2)", 70, result.totalRows);
- if (!result.rowsPerTable.isEmpty()) {
- Integer t1Rows = result.rowsPerTable.get("t1");
- Integer t2Rows = result.rowsPerTable.get("t2");
- assertEquals("Expected 30 rows from t1", 30, t1Rows != null ? t1Rows : 0);
- assertEquals("Expected 40 rows from t2", 40, t2Rows != null ? t2Rows : 0);
- System.out.println(
- " Multi-topic isolation verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows");
- }
- } finally {
- // Clean up consumer, both topics, and database
- if (consumer != null) {
- try {
- consumer.unsubscribe(topicName1, topicName2);
- } catch (Exception e) {
- // ignore
- }
- try {
- consumer.close();
- } catch (Exception e) {
- // ignore
+ /* ignore */
}
}
dropTopicTable(topicName1);
@@ -1296,51 +1101,40 @@ private static void testMultiTopicSubscription() throws Exception {
}
}
- // ============================
- // Test 12: Cross-Partition Multi-Write
- // ============================
+ // ======================================================================
+ // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix)
+ // ======================================================================
/**
- * Tests that cross-partition writes via all table model write methods are correctly delivered.
+ * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The
+ * pending queue overflow triggers gaps, which should be recovered from WAL.
*
- * Uses timestamps spaced >1 week apart (default partition interval = 604,800,000ms) to force
- * cross-partition distribution. Exercises three write paths:
+ *
+ * <p>Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one
+ * {@code pendingEntries.offer()}. A single {@code session.insert(tablet)} with N rows in one time
+ * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually
+ * overflow, we need 4096+ individual write() calls arriving faster than the prefetch
+ * thread can drain. We achieve this with multiple concurrent writer threads, each performing
+ * individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate.
*
- *
- * Method 1: SQL single-row INSERT (2 rows, separate partitions)
- * Method 2: SQL multi-row INSERT (3 rows spanning 3 partitions in one statement)
- * Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions
- *
+ * Note: Gap occurrence is inherently timing-dependent (race between writers and the
+ * prefetch drain loop). This test maximizes the probability by using concurrent threads, but
+ * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling
+ * from WAL" messages to confirm the gap path was exercised.
*
- *
- * <p>The table has 6 FIELD columns (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) plus 1 TAG. Total
- * expected rows: 2 + 3 + 4 = 9.
- *
- *
- * <p>This test verifies that when a SQL multi-row INSERT or Tablet write spans multiple time
- * partitions (causing the plan node to be split into sub-nodes for each partition), all sub-nodes
- * are correctly converted by the consensus subscription pipeline.
+ *
+ * <p>Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to
+ * the next prefetch iteration.
*/
- private static void testCrossPartitionMultiWrite() throws Exception {
+ private static void testBurstWriteGapRecovery() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
String consumerId = nextConsumerId();
ISubscriptionTablePullConsumer consumer = null;
- // Gap > default time partition interval (7 days = 604,800,000ms)
- final long GAP = 604_800_001L;
- final String TABLE = "t1";
- final String SCHEMA =
- "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, "
- + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, "
- + "s_text TEXT FIELD";
-
try {
- // Create database and table, write init row to force DataRegion creation
try (ITableSession session = openTableSession()) {
- createDatabaseAndTable(session, database, TABLE, SCHEMA);
+ createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
session.executeNonQueryStatement("USE " + database);
- session.executeNonQueryStatement(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
- + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)");
+ session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)");
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -1352,123 +1146,92 @@ private static void testCrossPartitionMultiWrite() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing cross-partition data via 3 methods...");
+ // Use multiple concurrent writer threads with individual SQL INSERTs.
+ // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer().
+ // With N threads writing concurrently, aggregate rate should exceed drain rate
+ // and overflow the 4096-capacity queue, creating gaps.
+ final int writerThreads = 4;
+ final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096
+ final int totalRows = writerThreads * rowsPerThread;
+ final AtomicInteger errorCount = new AtomicInteger(0);
+ final CountDownLatch startLatch = new CountDownLatch(1);
+ final CountDownLatch doneLatch = new CountDownLatch(writerThreads);
- // --- Method 1: SQL single-row INSERT (2 rows, each in its own partition) ---
- long baseTs = 1_000_000_000L;
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- long ts1 = baseTs;
- long ts2 = baseTs + GAP;
- System.out.println(" Method 1: SQL single-row x2 (ts=" + ts1 + ", " + ts2 + ")");
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
- + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'sql_single_1', %d)",
- ts1));
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
- + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'sql_single_2', %d)",
- ts2));
- }
-
- // --- Method 2: SQL multi-row INSERT (3 rows spanning 3 different partitions) ---
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
- long t1 = baseTs + GAP * 2;
- long t2 = baseTs + GAP * 3;
- long t3 = baseTs + GAP * 4;
- System.out.println(
- " Method 2: SQL multi-row x3 (ts=" + t1 + ", " + t2 + ", " + t3 + ")");
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) "
- + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'sql_multi_1', %d), "
- + "('d1', 4, 400, 4.4, 4.44, false, 'sql_multi_2', %d), "
- + "('d1', 5, 500, 5.5, 5.55, true, 'sql_multi_3', %d)",
- t1, t2, t3));
+ System.out.println(
+ " Burst writing "
+ + totalRows
+ + " rows via "
+ + writerThreads
+ + " concurrent threads ("
+ + rowsPerThread
+ + " individual SQL INSERTs each)");
+ System.out.println(
+ " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)");
+
+ ExecutorService executor = Executors.newFixedThreadPool(writerThreads);
+ for (int t = 0; t < writerThreads; t++) {
+ final int threadId = t;
+ final int startTs = threadId * rowsPerThread + 1;
+ executor.submit(
+ () -> {
+ try {
+ startLatch.await(); // all threads start at the same time
+ try (ITableSession session = openTableSession()) {
+ session.executeNonQueryStatement("USE " + database);
+ for (int i = 0; i < rowsPerThread; i++) {
+ int ts = startTs + i;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)",
+ (long) ts * 10, ts));
+ }
+ }
+ } catch (Exception e) {
+ System.out.println(" Writer thread " + threadId + " error: " + e.getMessage());
+ errorCount.incrementAndGet();
+ } finally {
+ doneLatch.countDown();
+ }
+ });
}
- // --- Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions ---
- try (ITableSession session = openTableSession()) {
- session.executeNonQueryStatement("USE " + database);
-
- List schemaList = new ArrayList<>();
- schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING));
- schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32));
- schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64));
- schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
- schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
- schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
- schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING));
-
- List categories =
- java.util.Arrays.asList(
- ColumnCategory.TAG,
- ColumnCategory.FIELD,
- ColumnCategory.FIELD,
- ColumnCategory.FIELD,
- ColumnCategory.FIELD,
- ColumnCategory.FIELD,
- ColumnCategory.FIELD);
-
- Tablet tablet =
- new Tablet(
- TABLE,
- IMeasurementSchema.getMeasurementNameList(schemaList),
- IMeasurementSchema.getDataTypeList(schemaList),
- categories,
- 10);
+ // Fire all threads simultaneously
+ startLatch.countDown();
+ doneLatch.await();
+ executor.shutdown();
- for (int i = 0; i < 4; i++) {
- int row = tablet.getRowSize();
- long ts = baseTs + GAP * (5 + i); // partitions 5, 6, 7, 8
- tablet.addTimestamp(row, ts);
- tablet.addValue("tag1", row, "d1");
- tablet.addValue("s_int32", row, 6 + i);
- tablet.addValue("s_int64", row, (long) (600 + i * 100));
- tablet.addValue("s_float", row, (6 + i) * 1.1f);
- tablet.addValue("s_double", row, (6 + i) * 2.22);
- tablet.addValue("s_bool", row, i % 2 == 0);
- tablet.addValue("s_text", row, "tablet_" + (i + 1));
- }
- System.out.println(
- " Method 3: Tablet x4 (ts=" + (baseTs + GAP * 5) + ".." + (baseTs + GAP * 8) + ")");
- session.insert(tablet);
+ if (errorCount.get() > 0) {
+ System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors");
}
- Thread.sleep(2000);
-
- // Poll — expect 9 rows total (2 + 3 + 4)
- final int expectedRows = 9;
- System.out.println(" Polling (expecting " + expectedRows + " rows)...");
- PollResult result = pollUntilComplete(consumer, expectedRows, 80);
+ // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes
+ System.out.println(
+ " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)...");
+ System.out.println(
+ " (Check server logs for 'gap detected' to confirm gap recovery was triggered)");
+ PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true);
System.out.println(" Result: " + result);
assertEquals(
- "Expected exactly " + expectedRows + " cross-partition rows",
- expectedRows,
+ "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)",
+ totalRows,
result.totalRows);
- // Verify we see all 6 FIELD columns plus tag
- assertAtLeast(
- "Expected at least 6 data columns in cross-partition result",
- 6,
- result.seenColumns.size());
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 11: Flush Data Delivery
- // ============================
+ // ======================================================================
+ // Test 8: Commit After Unsubscribe (NEW — tests H7 fix)
+ // ======================================================================
/**
- * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable
- * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps
- * entries available until committed by the subscription consumer.
+ * Tests that commit still works correctly after the consumer has unsubscribed (queue has been
+ * torn down). The commit routing should use metadata-based topic config check instead of runtime
+ * queue state.
+ *
+ * Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue().
*/
- private static void testFlushDataDelivery() throws Exception {
+ private static void testCommitAfterUnsubscribe() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -1491,26 +1254,76 @@ private static void testFlushDataDelivery() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- // Write 50 rows, then flush before polling
- System.out.println(" Writing 50 rows then flushing");
+ // Write data
+ System.out.println(" Writing 50 rows");
try (ITableSession session = openTableSession()) {
session.executeNonQueryStatement("USE " + database);
for (int i = 1; i <= 50; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
}
- System.out.println(" Flushing...");
- session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Poll — all 50 rows should be delivered despite flush
- System.out.println(" Polling after flush...");
- PollResult result = pollUntilComplete(consumer, 50, 70);
- System.out.println(" Result: " + result);
- assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows);
+ // Poll WITHOUT commit
+ System.out.println(" Polling WITHOUT commit...");
+ List uncommittedMessages = new ArrayList<>();
+ int polledRows = 0;
+ for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) {
+ List msgs = consumer.poll(Duration.ofMillis(2000));
+ if (msgs.isEmpty()) {
+ if (polledRows > 0) break;
+ Thread.sleep(500);
+ continue;
+ }
+ for (SubscriptionMessage msg : msgs) {
+ uncommittedMessages.add(msg);
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ polledRows++;
+ }
+ }
+ }
+ }
+ System.out.println(
+ " Polled "
+ + polledRows
+ + " rows, holding "
+ + uncommittedMessages.size()
+ + " uncommitted messages");
+ assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows);
+
+ // Unsubscribe (tears down the consensus queue)
+ System.out.println(" Unsubscribing (queue teardown)...");
+ consumer.unsubscribe(topicName);
+ Thread.sleep(2000);
+
+ // Now commit the previously polled messages — should NOT throw
+ System.out.println(
+ " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe...");
+ boolean commitSucceeded = true;
+ for (SubscriptionMessage msg : uncommittedMessages) {
+ try {
+ consumer.commitSync(msg);
+ } catch (Exception e) {
+ System.out.println(" Commit threw exception: " + e.getMessage());
+ commitSucceeded = false;
+ }
+ }
+
+ System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded);
+ System.out.println(" (Key: no exception crash, routing handled gracefully)");
} finally {
- cleanup(consumer, topicName, database);
+ if (consumer != null) {
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ /* ignore */
+ }
+ }
+ dropTopicTable(topicName);
+ deleteDatabase(database);
}
}
}
diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
index 1ab7a910c0324..501b789edd738 100644
--- a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
+++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java
@@ -43,6 +43,10 @@
import java.util.Map;
import java.util.Properties;
import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
/** TODO: move these manual tests into ITs */
public class ConsensusSubscriptionTest {
@@ -62,46 +66,29 @@ public static void main(String[] args) throws Exception {
String targetTest = args.length > 0 ? args[0] : null;
- if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) {
- runTest("testBasicDataDelivery", ConsensusSubscriptionTest::testBasicDataDelivery);
+ if (targetTest == null || "testBasicFlow".equals(targetTest)) {
+ runTest("testBasicFlow", ConsensusSubscriptionTest::testBasicFlow);
}
- if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) {
- runTest("testMultipleDataTypes", ConsensusSubscriptionTest::testMultipleDataTypes);
+ if (targetTest == null || "testDataTypes".equals(targetTest)) {
+ runTest("testDataTypes", ConsensusSubscriptionTest::testDataTypes);
}
- if (targetTest == null || "testDeviceLevelFiltering".equals(targetTest)) {
- runTest("testDeviceLevelFiltering", ConsensusSubscriptionTest::testDeviceLevelFiltering);
- }
- if (targetTest == null || "testTimeseriesLevelFiltering".equals(targetTest)) {
- runTest(
- "testTimeseriesLevelFiltering", ConsensusSubscriptionTest::testTimeseriesLevelFiltering);
+ if (targetTest == null || "testPathFiltering".equals(targetTest)) {
+ runTest("testPathFiltering", ConsensusSubscriptionTest::testPathFiltering);
}
if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) {
runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion);
}
- if (targetTest == null || "testMultipleDevicesAggregation".equals(targetTest)) {
- runTest(
- "testMultipleDevicesAggregation",
- ConsensusSubscriptionTest::testMultipleDevicesAggregation);
- }
- if (targetTest == null || "testAlignedTimeseries".equals(targetTest)) {
- runTest("testAlignedTimeseries", ConsensusSubscriptionTest::testAlignedTimeseries);
- }
- if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) {
- runTest("testPollWithoutCommit", ConsensusSubscriptionTest::testPollWithoutCommit);
- }
- if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) {
- runTest(
- "testMultiConsumerGroupIndependent",
- ConsensusSubscriptionTest::testMultiConsumerGroupIndependent);
+ if (targetTest == null || "testRedelivery".equals(targetTest)) {
+ runTest("testRedelivery", ConsensusSubscriptionTest::testRedelivery);
}
- if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) {
- runTest("testMultiTopicSubscription", ConsensusSubscriptionTest::testMultiTopicSubscription);
+ if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) {
+ runTest("testMultiEntityIsolation", ConsensusSubscriptionTest::testMultiEntityIsolation);
}
- if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) {
- runTest("testFlushDataDelivery", ConsensusSubscriptionTest::testFlushDataDelivery);
+ if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) {
+ runTest("testBurstWriteGapRecovery", ConsensusSubscriptionTest::testBurstWriteGapRecovery);
}
- if (targetTest == null || "testCrossPartitionAligned".equals(targetTest)) {
- runTest("testCrossPartitionAligned", ConsensusSubscriptionTest::testCrossPartitionAligned);
+ if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) {
+ runTest("testCommitAfterUnsubscribe", ConsensusSubscriptionTest::testCommitAfterUnsubscribe);
}
// Summary
@@ -407,14 +394,20 @@ private static void assertAtLeast(String msg, int min, int actual) {
}
}
- // ============================
- // Test 1: Basic Data Delivery
- // ============================
+ // ======================================================================
+ // Test 1: Basic Flow (merged: BasicDataDelivery + MultiDevices + Flush)
+ // ======================================================================
/**
- * Verifies the basic consensus subscription flow: write before subscribe (not received), write
- * after subscribe (received), and no extra data beyond expectation.
+ * Verifies:
+ *
+ *
+ * <ul>
+ *   <li>Data written BEFORE subscribe is NOT received
+ *   <li>Multiple devices (d1, d2, d3) written AFTER subscribe are all received
+ *   <li>Flush does not cause data loss (WAL pinning keeps entries available)
+ *   <li>Exact row count matches expectation
+ * </ul>
- private static void testBasicDataDelivery() throws Exception {
+ private static void testBasicFlow() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -422,16 +415,19 @@ private static void testBasicDataDelivery() throws Exception {
SubscriptionTreePullConsumer consumer = null;
try {
- // Step 1: Write initial data to create DataRegion
+ // Step 1: Write initial data to create DataRegion (should NOT be received)
System.out.println(" Step 1: Writing initial data (should NOT be received)");
try (ISession session = openSession()) {
createDatabase(session, database);
for (int i = 0; i < 50; i++) {
session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)",
- database, i, i * 10, i * 1.5));
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
}
+ // Also write to d2, d3 for multi-device readiness
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -445,48 +441,79 @@ private static void testBasicDataDelivery() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- // Step 3: Write new data AFTER subscription
- System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)");
+ // Step 3: Write to 3 devices (30 rows each = 90 total), then flush
+ System.out.println(" Step 3: Writing 30 rows x 3 devices AFTER subscribe, then flush");
try (ISession session = openSession()) {
- for (int i = 100; i < 200; i++) {
+ for (int i = 100; i < 130; i++) {
session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)",
- database, i, i * 10, i * 1.5));
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30));
}
+ System.out.println(" Flushing...");
+ session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Step 4: Poll and verify exact count (also verifies no extra data)
+ // Step 4: Poll and verify
System.out.println(" Step 4: Polling...");
- PollResult result = pollUntilComplete(consumer, 100, 100);
+ PollResult result = pollUntilComplete(consumer, 90, 100);
System.out.println(" Result: " + result);
- assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows);
+ assertEquals("Expected exactly 90 rows (30 per device)", 90, result.totalRows);
+ if (!result.rowsPerDevice.isEmpty()) {
+ System.out.println(" Rows per device: " + result.rowsPerDevice);
+ for (String dev : new String[] {"d1", "d2", "d3"}) {
+ Integer devRows = result.rowsPerDevice.get(database + "." + dev);
+ assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0);
+ }
+ }
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 2: Multiple Data Types (Non-Aligned)
- // ============================
+ // ======================================================================
+ // Test 2: Data Types (merged: MultipleDataTypes + Aligned + CrossPartition)
+ // ======================================================================
/**
- * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using
- * separate INSERT statements per type (non-aligned), and verifies all types are delivered.
+ * Verifies:
+ *
+ * <ul>
+ *   <li>Non-aligned: 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT)
+ *   <li>Aligned: 6 data types, cross-partition timestamps (>1 week apart)
+ *   <li>6 write methods: SQL single/multi-row, insertAlignedRecord/Records/Tablet/Tablets
+ * </ul>
*/
- private static void testMultipleDataTypes() throws Exception {
+ private static void testDataTypes() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
String consumerId = nextConsumerId();
SubscriptionTreePullConsumer consumer = null;
+ final long GAP = 604_800_001L; // slightly over 1 week
try {
try (ISession session = openSession()) {
createDatabase(session, database);
+ // Create aligned timeseries
+ session.executeNonQueryStatement(
+ String.format(
+ "CREATE ALIGNED TIMESERIES %s.d_aligned"
+ + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
+ + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
+ database));
+ // Init rows to force DataRegion creation
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
+ database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -498,8 +525,29 @@ private static void testMultipleDataTypes() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing data with 6 data types x 20 rows each");
+ int totalExpected = 0;
+ final String device = database + ".d_aligned";
+      List<String> measurements =
+          Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text");
+      List<TSDataType> types =
+ Arrays.asList(
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.BOOLEAN,
+ TSDataType.TEXT);
+      List<MeasurementSchema> schemas = new ArrayList<>();
+ schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32));
+ schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64));
+ schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
+ schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
+ schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
+ schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT));
+
try (ISession session = openSession()) {
+ // --- Part A: Non-aligned, 6 types x 20 rows ---
+ System.out.println(" Part A: Non-aligned 6 data types x 20 rows");
for (int i = 1; i <= 20; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i));
@@ -521,93 +569,103 @@ private static void testMultipleDataTypes() throws Exception {
String.format(
"INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i));
}
- }
- Thread.sleep(2000);
-
- System.out.println(" Polling...");
- PollResult result = pollUntilComplete(consumer, 120, 120);
- System.out.println(" Result: " + result);
-
- assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows);
- System.out.println(" Seen columns: " + result.seenColumns);
- assertTrue(
- "Expected multiple column types in result, got: " + result.seenColumns,
- result.seenColumns.size() > 1);
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
+ totalExpected += 120; // 6 types x 20 rows
- // ============================
- // Test 3: Device-Level Filtering
- // ============================
- /**
- * Creates a topic that only matches root.db.d1.** and verifies that data written to d2 is NOT
- * delivered.
- */
- private static void testDeviceLevelFiltering() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- SubscriptionTreePullConsumer consumer = null;
+ // --- Part B: Aligned cross-partition, 6 write methods ---
+ System.out.println(" Part B: Aligned cross-partition, 6 write methods");
- try {
- try (ISession session = openSession()) {
- createDatabase(session, database);
+ // Method 1: SQL single row
+ long t1 = 1;
session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')",
+ database, t1));
+ totalExpected += 1;
+
+ // Method 2: SQL multi-row (cross-partition)
+ long t2a = 1 + GAP;
+ long t2b = 1 + 2 * GAP;
session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
+ String.format(
+ "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
+ + " s_double, s_bool, s_text)"
+ + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a'),"
+ + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')",
+ database, t2a, t2b));
+ totalExpected += 2;
- String filterPath = database + ".d1.**";
- createTopic(topicName, filterPath);
- Thread.sleep(1000);
+ // Method 3: insertAlignedRecord
+ long t3 = 1 + 3 * GAP;
+ session.insertAlignedRecord(
+ device,
+ t3,
+ measurements,
+ types,
+ Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single"));
+ totalExpected += 1;
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
+ // Method 4: insertAlignedRecordsOfOneDevice (cross-partition)
+ long t4a = 1 + 4 * GAP;
+ long t4b = 1 + 5 * GAP;
+ session.insertAlignedRecordsOfOneDevice(
+ device,
+ Arrays.asList(t4a, t4b),
+ Arrays.asList(measurements, measurements),
+ Arrays.asList(types, types),
+ Arrays.asList(
+ Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"),
+ Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b")));
+ totalExpected += 2;
- System.out.println(" Writing to both d1 and d2 (topic filter: d1.** only)");
- try (ISession session = openSession()) {
- for (int i = 100; i < 150; i++) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
- }
+ // Method 5: insertAlignedTablet (cross-partition)
+ long t5a = 1 + 6 * GAP;
+ long t5b = 1 + 7 * GAP;
+ Tablet tablet5 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a");
+ addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b");
+ session.insertAlignedTablet(tablet5);
+ totalExpected += 2;
+
+ // Method 6: insertAlignedTablets (cross-partition)
+ long t6a = 1 + 8 * GAP;
+ long t6b = 1 + 9 * GAP;
+ Tablet tablet6 = new Tablet(device, schemas, 2);
+ addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a");
+ addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b");
+        Map<String, Tablet> tabletMap = new HashMap<>();
+ tabletMap.put(device, tablet6);
+ session.insertAlignedTablets(tabletMap);
+ totalExpected += 2;
}
+
+ System.out.println(" Total expected rows: " + totalExpected);
Thread.sleep(2000);
- System.out.println(" Polling (expecting only d1 data)...");
- PollResult result = pollUntilComplete(consumer, 50, 60);
+ PollResult result = pollUntilComplete(consumer, totalExpected, 150);
System.out.println(" Result: " + result);
- assertEquals("Expected exactly 50 rows from d1 only", 50, result.totalRows);
- if (!result.rowsPerDevice.isEmpty()) {
- Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
- assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0);
- Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
- assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0);
- System.out.println(
- " Device filtering verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows");
- }
+ assertAtLeast(
+ "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows);
+ assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size());
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 4: Timeseries-Level Filtering
- // ============================
+ // ======================================================================
+ // Test 3: Path Filtering (merged: DeviceLevel + TimeseriesLevel)
+ // ======================================================================
/**
- * Creates a topic matching root.db.d1.s1 only. Tests whether the converter filters at measurement
- * level. Lenient: if both s1 and s2 arrive, reports device-level-only filtering.
+ * Verifies:
+ *
+ * <ul>
+ *   <li>Device-level: topic on d1.** does NOT deliver d2 data
+ *   <li>Timeseries-level: topic on d1.s1 — lenient check for s2 filtering
+ * </ul>
*/
- private static void testTimeseriesLevelFiltering() throws Exception {
+ private static void testPathFiltering() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -619,10 +677,13 @@ private static void testTimeseriesLevelFiltering() throws Exception {
createDatabase(session, database);
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
+ // Topic filters d1.s1 only (timeseries-level)
String filterPath = database + ".d1.s1";
createTopic(topicName, filterPath);
Thread.sleep(1000);
@@ -631,39 +692,50 @@ private static void testTimeseriesLevelFiltering() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing to d1.s1 and d1.s2 (topic filter: d1.s1 only)");
+ System.out.println(" Writing to d1 (s1 + s2) and d2 (s1)");
try (ISession session = openSession()) {
for (int i = 100; i < 150; i++) {
session.executeNonQueryStatement(
String.format(
"INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)",
database, i, i * 10, i * 20));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 30));
}
}
Thread.sleep(2000);
- System.out.println(" Polling (expecting only s1 data)...");
+ System.out.println(" Polling (expecting d1 data only, ideally s1 only)...");
PollResult result = pollUntilComplete(consumer, 50, 60);
System.out.println(" Result: " + result);
- System.out.println(" Seen columns: " + result.seenColumns);
+ // Device-level: d2 must NOT appear
+ if (!result.rowsPerDevice.isEmpty()) {
+ Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
+ assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0);
+ Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
+ assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0);
+ System.out.println(" Device filtering verified: d1=" + d1Rows + ", d2=" + d2Rows);
+ }
+
+ // Timeseries-level: lenient check
boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2"));
if (hasS2) {
System.out.println(
" INFO: Both s1 and s2 received — converter uses device-level filtering only.");
- assertAtLeast("Should have received some rows", 50, result.totalRows);
+ assertAtLeast("Should have received d1 rows", 50, result.totalRows);
} else {
System.out.println(" Timeseries-level filtering verified: only s1 data received");
- assertEquals("Expected exactly 50 rows from s1 only", 50, result.totalRows);
+ assertEquals("Expected exactly 50 rows from d1.s1 only", 50, result.totalRows);
}
} finally {
cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 5: Subscribe Before Region Creation
- // ============================
+ // ======================================================================
+ // Test 4: Subscribe Before Region Creation (kept as-is)
+ // ======================================================================
/**
* Subscribe BEFORE the database/region exists, then create database and write. Tests the
* IoTConsensus.onNewPeerCreated auto-binding path.
@@ -695,7 +767,7 @@ private static void testSubscribeBeforeRegion() throws Exception {
}
Thread.sleep(5000);
- System.out.println(" Step 4: Polling (auto-binding should have picked up new region)...");
+ System.out.println(" Step 4: Polling...");
PollResult result = pollUntilComplete(consumer, 100, 100);
System.out.println(" Result: " + result);
@@ -714,11 +786,20 @@ private static void testSubscribeBeforeRegion() throws Exception {
}
}
- // ============================
- // Test 6: Multiple Devices Aggregation
- // ============================
- /** Writes to d1, d2, d3 and verifies all are received via a broad topic path. */
- private static void testMultipleDevicesAggregation() throws Exception {
+ // ======================================================================
+ // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit)
+ // ======================================================================
+ /**
+ * Tests at-least-once delivery with a mixed commit/no-commit pattern.
+ *
+ * <p>Writes 50 rows. Alternates between:
+ *
+ * <ul>
+ *   <li>Even rounds: poll WITHOUT commit → next poll verifies same timestamps → commit
+ *   <li>Odd rounds: poll and commit directly → next poll should deliver DIFFERENT data
+ * </ul>
+ */
+ private static void testRedelivery() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -730,10 +811,6 @@ private static void testMultipleDevicesAggregation() throws Exception {
createDatabase(session, database);
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
@@ -745,194 +822,41 @@ private static void testMultipleDevicesAggregation() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- System.out.println(" Writing to 3 devices (d1, d2, d3), 30 rows each");
+ final int totalRows = 50;
+ System.out.println(" Writing " + totalRows + " rows");
try (ISession session = openSession()) {
- for (int i = 100; i < 130; i++) {
+ for (int i = 1; i <= totalRows; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30));
}
}
- Thread.sleep(2000);
+ Thread.sleep(3000);
- System.out.println(" Polling (expecting 90 total from 3 devices)...");
- PollResult result = pollUntilComplete(consumer, 90, 100);
- System.out.println(" Result: " + result);
+ int totalRowsCommitted = 0;
+ int roundNumber = 0;
+ boolean hasPending = false;
+      List<Long> pendingTimestamps = new ArrayList<>();
+      Set<Long> allCommittedTimestamps = new HashSet<>();
+ int redeliveryCount = 0;
- assertEquals("Expected exactly 90 rows total (30 per device)", 90, result.totalRows);
- if (!result.rowsPerDevice.isEmpty()) {
- System.out.println(" Rows per device: " + result.rowsPerDevice);
- for (String dev : new String[] {"d1", "d2", "d3"}) {
- Integer devRows = result.rowsPerDevice.get(database + "." + dev);
- assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0);
+ for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(5000));
+ if (msgs.isEmpty()) {
+ Thread.sleep(1000);
+ continue;
}
- }
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
- // ============================
- // Test 7: Aligned Timeseries
- // ============================
- /**
- * Creates aligned timeseries with 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and
- * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are
- * delivered correctly.
- */
- private static void testAlignedTimeseries() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- SubscriptionTreePullConsumer consumer = null;
-
- try {
- // Create aligned timeseries with multiple data types
- try (ISession session = openSession()) {
- createDatabase(session, database);
- session.executeNonQueryStatement(
- String.format(
- "CREATE ALIGNED TIMESERIES %s.d_aligned"
- + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
- + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
- database));
- // Write initial row to force DataRegion creation
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
- + " s_double, s_bool, s_text)"
- + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
- database));
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
-
- createTopic(topicName, database + ".**");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
-
- // Write 50 aligned rows, each with all 6 data types in a single INSERT
- System.out.println(" Writing 50 aligned rows with 6 data types per row");
- try (ISession session = openSession()) {
- for (int i = 1; i <= 50; i++) {
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
- + " s_double, s_bool, s_text)"
- + " VALUES (%d, %d, %d, %f, %f, %s, 'text_%d')",
- database,
- i,
- i,
- (long) i * 100000L,
- i * 1.1f,
- i * 2.2,
- i % 2 == 0 ? "true" : "false",
- i));
- }
- }
- Thread.sleep(2000);
-
- System.out.println(" Polling...");
- PollResult result = pollUntilComplete(consumer, 50, 70);
- System.out.println(" Result: " + result);
-
- assertEquals("Expected exactly 50 aligned rows", 50, result.totalRows);
- // Verify we see columns for multiple data types
- System.out.println(" Seen columns: " + result.seenColumns);
- assertAtLeast(
- "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
-
- // ============================
- // Test 8: Poll Without Commit (Re-delivery)
- // ============================
- /**
- * Tests at-least-once delivery with a mixed commit/no-commit pattern.
- *
- *
- * <p>Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we
- * track committed ROWS (not events). The state machine alternates:
- *
- * <ul>
- *   <li>Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next
- *       poll verifies the EXACT SAME timestamps are re-delivered, then commit.
- *   <li>Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data.
- * </ul>
- *
- * <p>This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal
- */
- private static void testPollWithoutCommit() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- SubscriptionTreePullConsumer consumer = null;
-
- try {
- try (ISession session = openSession()) {
- createDatabase(session, database);
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
- session.executeNonQueryStatement("flush");
- }
- Thread.sleep(2000);
-
- createTopic(topicName, database + ".**");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
-
- // Write 50 rows (may be batched into fewer events by the prefetching thread)
- final int totalRows = 50;
- System.out.println(" Writing " + totalRows + " rows");
- try (ISession session = openSession()) {
- for (int i = 1; i <= totalRows; i++) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
- }
- }
- Thread.sleep(3000);
-
- // State machine: alternate between skip-commit and direct-commit.
- // Track committed ROWS (not events) because batching is unpredictable.
- int totalRowsCommitted = 0;
- int roundNumber = 0; // counts distinct events seen (used for alternation)
- boolean hasPending = false;
-      List<Long> pendingTimestamps = new ArrayList<>(); // timestamps from the uncommitted event
-      Set<Long> allCommittedTimestamps = new HashSet<>(); // all timestamps ever committed
- int redeliveryCount = 0;
-
- for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
-        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(5000));
- if (msgs.isEmpty()) {
- Thread.sleep(1000);
- continue;
- }
-
- for (SubscriptionMessage msg : msgs) {
- // Extract ALL timestamps from this event (may contain multiple rows)
-          List<Long> currentTimestamps = new ArrayList<>();
- for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
- while (ds.hasNext()) {
- currentTimestamps.add(ds.next().getTimestamp());
- }
- }
- assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+ for (SubscriptionMessage msg : msgs) {
+ List currentTimestamps = new ArrayList<>();
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ currentTimestamps.add(ds.next().getTimestamp());
+ }
+ }
+ assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
if (hasPending) {
- // === Re-delivery round: verify EXACT same timestamps ===
+ // Re-delivery round: verify EXACT same timestamps
assertTrue(
"Re-delivery timestamp list mismatch: expected="
+ pendingTimestamps
@@ -953,8 +877,7 @@ private static void testPollWithoutCommit() throws Exception {
+ "] Re-delivered & committed: timestamps="
+ currentTimestamps);
} else {
- // === New event round ===
- // After a commit, verify this is DIFFERENT data (no overlap with committed set)
+ // New event round
if (totalRowsCommitted > 0) {
boolean overlap = false;
for (Long ts : currentTimestamps) {
@@ -964,16 +887,9 @@ private static void testPollWithoutCommit() throws Exception {
}
}
assertTrue(
- "After commit, should receive different data (timestamps="
- + currentTimestamps
- + " overlap with committed="
- + allCommittedTimestamps
- + ")",
- !overlap);
+ "After commit, should receive different data (overlap detected)", !overlap);
}
- // Even-numbered rounds: skip commit (test re-delivery)
- // Odd-numbered rounds: commit directly (test normal flow)
if (roundNumber % 2 == 0) {
pendingTimestamps = new ArrayList<>(currentTimestamps);
hasPending = true;
@@ -1021,7 +937,6 @@ private static void testPollWithoutCommit() throws Exception {
}
}
assertEquals("After all committed, should receive no more data", 0, extraRows);
-
System.out.println(
" At-least-once re-delivery verified: "
+ totalRows
@@ -1033,16 +948,22 @@ private static void testPollWithoutCommit() throws Exception {
}
}
- // ============================
- // Test 9: Multi Consumer Group Independent Consumption
- // ============================
+ // ======================================================================
+ // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic)
+ // ======================================================================
/**
- * Two consumer groups subscribe to the same topic. Verifies that each group independently
- * receives ALL data (data is not partitioned/split between groups).
+ * Verifies:
+ *
+ * <ul>
+ *   <li>Two consumer groups on same topic: each group gets ALL data independently
+ *   <li>One consumer subscribes to two topics with different path filters: each topic delivers
+ *       only matching data
+ * </ul>
*/
- private static void testMultiConsumerGroupIndependent() throws Exception {
+ private static void testMultiEntityIsolation() throws Exception {
String database = nextDatabase();
- String topicName = nextTopic();
+ String topicName1 = "topic_multi_" + testCounter + "_a";
+ String topicName2 = "topic_multi_" + testCounter + "_b";
String consumerGroupId1 = "cg_multi_" + testCounter + "_a";
String consumerId1 = "consumer_multi_" + testCounter + "_a";
String consumerGroupId2 = "cg_multi_" + testCounter + "_b";
@@ -1051,178 +972,231 @@ private static void testMultiConsumerGroupIndependent() throws Exception {
SubscriptionTreePullConsumer consumer2 = null;
try {
- // Create database and initial data
+ // Setup: database with d1 and d2
try (ISession session = openSession()) {
createDatabase(session, database);
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- createTopic(topicName, database + ".**");
+ // Topic 1: covers d1 only, Topic 2: covers d2 only
+ createTopic(topicName1, database + ".d1.**");
+ createTopic(topicName2, database + ".d2.**");
Thread.sleep(1000);
- // Two consumers in different groups both subscribe to the same topic
+ // Consumer 1 (group A): subscribes to BOTH topics
consumer1 = createConsumer(consumerId1, consumerGroupId1);
- consumer1.subscribe(topicName);
+ consumer1.subscribe(topicName1, topicName2);
+ // Consumer 2 (group B): subscribes to BOTH topics
consumer2 = createConsumer(consumerId2, consumerGroupId2);
- consumer2.subscribe(topicName);
+ consumer2.subscribe(topicName1, topicName2);
Thread.sleep(3000);
- // Write 50 rows
- System.out.println(" Writing 50 rows");
+ // Write 30 rows to d1, 40 rows to d2
+ System.out.println(" Writing 30 rows to d1, 40 rows to d2");
try (ISession session = openSession()) {
- for (int i = 1; i <= 50; i++) {
+ for (int i = 1; i <= 40; i++) {
+ if (i <= 30) {
+ session.executeNonQueryStatement(
+ String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ }
session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
+ String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
}
}
Thread.sleep(2000);
- // Poll from group 1
- System.out.println(" Polling from consumer group 1...");
- PollResult result1 = pollUntilComplete(consumer1, 50, 70);
+ // Part A: Both groups should get 70 rows independently
+ System.out.println(" Part A: Multi-group isolation");
+ System.out.println(" Polling from group 1...");
+ PollResult result1 = pollUntilComplete(consumer1, 70, 80);
System.out.println(" Group 1 result: " + result1);
- // Poll from group 2
- System.out.println(" Polling from consumer group 2...");
- PollResult result2 = pollUntilComplete(consumer2, 50, 70);
+ System.out.println(" Polling from group 2...");
+ PollResult result2 = pollUntilComplete(consumer2, 70, 80);
System.out.println(" Group 2 result: " + result2);
- // Both groups should have all 50 rows
- assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows);
- assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows);
+ assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows);
+ assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows);
+
+ // Part B: Verify per-topic device isolation
+ if (!result1.rowsPerDevice.isEmpty()) {
+ Integer d1Rows = result1.rowsPerDevice.get(database + ".d1");
+ Integer d2Rows = result1.rowsPerDevice.get(database + ".d2");
+ assertEquals("Expected 30 rows from d1 (topic1)", 30, d1Rows != null ? d1Rows : 0);
+ assertEquals("Expected 40 rows from d2 (topic2)", 40, d2Rows != null ? d2Rows : 0);
+ System.out.println(" Multi-topic isolation verified: d1=" + d1Rows + ", d2=" + d2Rows);
+ }
System.out.println(
- " Independent consumption verified: group1="
+ " Multi-group isolation verified: group1="
+ result1.totalRows
+ ", group2="
+ result2.totalRows);
} finally {
- // Clean up both consumers
if (consumer1 != null) {
try {
- consumer1.unsubscribe(topicName);
+ consumer1.unsubscribe(topicName1, topicName2);
} catch (Exception e) {
- // ignore
+ /* ignore */
}
try {
consumer1.close();
} catch (Exception e) {
- // ignore
+ /* ignore */
}
}
if (consumer2 != null) {
try {
- consumer2.unsubscribe(topicName);
+ consumer2.unsubscribe(topicName1, topicName2);
} catch (Exception e) {
- // ignore
+ /* ignore */
}
try {
consumer2.close();
} catch (Exception e) {
- // ignore
+ /* ignore */
}
}
- dropTopic(topicName);
+ dropTopic(topicName1);
+ dropTopic(topicName2);
deleteDatabase(database);
}
}
- // ============================
- // Test 10: Multi Topic Subscription
- // ============================
+ // ======================================================================
+ // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix)
+ // ======================================================================
/**
- * One consumer subscribes to two different topics with different path filters. Verifies that each
- * topic delivers only its matching data, and no cross-contamination occurs.
+ * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The
+ * pending queue overflow triggers gaps, which should be recovered from WAL.
+ *
+ * <p>Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one
+ * {@code pendingEntries.offer()}. A single {@code session.insertTablet(tablet)} with N rows in
+ * one time partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To
+ * actually overflow, we need 4096+ individual write() calls arriving faster than the
+ * prefetch thread can drain. We achieve this with multiple concurrent writer threads, each
+ * performing individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate.
+ *
+ *
+ * <p><b>Note:</b> Gap occurrence is inherently timing-dependent (race between writers and the
+ * prefetch drain loop). This test maximizes the probability by using concurrent threads, but
+ * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling
+ * from WAL" messages to confirm the gap path was exercised.
+ *
+ *
+ * <p>Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to
+ * the next prefetch iteration.
*/
- private static void testMultiTopicSubscription() throws Exception {
+ private static void testBurstWriteGapRecovery() throws Exception {
String database = nextDatabase();
- String topicName1 = "topic_multi_" + testCounter + "_a";
- String topicName2 = "topic_multi_" + testCounter + "_b";
+ String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
String consumerId = nextConsumerId();
SubscriptionTreePullConsumer consumer = null;
try {
- // Create database with two device groups
try (ISession session = openSession()) {
createDatabase(session, database);
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Topic 1: covers d1 only
- createTopic(topicName1, database + ".d1.**");
- // Topic 2: covers d2 only
- createTopic(topicName2, database + ".d2.**");
+ createTopic(topicName, database + ".**");
Thread.sleep(1000);
consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName1, topicName2);
+ consumer.subscribe(topicName);
Thread.sleep(3000);
- // Write 30 rows to d1 and 40 rows to d2
- System.out.println(" Writing 30 rows to d1, 40 rows to d2");
- try (ISession session = openSession()) {
- for (int i = 1; i <= 40; i++) {
- if (i <= 30) {
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
- }
- session.executeNonQueryStatement(
- String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
- }
+ // Use multiple concurrent writer threads with individual SQL INSERTs.
+ // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer().
+ // With N threads writing concurrently, aggregate rate should exceed drain rate
+ // and overflow the 4096-capacity queue, creating gaps.
+ final int writerThreads = 4;
+ final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096
+ final int totalRows = writerThreads * rowsPerThread;
+ final AtomicInteger errorCount = new AtomicInteger(0);
+ final CountDownLatch startLatch = new CountDownLatch(1);
+ final CountDownLatch doneLatch = new CountDownLatch(writerThreads);
+
+ System.out.println(
+ " Burst writing "
+ + totalRows
+ + " rows via "
+ + writerThreads
+ + " concurrent threads ("
+ + rowsPerThread
+ + " individual SQL INSERTs each)");
+ System.out.println(
+ " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)");
+
+ ExecutorService executor = Executors.newFixedThreadPool(writerThreads);
+ for (int t = 0; t < writerThreads; t++) {
+ final int threadId = t;
+ final int startTs = threadId * rowsPerThread + 1;
+ executor.submit(
+ () -> {
+ try {
+ startLatch.await(); // all threads start at the same time
+ try (ISession session = openSession()) {
+ for (int i = 0; i < rowsPerThread; i++) {
+ int ts = startTs + i;
+ session.executeNonQueryStatement(
+ String.format(
+ "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)",
+ database, ts, (long) ts * 10));
+ }
+ }
+ } catch (Exception e) {
+ System.out.println(" Writer thread " + threadId + " error: " + e.getMessage());
+ errorCount.incrementAndGet();
+ } finally {
+ doneLatch.countDown();
+ }
+ });
}
- Thread.sleep(2000);
- // Poll all data — should get d1 rows (via topic1) + d2 rows (via topic2)
- System.out.println(" Polling (expecting 30 from d1 + 40 from d2 = 70 total)...");
- PollResult result = pollUntilComplete(consumer, 70, 80);
- System.out.println(" Result: " + result);
+ // Fire all threads simultaneously
+ startLatch.countDown();
+ doneLatch.await();
+ executor.shutdown();
- assertEquals("Expected exactly 70 rows total (30 d1 + 40 d2)", 70, result.totalRows);
- if (!result.rowsPerDevice.isEmpty()) {
- Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
- Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
- assertEquals("Expected 30 rows from d1", 30, d1Rows != null ? d1Rows : 0);
- assertEquals("Expected 40 rows from d2", 40, d2Rows != null ? d2Rows : 0);
- System.out.println(
- " Multi-topic isolation verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows");
+ if (errorCount.get() > 0) {
+ System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors");
}
+
+ // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes
+ System.out.println(
+ " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)...");
+ System.out.println(
+ " (Check server logs for 'gap detected' to confirm gap recovery was triggered)");
+ PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true);
+ System.out.println(" Result: " + result);
+
+ assertEquals(
+ "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)",
+ totalRows,
+ result.totalRows);
} finally {
- // Clean up consumer, both topics, and database
- if (consumer != null) {
- try {
- consumer.unsubscribe(topicName1, topicName2);
- } catch (Exception e) {
- // ignore
- }
- try {
- consumer.close();
- } catch (Exception e) {
- // ignore
- }
- }
- dropTopic(topicName1);
- dropTopic(topicName2);
- deleteDatabase(database);
+ cleanup(consumer, topicName, database);
}
}
- // ============================
- // Test 11: Flush Data Delivery
- // ============================
+ // ======================================================================
+ // Test 8: Commit After Unsubscribe (NEW — tests H7 fix)
+ // ======================================================================
/**
- * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable
- * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps
- * entries available until committed by the subscription consumer.
+ * Tests that commit still works correctly after the consumer has unsubscribed (queue has been
+ * torn down). The commit routing should use metadata-based topic config check instead of runtime
+ * queue state.
+ *
+ *
Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue().
*/
- private static void testFlushDataDelivery() throws Exception {
+ private static void testCommitAfterUnsubscribe() throws Exception {
String database = nextDatabase();
String topicName = nextTopic();
String consumerGroupId = nextConsumerGroup();
@@ -1245,196 +1219,76 @@ private static void testFlushDataDelivery() throws Exception {
consumer.subscribe(topicName);
Thread.sleep(3000);
- // Write 50 rows, then flush before polling
- System.out.println(" Writing 50 rows then flushing");
+ // Write data
+ System.out.println(" Writing 50 rows");
try (ISession session = openSession()) {
for (int i = 1; i <= 50; i++) {
session.executeNonQueryStatement(
String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
}
- System.out.println(" Flushing...");
- session.executeNonQueryStatement("flush");
}
Thread.sleep(2000);
- // Poll — all 50 rows should be delivered despite flush
- System.out.println(" Polling after flush...");
- PollResult result = pollUntilComplete(consumer, 50, 70);
- System.out.println(" Result: " + result);
- assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows);
- } finally {
- cleanup(consumer, topicName, database);
- }
- }
-
- // ============================
- // Test 12: Cross-Partition Aligned Timeseries (Multiple Write Methods)
- // ============================
- /**
- * Tests cross-partition aligned timeseries with 6 data types, written via six different aligned
- * methods. Timestamps are spaced >1 week apart to force different time partitions, exercising the
- * WAL merge path for multi-partition inserts.
- *
- *
- * <p>Write methods (all aligned):
- *
- * <ul>
- *   <li>SQL single row
- *   <li>SQL multi-row (cross-partition)
- *   <li>session.insertAlignedRecord (single row)
- *   <li>session.insertAlignedRecordsOfOneDevice (cross-partition)
- *   <li>session.insertAlignedTablet (cross-partition)
- *   <li>session.insertAlignedTablets (cross-partition)
- * </ul>
- */
- private static void testCrossPartitionAligned() throws Exception {
- String database = nextDatabase();
- String topicName = nextTopic();
- String consumerGroupId = nextConsumerGroup();
- String consumerId = nextConsumerId();
- SubscriptionTreePullConsumer consumer = null;
-
- // Gap slightly over 1 week (default partition interval = 604,800,000ms)
- final long GAP = 604_800_001L;
- final String device = database + ".d_aligned";
-
- try {
- // Create aligned timeseries with 6 data types
- try (ISession session = openSession()) {
- createDatabase(session, database);
- session.executeNonQueryStatement(
- String.format(
- "CREATE ALIGNED TIMESERIES %s.d_aligned"
- + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
- + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
- database));
- // Init row to force DataRegion creation
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
- + " s_double, s_bool, s_text)"
- + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
- database));
- session.executeNonQueryStatement("flush");
+ // Poll WITHOUT commit
+ System.out.println(" Polling WITHOUT commit...");
+ List<SubscriptionMessage> uncommittedMessages = new ArrayList<>();
+ int polledRows = 0;
+ for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) {
+ List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+ if (msgs.isEmpty()) {
+ if (polledRows > 0) break;
+ Thread.sleep(500);
+ continue;
+ }
+ for (SubscriptionMessage msg : msgs) {
+ uncommittedMessages.add(msg);
+ for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+ while (ds.hasNext()) {
+ ds.next();
+ polledRows++;
+ }
+ }
+ }
}
+ System.out.println(
+ " Polled "
+ + polledRows
+ + " rows, holding "
+ + uncommittedMessages.size()
+ + " uncommitted messages");
+ assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows);
+
+ // Unsubscribe (tears down the consensus queue)
+ System.out.println(" Unsubscribing (queue teardown)...");
+ consumer.unsubscribe(topicName);
Thread.sleep(2000);
- createTopic(topicName, database + ".**");
- Thread.sleep(1000);
-
- consumer = createConsumer(consumerId, consumerGroupId);
- consumer.subscribe(topicName);
- Thread.sleep(3000);
-
- // Shared measurement info for Session API calls
- List<String> measurements =
- Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text");
- List<TSDataType> types =
- Arrays.asList(
- TSDataType.INT32,
- TSDataType.INT64,
- TSDataType.FLOAT,
- TSDataType.DOUBLE,
- TSDataType.BOOLEAN,
- TSDataType.TEXT);
-
- // Shared schema for Tablet API calls
- List<MeasurementSchema> schemas = new ArrayList<>();
- schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32));
- schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64));
- schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
- schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
- schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
- schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT));
-
- System.out.println(" Writing cross-partition aligned data via 6 methods");
- int totalExpected = 0;
-
- try (ISession session = openSession()) {
-
- // --- Method 1: SQL single row ---
- long t1 = 1;
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
- + " s_double, s_bool, s_text)"
- + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')",
- database, t1));
- totalExpected += 1;
- System.out.println(" Method 1 (SQL single row): 1 row");
-
- // --- Method 2: SQL multi-row (cross-partition, 2 rows >1 week apart) ---
- long t2a = 1 + GAP;
- long t2b = 1 + 2 * GAP;
- session.executeNonQueryStatement(
- String.format(
- "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
- + " s_double, s_bool, s_text)"
- + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a'),"
- + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')",
- database, t2a, t2b));
- totalExpected += 2;
- System.out.println(" Method 2 (SQL multi-row, cross-partition): 2 rows");
-
- // --- Method 3: insertAlignedRecord (single row) ---
- long t3 = 1 + 3 * GAP;
- List<Object> values3 = Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single");
- session.insertAlignedRecord(device, t3, measurements, types, values3);
- totalExpected += 1;
- System.out.println(" Method 3 (insertAlignedRecord): 1 row");
-
- // --- Method 4: insertAlignedRecordsOfOneDevice (cross-partition, 2 rows) ---
- long t4a = 1 + 4 * GAP;
- long t4b = 1 + 5 * GAP;
- session.insertAlignedRecordsOfOneDevice(
- device,
- Arrays.asList(t4a, t4b),
- Arrays.asList(measurements, measurements),
- Arrays.asList(types, types),
- Arrays.asList(
- Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"),
- Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b")));
- totalExpected += 2;
- System.out.println(
- " Method 4 (insertAlignedRecordsOfOneDevice, cross-partition): 2 rows");
-
- // --- Method 5: insertAlignedTablet (cross-partition, 2 rows) ---
- long t5a = 1 + 6 * GAP;
- long t5b = 1 + 7 * GAP;
- Tablet tablet5 = new Tablet(device, schemas, 2);
- addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a");
- addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b");
- session.insertAlignedTablet(tablet5);
- totalExpected += 2;
- System.out.println(" Method 5 (insertAlignedTablet, cross-partition): 2 rows");
-
- // --- Method 6: insertAlignedTablets (cross-partition, 2 rows) ---
- long t6a = 1 + 8 * GAP;
- long t6b = 1 + 9 * GAP;
- Tablet tablet6 = new Tablet(device, schemas, 2);
- addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a");
- addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b");
- Map<String, Tablet> tabletMap = new HashMap<>();
- tabletMap.put(device, tablet6);
- session.insertAlignedTablets(tabletMap);
- totalExpected += 2;
- System.out.println(" Method 6 (insertAlignedTablets, cross-partition): 2 rows");
+ // Now commit the previously polled messages — should NOT throw
+ System.out.println(
+ " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe...");
+ boolean commitSucceeded = true;
+ for (SubscriptionMessage msg : uncommittedMessages) {
+ try {
+ consumer.commitSync(msg);
+ } catch (Exception e) {
+ System.out.println(" Commit threw exception: " + e.getMessage());
+ commitSucceeded = false;
+ }
}
- System.out.println(" Total expected rows: " + totalExpected);
- Thread.sleep(2000);
-
- System.out.println(" Polling...");
- PollResult result = pollUntilComplete(consumer, totalExpected, 100);
- System.out.println(" Result: " + result);
-
- assertEquals(
- "Expected exactly " + totalExpected + " cross-partition aligned rows",
- totalExpected,
- result.totalRows);
- assertAtLeast(
- "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size());
+ // The commit may silently succeed or fail gracefully — the key is no crash
+ System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded);
+ System.out.println(" (Key: no exception crash, routing handled gracefully)");
} finally {
- cleanup(consumer, topicName, database);
+ if (consumer != null) {
+ try {
+ consumer.close();
+ } catch (Exception e) {
+ /* ignore */
+ }
+ }
+ dropTopic(topicName);
+ deleteDatabase(database);
}
}
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
index c494ae05d01b0..8cb168272b295 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java
@@ -82,6 +82,7 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiConsumer;
+import java.util.function.Consumer;
import java.util.stream.Collectors;
public class IoTConsensus implements IConsensus {
@@ -105,6 +106,12 @@ public class IoTConsensus implements IConsensus {
*/
public static volatile BiConsumer onNewPeerCreated;
+ /**
+ * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by
+ * the subscription system to unbind and clean up prefetching queues before the region is removed.
+ */
+ public static volatile Consumer<ConsensusGroupId> onPeerRemoved;
+
private final IClientManager clientManager;
private final IClientManager syncClientManager;
private final ScheduledExecutorService backgroundTaskService;
@@ -321,6 +328,18 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers)
@Override
public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException {
KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE);
+
+ // Notify subscription system before stopping the peer, so that subscription queues can
+ // properly unregister from the still-alive serverImpl.
+ final Consumer<ConsensusGroupId> removeCallback = onPeerRemoved;
+ if (removeCallback != null) {
+ try {
+ removeCallback.accept(groupId);
+ } catch (final Exception e) {
+ logger.warn("onPeerRemoved callback failed for group {}", groupId, e);
+ }
+ }
+
AtomicBoolean exist = new AtomicBoolean(false);
stateMachineMap.computeIfPresent(
groupId,
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
index bb5d4aa603417..37222c47d35ff 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java
@@ -968,7 +968,7 @@ void checkAndUpdateIndex() {
* If there is only one replica, set it to Long.MAX_VALUE. If there are multiple replicas, get the
* latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner.
*/
- void checkAndUpdateSafeDeletedSearchIndex() {
+ public void checkAndUpdateSafeDeletedSearchIndex() {
if (configuration.isEmpty()) {
logger.error(
"Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time.");
diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
index 374691bf38bf1..51704a24c74a5 100644
--- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
+++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java
@@ -167,15 +167,16 @@ public synchronized OptionalLong getMinFlushedSyncIndex() {
return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min();
}
- public void checkAndFlushIndex() {
+ public synchronized void checkAndFlushIndex() {
if (!threads.isEmpty()) {
threads.forEach(
thread -> {
IndexController controller = thread.getController();
controller.update(controller.getCurrentIndex(), true);
});
- // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica is 1
- reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex());
+ // Use subscription-aware safe-delete to avoid deleting WAL entries
+ // still needed by subscription consumers.
+ impl.checkAndUpdateSafeDeletedSearchIndex();
}
}
@@ -397,8 +398,9 @@ public void updateSafelyDeletedSearchIndex() {
// indicating that insert nodes whose search index are before this value can be deleted
// safely.
//
- // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9.
- reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex());
+ // Use subscription-aware safe-delete to avoid deleting WAL entries
+ // still needed by subscription consumers.
+ impl.checkAndUpdateSafeDeletedSearchIndex();
// notify
if (impl.unblockWrite()) {
impl.signal();
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
index 220ad3e449951..abf9161962bff 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java
@@ -24,6 +24,7 @@
import org.apache.iotdb.db.subscription.broker.SubscriptionBroker;
import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter;
import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager;
+import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler;
import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager;
import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask;
@@ -188,7 +189,8 @@ public List commit(
final List consensusContexts = new ArrayList<>();
for (final SubscriptionCommitContext ctx : commitContexts) {
final String topicName = ctx.getTopicName();
- if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) {
+ if (Objects.nonNull(consensusBroker)
+ && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) {
consensusContexts.add(ctx);
} else {
pipeContexts.add(ctx);
@@ -370,6 +372,20 @@ public void unbindConsensusPrefetchingQueue(
prefetchingQueueCount.invalidate();
}
+ public void unbindByRegion(final String regionId) {
+ int totalClosed = 0;
+ for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) {
+ totalClosed += broker.unbindByRegion(regionId);
+ }
+ if (totalClosed > 0) {
+ prefetchingQueueCount.invalidate();
+ LOGGER.info(
+ "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]",
+ totalClosed,
+ regionId);
+ }
+ }
+
public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) {
final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId);
if (Objects.isNull(pipeBroker)) {
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
index 84d89ef9a8f39..1c567965d911b 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java
@@ -32,6 +32,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -124,12 +125,12 @@ public List poll(
eventsToPoll.add(event);
totalSize += currentSize;
- if (totalSize + currentSize > maxBytes) {
+ if (totalSize >= maxBytes) {
break;
}
}
- if (totalSize > maxBytes) {
+ if (totalSize >= maxBytes) {
break;
}
}
@@ -353,6 +354,30 @@ public void unbindConsensusPrefetchingQueue(final String topicName) {
brokerId);
}
+ public int unbindByRegion(final String regionId) {
+ int closedCount = 0;
+ for (final Map.Entry<String, List<ConsensusPrefetchingQueue>> entry :
+ topicNameToConsensusPrefetchingQueues.entrySet()) {
+ final List<ConsensusPrefetchingQueue> queues = entry.getValue();
+ final Iterator<ConsensusPrefetchingQueue> iterator = queues.iterator();
+ while (iterator.hasNext()) {
+ final ConsensusPrefetchingQueue q = iterator.next();
+ if (regionId.equals(q.getConsensusGroupId())) {
+ q.close();
+ iterator.remove();
+ closedCount++;
+ LOGGER.info(
+ "Subscription: closed consensus prefetching queue for topic [{}] region [{}] "
+ + "in consumer group [{}] due to region removal",
+ entry.getKey(),
+ regionId,
+ brokerId);
+ }
+ }
+ }
+ return closedCount;
+ }
+
@Override
public void removeQueue(final String topicName) {
final List queues =
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
index fbde6cee8c2fe..9d3f2b283c556 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java
@@ -43,6 +43,7 @@
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
@@ -190,37 +191,31 @@ private List convertInsertTabletNode(final InsertTabletNode node) {
return Collections.emptyList();
}
- // Build Tablet with all rows
final int columnCount = matchedColumnIndices.size();
+ final boolean allColumnsMatch = (columnCount == measurements.length);
+
+ // Build schemas (always needed)
+ final List<MeasurementSchema> schemas = new ArrayList<>(columnCount);
for (final int colIdx : matchedColumnIndices) {
schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx]));
}
- final Tablet tablet = new Tablet(deviceId.toString(), schemas, rowCount);
-
- for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) {
- tablet.addTimestamp(rowIdx, times[rowIdx]);
-
- for (int colIdx = 0; colIdx < columnCount; colIdx++) {
- final int originalColIdx = matchedColumnIndices.get(colIdx);
- final boolean isNull =
- (bitMaps != null
- && bitMaps[originalColIdx] != null
- && bitMaps[originalColIdx].isMarked(rowIdx));
-
- if (isNull) {
- if (tablet.getBitMaps() == null) {
- tablet.initBitMaps();
- }
- tablet.getBitMaps()[colIdx].mark(rowIdx);
- } else {
- copyColumnValue(
- tablet, rowIdx, colIdx, dataTypes[originalColIdx], columns[originalColIdx], rowIdx);
- }
+ // Build column arrays and bitmaps using bulk copy
+ final long[] newTimes = Arrays.copyOf(times, rowCount);
+ final Object[] newColumns = new Object[columnCount];
+ final BitMap[] newBitMaps = new BitMap[columnCount];
+
+ for (int i = 0; i < columnCount; i++) {
+ final int originalColIdx = allColumnsMatch ? i : matchedColumnIndices.get(i);
+ newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount);
+ if (bitMaps != null && bitMaps[originalColIdx] != null) {
+ newBitMaps[i] = new BitMap(rowCount);
+ BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount);
}
}
- tablet.setRowSize(rowCount);
+
+ final Tablet tablet =
+ new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount);
return Collections.singletonList(tablet);
}
@@ -327,26 +322,27 @@ private List convertRelationalInsertTabletNode(final RelationalInsertTab
schemas.add(new MeasurementSchema(measurements[i], dataTypes[i]));
}
- final Tablet tablet = new Tablet(tableName != null ? tableName : "", schemas, rowCount);
-
- for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) {
- tablet.addTimestamp(rowIdx, times[rowIdx]);
+ // Build column arrays and bitmaps using bulk copy
+ final long[] newTimes = Arrays.copyOf(times, rowCount);
+ final Object[] newColumns = new Object[columnCount];
+ final BitMap[] newBitMaps = new BitMap[columnCount];
- for (int colIdx = 0; colIdx < columnCount; colIdx++) {
- final boolean isNull =
- (bitMaps != null && bitMaps[colIdx] != null && bitMaps[colIdx].isMarked(rowIdx));
-
- if (isNull) {
- if (tablet.getBitMaps() == null) {
- tablet.initBitMaps();
- }
- tablet.getBitMaps()[colIdx].mark(rowIdx);
- } else {
- copyColumnValue(tablet, rowIdx, colIdx, dataTypes[colIdx], columns[colIdx], rowIdx);
- }
+ for (int colIdx = 0; colIdx < columnCount; colIdx++) {
+ newColumns[colIdx] = copyColumnArray(dataTypes[colIdx], columns[colIdx], rowCount);
+ if (bitMaps != null && bitMaps[colIdx] != null) {
+ newBitMaps[colIdx] = new BitMap(rowCount);
+ BitMap.copyOfRange(bitMaps[colIdx], 0, newBitMaps[colIdx], 0, rowCount);
}
}
- tablet.setRowSize(rowCount);
+
+ final Tablet tablet =
+ new Tablet(
+ tableName != null ? tableName : "",
+ schemas,
+ newTimes,
+ newColumns,
+ newBitMaps,
+ rowCount);
return Collections.singletonList(tablet);
}
@@ -387,6 +383,65 @@ private List getMatchedTreeColumnIndices(
return matchedIndices;
}
+ /**
+ * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type
+ * containing the first {@code rowCount} elements.
+ */
+ private Object copyColumnArray(
+ final TSDataType dataType, final Object sourceColumn, final int rowCount) {
+ switch (dataType) {
+ case BOOLEAN:
+ {
+ final boolean[] src = (boolean[]) sourceColumn;
+ final boolean[] dst = new boolean[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case INT32:
+ case DATE:
+ {
+ final int[] src = (int[]) sourceColumn;
+ final int[] dst = new int[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case INT64:
+ case TIMESTAMP:
+ {
+ final long[] src = (long[]) sourceColumn;
+ final long[] dst = new long[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case FLOAT:
+ {
+ final float[] src = (float[]) sourceColumn;
+ final float[] dst = new float[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case DOUBLE:
+ {
+ final double[] src = (double[]) sourceColumn;
+ final double[] dst = new double[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ case TEXT:
+ case BLOB:
+ case STRING:
+ {
+ final Binary[] src = (Binary[]) sourceColumn;
+ final Binary[] dst = new Binary[rowCount];
+ System.arraycopy(src, 0, dst, 0, rowCount);
+ return dst;
+ }
+ default:
+ LOGGER.warn("Unsupported data type for bulk copy: {}", dataType);
+ return sourceColumn;
+ }
+ }
+
/**
* Adds a single value to the tablet at the specified position.
*
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
index 28743d1aae73c..8b5c2cf25a8e5 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java
@@ -32,6 +32,7 @@
import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode;
import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode;
import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry;
+import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode;
import org.apache.iotdb.db.subscription.event.SubscriptionEvent;
import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload;
import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext;
@@ -154,6 +155,11 @@ public class ConsensusPrefetchingQueue {
private static final int MAX_PREFETCHING_QUEUE_SIZE = 256;
+ private static final long WAL_RETENTION_WARN_THRESHOLD = 100_000;
+
+ /** Counter of WAL gap entries that could not be filled (data loss). */
+ private final AtomicLong walGapSkippedEntries = new AtomicLong(0);
+
private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
private volatile boolean isClosed = false;
@@ -215,12 +221,27 @@ public ConsensusPrefetchingQueue(
/**
* Returns the earliest outstanding (uncommitted) search index for WAL pinning. If there are no
* outstanding events, returns the next expected search index (nothing to pin beyond what we've
- * already processed).
+ * already processed). Also monitors WAL retention gap for slow consumer detection.
*/
private long getEarliestOutstandingSearchIndex() {
final Map.Entry first = outstandingCommitIdToStartIndex.firstEntry();
if (first != null) {
- return first.getValue();
+ final long earliestIndex = first.getValue();
+ // WAL retention health check: warn if outstanding gap grows too large
+ final long currentIndex = nextExpectedSearchIndex.get();
+ final long retentionGap = currentIndex - earliestIndex;
+ if (retentionGap > WAL_RETENTION_WARN_THRESHOLD) {
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: WAL retention gap is {} entries "
+ + "(earliest outstanding={}, current={}). "
+ + "A slow or stalled consumer is pinning WAL files and may cause disk exhaustion. "
+ + "Consider committing events or increasing consumer throughput.",
+ this,
+ retentionGap,
+ earliestIndex,
+ currentIndex);
+ }
+ return earliestIndex;
}
return nextExpectedSearchIndex.get();
}
@@ -429,11 +450,11 @@ private void prefetchLoop() {
t.getClass().getName(),
t.getMessage(),
t);
- if (t instanceof Error) {
+ if (t instanceof VirtualMachineError) {
LOGGER.error(
- "ConsensusPrefetchingQueue {}: caught Error in prefetch loop, "
- + "will attempt to continue",
- this);
+ "ConsensusPrefetchingQueue {}: caught VirtualMachineError, stopping thread", this);
+ markClosed();
+ break;
}
try {
Thread.sleep(100);
@@ -478,7 +499,24 @@ private void processBatchFromPending(final List batch)
expected,
searchIndex,
searchIndex - expected);
- fillGapFromWAL(expected, searchIndex, batchedTablets);
+ final long gapMaxIndex = fillGapFromWAL(expected, searchIndex, batchedTablets);
+ if (gapMaxIndex > batchEndSearchIndex) {
+ batchEndSearchIndex = gapMaxIndex;
+ }
+
+ // If gap was not fully filled (e.g., WAL timeout), do NOT skip the gap.
+ // Break and defer remaining entries to the next prefetch loop iteration.
+ // WAL pin ensures the missing entries won't be deleted.
+ if (nextExpectedSearchIndex.get() < searchIndex) {
+ LOGGER.warn(
+ "ConsensusPrefetchingQueue {}: gap [{}, {}) not fully filled (reached {}). "
+ + "Deferring remaining batch to next prefetch iteration.",
+ this,
+ expected,
+ searchIndex,
+ nextExpectedSearchIndex.get());
+ break;
+ }
}
if (searchIndex < nextExpectedSearchIndex.get()) {
@@ -555,11 +593,14 @@ private void processBatchFromPending(final List batch)
/**
* Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected
* between nextExpectedSearchIndex and an incoming entry's searchIndex.
+ *
+ * @return the maximum searchIndex processed during gap filling, or -1 if no entries processed
*/
- private void fillGapFromWAL(
+ private long fillGapFromWAL(
final long fromIndex, final long toIndex, final List<Tablet> batchedTablets) {
// Re-position WAL reader to the gap start
reqIterator = consensusReqReader.getReqIterator(fromIndex);
+ long maxProcessedIndex = -1;
while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) {
try {
@@ -575,6 +616,9 @@ private void fillGapFromWAL(
batchedTablets.addAll(tablets);
}
nextExpectedSearchIndex.set(walIndex + 1);
+ if (walIndex > maxProcessedIndex) {
+ maxProcessedIndex = walIndex;
+ }
} catch (final Exception e) {
LOGGER.warn(
"ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}",
@@ -601,6 +645,9 @@ private void fillGapFromWAL(
batchedTablets.addAll(tablets);
}
nextExpectedSearchIndex.set(walIndex + 1);
+ if (walIndex > maxProcessedIndex) {
+ maxProcessedIndex = walIndex;
+ }
}
} catch (final InterruptedException e) {
Thread.currentThread().interrupt();
@@ -612,6 +659,24 @@ private void fillGapFromWAL(
toIndex);
}
}
+
+ // If the gap still cannot be fully filled (WAL truncated/deleted), skip ahead to avoid
+ // blocking consumption indefinitely. This results in data loss for the skipped range.
+ if (nextExpectedSearchIndex.get() < toIndex) {
+ final long skipped = toIndex - nextExpectedSearchIndex.get();
+ walGapSkippedEntries.addAndGet(skipped);
+ LOGGER.error(
+ "ConsensusPrefetchingQueue {}: WAL gap [{}, {}) cannot be filled - {} entries lost. "
+ + "Total skipped entries so far: {}. This indicates WAL truncation or deletion.",
+ this,
+ nextExpectedSearchIndex.get(),
+ toIndex,
+ skipped,
+ walGapSkippedEntries.get());
+ nextExpectedSearchIndex.set(toIndex);
+ }
+
+ return maxProcessedIndex;
}
/**
@@ -623,8 +688,24 @@ private void tryCatchUpFromWAL() {
syncReqIteratorPosition();
if (!reqIterator.hasNext()) {
- // No data on disk either - nothing to do
- return;
+ // The WAL iterator excludes the current-writing WAL file for concurrency safety.
+ // If entries exist in WAL but are all in the current file (e.g., after pending queue
+ // overflow), we need to trigger a WAL file roll to make them readable.
+ final long currentWALIndex = consensusReqReader.getCurrentSearchIndex();
+ if (nextExpectedSearchIndex.get() <= currentWALIndex
+ && consensusReqReader instanceof WALNode) {
+ LOGGER.info(
+ "ConsensusPrefetchingQueue {}: subscription behind (at {} vs WAL {}), "
+ + "triggering WAL file roll to make entries readable",
+ this,
+ nextExpectedSearchIndex.get(),
+ currentWALIndex);
+ ((WALNode) consensusReqReader).rollWALFile();
+ syncReqIteratorPosition();
+ }
+ if (!reqIterator.hasNext()) {
+ return;
+ }
}
final List<Tablet> batchedTablets = new ArrayList<>();
@@ -1063,6 +1144,8 @@ public void cleanUp() {
inFlightEvents.values().forEach(event -> event.cleanUp(true));
inFlightEvents.clear();
+
+ outstandingCommitIdToStartIndex.clear();
} finally {
releaseWriteLock();
}
@@ -1077,11 +1160,19 @@ public void close() {
} catch (final InterruptedException e) {
Thread.currentThread().interrupt();
}
- // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL).
- serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier);
- cleanUp();
- // Persist progress before closing
- commitManager.persistAll();
+ try {
+ // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL).
+ serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier);
+ } catch (final Exception e) {
+ LOGGER.warn("ConsensusPrefetchingQueue {}: error during unregister", this, e);
+ } finally {
+ try {
+ cleanUp();
+ } finally {
+ // Persist progress before closing
+ commitManager.persistAll();
+ }
+ }
}
private SubscriptionEvent generateErrorResponse(final String errorMessage) {
@@ -1168,6 +1259,7 @@ public Map<String, String> coreReportMessage() {
result.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size()));
result.put("pendingEntriesSize", String.valueOf(pendingEntries.size()));
result.put("commitIdGenerator", commitIdGenerator.toString());
+ result.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get()));
result.put("isClosed", String.valueOf(isClosed));
return result;
}
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
index 4096394ad6a33..91883c94b1e11 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java
@@ -203,7 +203,7 @@ public void removeState(
* @param topicName the topic name
*/
public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) {
- final String prefix = consumerGroupId + "_" + topicName + "_";
+ final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR;
final Iterator> it =
commitStates.entrySet().iterator();
while (it.hasNext()) {
@@ -228,9 +228,13 @@ public void persistAll() {
// ======================== Helper Methods ========================
+ // Use a separator that cannot appear in consumerGroupId, topicName, or regionId
+ // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c").
+ private static final String KEY_SEPARATOR = "##";
+
private String generateKey(
final String consumerGroupId, final String topicName, final String regionId) {
- return consumerGroupId + "_" + topicName + "_" + regionId;
+ return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId;
}
private File getProgressFile(final String key) {
@@ -329,8 +333,8 @@ public long getCommittedSearchIndex() {
private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000;
public void recordMapping(final long commitId, final long searchIndex) {
- commitIdToSearchIndex.put(commitId, searchIndex);
synchronized (this) {
+ commitIdToSearchIndex.put(commitId, searchIndex);
outstandingSearchIndices.add(searchIndex);
final int size = outstandingSearchIndices.size();
if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) {
@@ -358,16 +362,21 @@ public void recordMapping(final long commitId, final long searchIndex) {
* @return true if successfully committed
*/
public boolean commit(final long commitId) {
- final Long searchIndex = commitIdToSearchIndex.remove(commitId);
- if (searchIndex == null) {
- LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId);
- return false;
- }
-
progress.incrementCommitIndex();
- // Advance committed search index contiguously (gap-aware)
+ // Advance committed search index contiguously (gap-aware).
+ // Both removals — from commitIdToSearchIndex and from outstandingSearchIndices —
+ // must happen inside the same synchronized block to prevent a race with recordMapping():
+ // recordMapping: put(commitId, si) -> add(si)
+ // commit: remove(commitId) -> remove(si)
+ // Without atomicity, commit could remove from map between put and add,
+ // leaving si permanently in outstandingSearchIndices (WAL leak).
synchronized (this) {
+ final Long searchIndex = commitIdToSearchIndex.remove(commitId);
+ if (searchIndex == null) {
+ LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId);
+ return false;
+ }
outstandingSearchIndices.remove(searchIndex);
if (searchIndex > maxCommittedSearchIndex) {
maxCommittedSearchIndex = searchIndex;
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
index b138dbceef1a2..a36b9e29fe7ed 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java
@@ -61,16 +61,20 @@ private ConsensusSubscriptionSetupHandler() {
}
/**
- * Ensures that the IoTConsensus new-peer callback is set, so that when a new DataRegion is
- * created, all active consensus subscriptions are automatically bound to the new region.
+ * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new
+ * DataRegion is created, all active consensus subscriptions are automatically bound to the new
+ * region, and when a DataRegion is removed, all subscription queues are properly cleaned up.
*/
public static void ensureNewRegionListenerRegistered() {
- if (IoTConsensus.onNewPeerCreated != null) {
- return;
+ if (IoTConsensus.onNewPeerCreated == null) {
+ IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated;
+ LOGGER.info(
+ "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding");
+ }
+ if (IoTConsensus.onPeerRemoved == null) {
+ IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved;
+ LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup");
}
- IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated;
- LOGGER.info(
- "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding");
}
/**
@@ -93,14 +97,13 @@ private static void onNewRegionCreated(
final ConsensusSubscriptionCommitManager commitManager =
ConsensusSubscriptionCommitManager.getInstance();
- final long startSearchIndex = serverImpl.getSearchIndex() + 1;
LOGGER.info(
"New DataRegion {} created, checking {} consumer group(s) for auto-binding, "
- + "startSearchIndex={}",
+ + "currentSearchIndex={}",
groupId,
allSubscriptions.size(),
- startSearchIndex);
+ serverImpl.getSearchIndex());
for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) {
final String consumerGroupId = groupEntry.getKey();
@@ -141,12 +144,22 @@ private static void onNewRegionCreated(
final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
+ // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail
+ // for brand-new regions that have no prior subscription progress.
+ final long persistedIndex =
+ commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString());
+ final long startSearchIndex =
+ (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1;
+
LOGGER.info(
- "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} (database={})",
+ "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} "
+ + "(database={}, startSearchIndex={}, persistedIndex={})",
topicName,
consumerGroupId,
groupId,
- dbTableModel);
+ dbTableModel,
+ startSearchIndex,
+ persistedIndex);
SubscriptionAgent.broker()
.bindConsensusPrefetchingQueue(
@@ -169,6 +182,26 @@ private static void onNewRegionCreated(
}
}
+ /**
+ * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and
+ * cleans up all subscription prefetching queues associated with the removed region across all
+ * consumer groups.
+ */
+ private static void onRegionRemoved(final ConsensusGroupId groupId) {
+ if (!(groupId instanceof DataRegionId)) {
+ return;
+ }
+ final String regionIdStr = groupId.toString();
+ LOGGER.info(
+ "DataRegion {} being removed, unbinding all consensus subscription queues", regionIdStr);
+ try {
+ SubscriptionAgent.broker().unbindByRegion(regionIdStr);
+ } catch (final Exception e) {
+ LOGGER.error(
+ "Failed to unbind consensus subscription queues for removed region {}", regionIdStr, e);
+ }
+ }
+
public static boolean isConsensusBasedTopic(final String topicName) {
try {
final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName);
@@ -316,16 +349,23 @@ private static void setupConsensusQueueForTopic(
final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null;
final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName);
- final long startSearchIndex = serverImpl.getSearchIndex() + 1;
+ // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail
+ // for brand-new regions that have no prior subscription progress.
+ final long persistedIndex =
+ commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString());
+ final long startSearchIndex =
+ (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1;
LOGGER.info(
"Binding consensus prefetching queue for topic [{}] in consumer group [{}] "
- + "to data region consensus group [{}] (database={}), startSearchIndex={}",
+ + "to data region consensus group [{}] (database={}, startSearchIndex={}, "
+ + "persistedIndex={})",
topicName,
consumerGroupId,
groupId,
dbTableModel,
- startSearchIndex);
+ startSearchIndex,
+ persistedIndex);
SubscriptionAgent.broker()
.bindConsensusPrefetchingQueue(
diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
index 0bd526e8dbaa0..9e45f8a160127 100644
--- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
+++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java
@@ -25,6 +25,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Objects;
+import java.util.concurrent.atomic.AtomicLong;
/**
* Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region)
@@ -42,42 +43,42 @@
*/
public class SubscriptionConsensusProgress {
- private long searchIndex;
+ private final AtomicLong searchIndex;
- private long commitIndex;
+ private final AtomicLong commitIndex;
public SubscriptionConsensusProgress() {
this(0L, 0L);
}
public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) {
- this.searchIndex = searchIndex;
- this.commitIndex = commitIndex;
+ this.searchIndex = new AtomicLong(searchIndex);
+ this.commitIndex = new AtomicLong(commitIndex);
}
public long getSearchIndex() {
- return searchIndex;
+ return searchIndex.get();
}
public void setSearchIndex(final long searchIndex) {
- this.searchIndex = searchIndex;
+ this.searchIndex.set(searchIndex);
}
public long getCommitIndex() {
- return commitIndex;
+ return commitIndex.get();
}
public void setCommitIndex(final long commitIndex) {
- this.commitIndex = commitIndex;
+ this.commitIndex.set(commitIndex);
}
public void incrementCommitIndex() {
- this.commitIndex++;
+ this.commitIndex.incrementAndGet();
}
public void serialize(final DataOutputStream stream) throws IOException {
- ReadWriteIOUtils.write(searchIndex, stream);
- ReadWriteIOUtils.write(commitIndex, stream);
+ ReadWriteIOUtils.write(searchIndex.get(), stream);
+ ReadWriteIOUtils.write(commitIndex.get(), stream);
}
public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) {
@@ -95,21 +96,22 @@ public boolean equals(final Object o) {
return false;
}
final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o;
- return searchIndex == that.searchIndex && commitIndex == that.commitIndex;
+ return searchIndex.get() == that.searchIndex.get()
+ && commitIndex.get() == that.commitIndex.get();
}
@Override
public int hashCode() {
- return Objects.hash(searchIndex, commitIndex);
+ return Objects.hash(searchIndex.get(), commitIndex.get());
}
@Override
public String toString() {
return "SubscriptionConsensusProgress{"
+ "searchIndex="
- + searchIndex
+ + searchIndex.get()
+ ", commitIndex="
- + commitIndex
+ + commitIndex.get()
+ '}';
}
}