From 6252d137e2c66fcf8447bbd62393ebd7e387f43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E5=AD=90=E5=9D=A4?= <55695098+DanielWang2035@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:08:13 +0800 Subject: [PATCH 1/2] Subscription: implement IoTConsensus-based subscription --- example/session/pom.xml | 13 + .../iotdb/ConsensusSubscriptionTableTest.java | 1516 +++++++++++++++++ .../iotdb/ConsensusSubscriptionTest.java | 1460 ++++++++++++++++ .../CreateSubscriptionProcedure.java | 91 +- .../DropSubscriptionProcedure.java | 27 + .../iotdb/consensus/iot/IoTConsensus.java | 17 + .../consensus/iot/IoTConsensusServerImpl.java | 116 +- .../agent/SubscriptionBrokerAgent.java | 337 +++- .../agent/SubscriptionConsumerAgent.java | 44 + .../broker/ConsensusSubscriptionBroker.java | 368 ++++ .../broker/ISubscriptionBroker.java | 51 + .../broker/SubscriptionBroker.java | 34 +- .../ConsensusLogToTabletConverter.java | 487 ++++++ .../consensus/ConsensusPrefetchingQueue.java | 1179 +++++++++++++ .../ConsensusSubscriptionCommitManager.java | 416 +++++ .../ConsensusSubscriptionSetupHandler.java | 422 +++++ .../SubscriptionConsensusProgress.java | 115 ++ .../subscription/event/SubscriptionEvent.java | 5 + .../config/SubscriptionConfig.java | 2 +- .../meta/consumer/ConsumerGroupMeta.java | 25 + 20 files changed, 6637 insertions(+), 88 deletions(-) create mode 100644 example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java create mode 100644 example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java create mode 100644 
iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java diff --git a/example/session/pom.xml b/example/session/pom.xml index e707c5b25d1ce..331fbf0c46df8 100644 --- a/example/session/pom.xml +++ b/example/session/pom.xml @@ -40,4 +40,17 @@ ${project.version} + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 11 + 11 + + + + diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java new file mode 100644 index 0000000000000..6c1da0199f663 --- /dev/null +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java @@ -0,0 +1,1516 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ITableSession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.session.TableSessionBuilder; +import org.apache.iotdb.session.subscription.ISubscriptionTableSession; +import org.apache.iotdb.session.subscription.SubscriptionTableSessionBuilder; +import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumerBuilder; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +/** TODO: Move these manual tests into ITs */ +public class ConsensusSubscriptionTableTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + System.out.println("=== Consensus-Based Subscription Table Model Test Suite ===\n"); + + String targetTest = args.length > 0 ? 
args[0] : null; + + if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) { + runTest("testBasicDataDelivery", ConsensusSubscriptionTableTest::testBasicDataDelivery); + } + if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) { + runTest("testMultipleDataTypes", ConsensusSubscriptionTableTest::testMultipleDataTypes); + } + if (targetTest == null || "testTableLevelFiltering".equals(targetTest)) { + runTest("testTableLevelFiltering", ConsensusSubscriptionTableTest::testTableLevelFiltering); + } + if (targetTest == null || "testDatabaseLevelFiltering".equals(targetTest)) { + runTest( + "testDatabaseLevelFiltering", ConsensusSubscriptionTableTest::testDatabaseLevelFiltering); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest( + "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultipleTablesAggregation".equals(targetTest)) { + runTest( + "testMultipleTablesAggregation", + ConsensusSubscriptionTableTest::testMultipleTablesAggregation); + } + if (targetTest == null || "testMultiColumnTypes".equals(targetTest)) { + runTest("testMultiColumnTypes", ConsensusSubscriptionTableTest::testMultiColumnTypes); + } + if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) { + runTest("testPollWithoutCommit", ConsensusSubscriptionTableTest::testPollWithoutCommit); + } + if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) { + runTest( + "testMultiConsumerGroupIndependent", + ConsensusSubscriptionTableTest::testMultiConsumerGroupIndependent); + } + if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) { + runTest( + "testMultiTopicSubscription", ConsensusSubscriptionTableTest::testMultiTopicSubscription); + } + if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) { + runTest("testFlushDataDelivery", ConsensusSubscriptionTableTest::testFlushDataDelivery); + 
} + if (targetTest == null || "testCrossPartitionMultiWrite".equals(targetTest)) { + runTest( + "testCrossPartitionMultiWrite", + ConsensusSubscriptionTableTest::testCrossPartitionMultiWrite); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + // ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "csub_tbl_" + testCounter; + } + + private static String nextTopic() { + return "topic_tbl_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_tbl_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_tbl_" + testCounter; + } + + private static ITableSession openTableSession() throws Exception { + return new TableSessionBuilder() + .nodeUrls(Collections.singletonList(HOST + ":" + PORT)) + .username(USER) + .password(PASSWORD) + .build(); + } + + private static void 
createDatabaseAndTable( + ITableSession session, String database, String tableName, String tableSchema) + throws Exception { + session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement(String.format("CREATE TABLE %s (%s)", tableName, tableSchema)); + } + + private static void deleteDatabase(String database) { + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("DROP DATABASE IF EXISTS " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopicTable(String topicName) { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + .host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopicTable(String topicName, String dbKey, String tableKey) + throws Exception { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + .host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + try { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put( + TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + topicConfig.put(TopicConstant.DATABASE_KEY, dbKey); + topicConfig.put(TopicConstant.TABLE_KEY, tableKey); + subSession.createTopic(topicName, topicConfig); + System.out.println( + " Created topic: " + topicName + " (database=" + dbKey + ", table=" + tableKey + ")"); + } + } + + private static ISubscriptionTablePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + ISubscriptionTablePullConsumer consumer = + new 
SubscriptionTablePullConsumerBuilder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .build(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive + * empty rounds to verify no extra data arrives. + */ + private static PollResult pollUntilComplete( + ISubscriptionTablePullConsumer consumer, int expectedRows, int maxPollAttempts) { + return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true); + } + + /** + * Poll until we accumulate the expected number of rows, then verify no extra data arrives. + * + *

After reaching expectedRows, continues polling until 5 consecutive empty polls confirm + * quiescence. Any extra rows polled are included in the count (will break assertEquals). + * + * @param commitMessages if false, messages are NOT committed + */ + private static PollResult pollUntilComplete( + ISubscriptionTablePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) { + } + continue; + } + + consecutiveEmpty = 0; + + for (SubscriptionMessage message : messages) { + for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) { + String tableName = dataSet.getTableName(); + String databaseName = dataSet.getDatabaseName(); + List columnNames = dataSet.getColumnNames(); + + while (dataSet.hasNext()) { + org.apache.tsfile.read.common.RowRecord record = dataSet.next(); + 
result.totalRows++; + if (tableName != null) { + result.rowsPerTable.merge(tableName, 1, Integer::sum); + } + if (databaseName != null) { + result.rowsPerDatabase.merge(databaseName, 1, Integer::sum); + } + for (int i = 0; i < columnNames.size(); i++) { + result.seenColumns.add(columnNames.get(i)); + } + if (result.totalRows <= 5) { + System.out.println( + " Row: time=" + + record.getTimestamp() + + ", values=" + + record.getFields() + + ", table=" + + tableName + + ", database=" + + databaseName); + } + } + } + if (commitMessages) { + consumer.commitSync(message); + } + } + + System.out.println( + " Poll attempt " + + attempt + + ": totalRows=" + + result.totalRows + + " / expected=" + + expectedRows); + + // Stop immediately if we exceeded the expected row count + if (expectedRows > 0 && result.totalRows > expectedRows) { + System.out.println( + " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows); + break; + } + } + + return result; + } + + // ============================ + // Cleanup + // ============================ + + /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + + /** Clean up with multiple databases. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String... 
databases) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + for (String db : databases) { + deleteDatabase(db); + } + } + + // ============================ + // Result & Assertions + // ============================ + + static class PollResult { + int totalRows = 0; + Map rowsPerTable = new HashMap<>(); + Map rowsPerDatabase = new HashMap<>(); + Set seenColumns = new HashSet<>(); + + @Override + public String toString() { + return "PollResult{totalRows=" + + totalRows + + ", rowsPerTable=" + + rowsPerTable + + ", rowsPerDatabase=" + + rowsPerDatabase + + ", seenColumns=" + + seenColumns + + "}"; + } + } + + private static void assertEquals(String msg, int expected, int actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertTrue(String msg, boolean condition) { + if (!condition) { + throw new AssertionError(msg); + } + } + + private static void assertAtLeast(String msg, int min, int actual) { + if (actual < min) { + throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual); + } + } + + // ============================ + // Test 1: Basic Data Delivery + // ============================ + /** + * Verifies the basic consensus subscription flow with table model: write before subscribe (not + * received), write after subscribe (received), and no extra data beyond expectation. 
+ */ + private static void testBasicDataDelivery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + // Step 1: Write initial data to create DataRegion + System.out.println(" Step 1: Writing initial data (should NOT be received)"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable( + session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD, s2 DOUBLE FIELD"); + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 50; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)", + i * 10, i * 1.5, i)); + } + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write new data AFTER subscription + System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 100; i < 200; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)", + i * 10, i * 1.5, i)); + } + } + Thread.sleep(2000); + + // Step 4: Poll and verify exact count + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // 
============================ + // Test 2: Multiple Data Types + // ============================ + /** + * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using + * separate INSERT statements per type (one field per INSERT), and verifies all types are + * delivered. + */ + private static void testMultipleDataTypes() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable( + session, + database, + "t1", + "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " + + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " + + "s_text TEXT FIELD"); + session.executeNonQueryStatement("USE " + database); + // Write initial row to create DataRegion + session.executeNonQueryStatement( + "INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing data with 6 data types x 20 rows each"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int64, time) VALUES ('d1', %d, %d)", + (long) i * 100000L, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_float, time) VALUES ('d1', %f, %d)", i * 1.1f, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO 
t1 (tag1, s_double, time) VALUES ('d1', %f, %d)", i * 2.2, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_bool, time) VALUES ('d1', %s, %d)", + i % 2 == 0 ? "true" : "false", i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling..."); + PollResult result = pollUntilComplete(consumer, 120, 120); + System.out.println(" Result: " + result); + + assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows); + System.out.println(" Seen columns: " + result.seenColumns); + assertTrue( + "Expected multiple column types in result, got: " + result.seenColumns, + result.seenColumns.size() > 1); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 3: Table-Level Filtering + // ============================ + /** + * Creates a topic that only matches table "t1" via TABLE_KEY. Verifies that data written to t2 is + * NOT delivered. 
+ */ + private static void testTableLevelFiltering() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic matches only table t1 + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to both t1 and t2 (topic filter: t1 only)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only t1 data)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows from t1 only", 50, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + Integer t2Rows = result.rowsPerTable.get("t2"); + assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0); + Integer t1Rows = 
result.rowsPerTable.get("t1"); + assertAtLeast("Expected t1 rows", 1, t1Rows != null ? t1Rows : 0); + System.out.println( + " Table filtering verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows"); + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 4: Database-Level Filtering + // ============================ + /** + * Creates a topic that only matches database db1 via DATABASE_KEY. Verifies that data written to + * db2 is NOT delivered. + */ + private static void testDatabaseLevelFiltering() throws Exception { + String database1 = nextDatabase(); + String database2 = database1 + "_other"; + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database1); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("USE " + database2); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic matches only database1 + createTopicTable(topicName, database1, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println( + " Writing to both " + + database1 + + " and " + + database2 + + " (topic filter: " + + database1 + + " only)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database1); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + 
String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement("USE " + database2); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only " + database1 + " data)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows from " + database1 + " only", 50, result.totalRows); + if (!result.rowsPerDatabase.isEmpty()) { + Integer db2Rows = result.rowsPerDatabase.get(database2); + assertTrue( + "Expected NO rows from " + database2 + ", but got " + db2Rows, + db2Rows == null || db2Rows == 0); + Integer db1Rows = result.rowsPerDatabase.get(database1); + assertAtLeast("Expected " + database1 + " rows", 1, db1Rows != null ? db1Rows : 0); + System.out.println( + " Database filtering verified: " + + database1 + + "=" + + db1Rows + + " rows, " + + database2 + + "=" + + db2Rows + + " rows"); + } + } finally { + cleanup(consumer, topicName, database1, database2); + } + } + + // ============================ + // Test 5: Subscribe Before Region Creation + // ============================ + /** + * Subscribe BEFORE the database/region exists, then create database and write. Tests the + * IoTConsensus.onNewPeerCreated auto-binding path with table model. 
+ */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database, table and writing data (100 rows)"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling (auto-binding should have picked up new region)..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. 
Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 6: Multiple Tables Aggregation + // ============================ + /** Writes to t1, t2, t3 and verifies all are received via a broad topic TABLE_KEY. */ + private static void testMultipleTablesAggregation() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to 3 tables (t1, t2, t3), 30 rows each"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 100; i < 130; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + 
String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting 90 total from 3 tables)..."); + PollResult result = pollUntilComplete(consumer, 90, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 90 rows total (30 per table)", 90, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + System.out.println(" Rows per table: " + result.rowsPerTable); + for (String tbl : new String[] {"t1", "t2", "t3"}) { + Integer tblRows = result.rowsPerTable.get(tbl); + assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0); + } + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 7: Multi Column Types (Table Model Equivalent of Aligned Timeseries) + // ============================ + /** + * Creates a table with 6 different FIELD types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and + * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are + * delivered correctly. This is the table model equivalent of the aligned timeseries test. 
+ */ + private static void testMultiColumnTypes() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + // Create table with multiple field types + try (ITableSession session = openTableSession()) { + createDatabaseAndTable( + session, + database, + "t1", + "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " + + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " + + "s_text TEXT FIELD"); + session.executeNonQueryStatement("USE " + database); + // Write initial row to force DataRegion creation + session.executeNonQueryStatement( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows, each with all 6 data types in a single INSERT + System.out.println(" Writing 50 rows with 6 data types per row"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)" + + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)", + i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? 
"true" : "false", i, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling..."); + PollResult result = pollUntilComplete(consumer, 50, 70); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows with all field types", 50, result.totalRows); + // Verify we see columns for multiple data types + System.out.println(" Seen columns: " + result.seenColumns); + assertAtLeast( + "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 8: Poll Without Commit (Re-delivery) + // ============================ + /** + * Tests at-least-once delivery with a mixed commit/no-commit pattern. + * + *

Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we + * track committed ROWS (not events). The state machine alternates: + * + *

+ * + *

This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal + * commit path in an interleaved fashion. + */ + private static void testPollWithoutCommit() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows + final int totalRows = 50; + System.out.println(" Writing " + totalRows + " rows"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= totalRows; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(3000); + + // State machine: alternate between skip-commit and direct-commit. 
+ int totalRowsCommitted = 0; + int roundNumber = 0; + boolean hasPending = false; + List pendingTimestamps = new ArrayList<>(); + Set allCommittedTimestamps = new HashSet<>(); + int redeliveryCount = 0; + + for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(5000)); + if (msgs.isEmpty()) { + Thread.sleep(1000); + continue; + } + + for (SubscriptionMessage msg : msgs) { + // Extract ALL timestamps from this event + List currentTimestamps = new ArrayList<>(); + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + currentTimestamps.add(ds.next().getTimestamp()); + } + } + assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0); + + if (hasPending) { + // === Re-delivery round: verify EXACT same timestamps === + assertTrue( + "Re-delivery timestamp list mismatch: expected=" + + pendingTimestamps + + ", actual=" + + currentTimestamps, + currentTimestamps.equals(pendingTimestamps)); + consumer.commitSync(msg); + totalRowsCommitted += currentTimestamps.size(); + allCommittedTimestamps.addAll(currentTimestamps); + hasPending = false; + redeliveryCount++; + roundNumber++; + System.out.println( + " [rows=" + + totalRowsCommitted + + "/" + + totalRows + + "] Re-delivered & committed: timestamps=" + + currentTimestamps); + } else { + // === New event round === + if (totalRowsCommitted > 0) { + boolean overlap = false; + for (Long ts : currentTimestamps) { + if (allCommittedTimestamps.contains(ts)) { + overlap = true; + break; + } + } + assertTrue( + "After commit, should receive different data (timestamps=" + + currentTimestamps + + " overlap with committed=" + + allCommittedTimestamps + + ")", + !overlap); + } + + if (roundNumber % 2 == 0) { + pendingTimestamps = new ArrayList<>(currentTimestamps); + hasPending = true; + System.out.println( + " [rows=" + + totalRowsCommitted + + "/" + + totalRows + + "] New event (NOT 
committed): timestamps=" + + currentTimestamps); + } else { + consumer.commitSync(msg); + totalRowsCommitted += currentTimestamps.size(); + allCommittedTimestamps.addAll(currentTimestamps); + roundNumber++; + System.out.println( + " [rows=" + + totalRowsCommitted + + "/" + + totalRows + + "] New event (committed directly): timestamps=" + + currentTimestamps); + } + } + } + } + + assertEquals("Should have committed all rows", totalRows, totalRowsCommitted); + assertTrue( + "Should have at least 1 re-delivery round (got " + redeliveryCount + ")", + redeliveryCount > 0); + + // Final poll: should be empty + System.out.println(" Final poll: expecting no data"); + int extraRows = 0; + for (int i = 0; i < 3; i++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + for (SubscriptionMessage msg : msgs) { + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + extraRows++; + } + } + } + } + assertEquals("After all committed, should receive no more data", 0, extraRows); + + System.out.println( + " At-least-once re-delivery verified: " + + totalRows + + " rows committed with " + + redeliveryCount + + " re-delivery rounds"); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 9: Multi Consumer Group Independent Consumption + // ============================ + /** + * Two consumer groups subscribe to the same topic. Verifies that each group independently + * receives ALL data (data is not partitioned/split between groups). 
+ */ + private static void testMultiConsumerGroupIndependent() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a"; + String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a"; + String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b"; + String consumerId2 = "consumer_tbl_multi_" + testCounter + "_b"; + ISubscriptionTablePullConsumer consumer1 = null; + ISubscriptionTablePullConsumer consumer2 = null; + + try { + // Create database and initial data + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + // Two consumers in different groups both subscribe to the same topic + consumer1 = createConsumer(consumerId1, consumerGroupId1); + consumer1.subscribe(topicName); + consumer2 = createConsumer(consumerId2, consumerGroupId2); + consumer2.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows + System.out.println(" Writing 50 rows"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + // Poll from group 1 + System.out.println(" Polling from consumer group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 50, 70); + System.out.println(" Group 1 result: " + result1); + + // Poll from group 2 + System.out.println(" Polling from consumer group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 50, 70); + 
System.out.println(" Group 2 result: " + result2); + + // Both groups should have all 50 rows + assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows); + assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows); + System.out.println( + " Independent consumption verified: group1=" + + result1.totalRows + + ", group2=" + + result2.totalRows); + } finally { + // Clean up both consumers + if (consumer1 != null) { + try { + consumer1.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer1.close(); + } catch (Exception e) { + // ignore + } + } + if (consumer2 != null) { + try { + consumer2.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer2.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + } + + // ============================ + // Test 10: Multi Topic Subscription + // ============================ + /** + * One consumer subscribes to two different topics with different TABLE_KEY filters. Verifies that + * each topic delivers only its matching data, and no cross-contamination occurs. 
+ */ + private static void testMultiTopicSubscription() throws Exception { + String database = nextDatabase(); + String topicName1 = "topic_tbl_multi_" + testCounter + "_a"; + String topicName2 = "topic_tbl_multi_" + testCounter + "_b"; + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + // Create database with two tables + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic 1: covers t1 only + createTopicTable(topicName1, database, "t1"); + // Topic 2: covers t2 only + createTopicTable(topicName2, database, "t2"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName1, topicName2); + Thread.sleep(3000); + + // Write 30 rows to t1 and 40 rows to t2 + System.out.println(" Writing 30 rows to t1, 40 rows to t2"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + } + Thread.sleep(2000); + + // Poll all data — should get t1 rows (via topic1) + t2 rows (via topic2) + System.out.println(" Polling (expecting 30 from t1 + 40 from t2 = 70 total)..."); + PollResult result = 
pollUntilComplete(consumer, 70, 80); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 70 rows total (30 t1 + 40 t2)", 70, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + Integer t1Rows = result.rowsPerTable.get("t1"); + Integer t2Rows = result.rowsPerTable.get("t2"); + assertEquals("Expected 30 rows from t1", 30, t1Rows != null ? t1Rows : 0); + assertEquals("Expected 40 rows from t2", 40, t2Rows != null ? t2Rows : 0); + System.out.println( + " Multi-topic isolation verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows"); + } + } finally { + // Clean up consumer, both topics, and database + if (consumer != null) { + try { + consumer.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName1); + dropTopicTable(topicName2); + deleteDatabase(database); + } + } + + // ============================ + // Test 12: Cross-Partition Multi-Write + // ============================ + /** + * Tests that cross-partition writes via all table model write methods are correctly delivered. + * + *

Uses timestamps spaced >1 week apart (default partition interval = 604,800,000ms) to force + * cross-partition distribution. Exercises three write paths: + * + *

+ * + *

The table has 6 FIELD columns (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) plus 1 TAG. Total + * expected rows: 2 + 3 + 4 = 9. + * + *

This test verifies that when a SQL multi-row INSERT or Tablet write spans multiple time + * partitions (causing the plan node to be split into sub-nodes for each partition), all sub-nodes + * are correctly converted by the consensus subscription pipeline. + */ + private static void testCrossPartitionMultiWrite() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + // Gap > default time partition interval (7 days = 604,800,000ms) + final long GAP = 604_800_001L; + final String TABLE = "t1"; + final String SCHEMA = + "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " + + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " + + "s_text TEXT FIELD"; + + try { + // Create database and table, write init row to force DataRegion creation + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, TABLE, SCHEMA); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing cross-partition data via 3 methods..."); + + // --- Method 1: SQL single-row INSERT (2 rows, each in its own partition) --- + long baseTs = 1_000_000_000L; + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + long ts1 = baseTs; + long ts2 = baseTs + GAP; + System.out.println(" Method 1: SQL single-row x2 (ts=" + ts1 + ", " + ts2 + ")"); + 
session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'sql_single_1', %d)", + ts1)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'sql_single_2', %d)", + ts2)); + } + + // --- Method 2: SQL multi-row INSERT (3 rows spanning 3 different partitions) --- + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + long t1 = baseTs + GAP * 2; + long t2 = baseTs + GAP * 3; + long t3 = baseTs + GAP * 4; + System.out.println( + " Method 2: SQL multi-row x3 (ts=" + t1 + ", " + t2 + ", " + t3 + ")"); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'sql_multi_1', %d), " + + "('d1', 4, 400, 4.4, 4.44, false, 'sql_multi_2', %d), " + + "('d1', 5, 500, 5.5, 5.55, true, 'sql_multi_3', %d)", + t1, t2, t3)); + } + + // --- Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions --- + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + + List schemaList = new ArrayList<>(); + schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING)); + schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING)); + + List categories = + java.util.Arrays.asList( + ColumnCategory.TAG, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + 
ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD); + + Tablet tablet = + new Tablet( + TABLE, + IMeasurementSchema.getMeasurementNameList(schemaList), + IMeasurementSchema.getDataTypeList(schemaList), + categories, + 10); + + for (int i = 0; i < 4; i++) { + int row = tablet.getRowSize(); + long ts = baseTs + GAP * (5 + i); // partitions 5, 6, 7, 8 + tablet.addTimestamp(row, ts); + tablet.addValue("tag1", row, "d1"); + tablet.addValue("s_int32", row, 6 + i); + tablet.addValue("s_int64", row, (long) (600 + i * 100)); + tablet.addValue("s_float", row, (6 + i) * 1.1f); + tablet.addValue("s_double", row, (6 + i) * 2.22); + tablet.addValue("s_bool", row, i % 2 == 0); + tablet.addValue("s_text", row, "tablet_" + (i + 1)); + } + System.out.println( + " Method 3: Tablet x4 (ts=" + (baseTs + GAP * 5) + ".." + (baseTs + GAP * 8) + ")"); + session.insert(tablet); + } + + Thread.sleep(2000); + + // Poll — expect 9 rows total (2 + 3 + 4) + final int expectedRows = 9; + System.out.println(" Polling (expecting " + expectedRows + " rows)..."); + PollResult result = pollUntilComplete(consumer, expectedRows, 80); + System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + expectedRows + " cross-partition rows", + expectedRows, + result.totalRows); + // Verify we see all 6 FIELD columns plus tag + assertAtLeast( + "Expected at least 6 data columns in cross-partition result", + 6, + result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 11: Flush Data Delivery + // ============================ + /** + * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable + * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps + * entries available until committed by the subscription consumer. 
+ */ + private static void testFlushDataDelivery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows, then flush before polling + System.out.println(" Writing 50 rows then flushing"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Poll — all 50 rows should be delivered despite flush + System.out.println(" Polling after flush..."); + PollResult result = pollUntilComplete(consumer, 50, 70); + System.out.println(" Result: " + result); + assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } +} diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java new file mode 100644 index 0000000000000..1ab7a910c0324 --- /dev/null +++ 
b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java @@ -0,0 +1,1460 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ISession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.session.Session; +import org.apache.iotdb.session.subscription.SubscriptionTreeSession; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet; + +import org.apache.tsfile.common.conf.TSFileConfig; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +/** TODO: move these manual tests 
into ITs */ +public class ConsensusSubscriptionTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List<String> failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + System.out.println("=== Consensus-Based Subscription Test Suite ===\n"); + + String targetTest = args.length > 0 ? args[0] : null; + + if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) { + runTest("testBasicDataDelivery", ConsensusSubscriptionTest::testBasicDataDelivery); + } + if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) { + runTest("testMultipleDataTypes", ConsensusSubscriptionTest::testMultipleDataTypes); + } + if (targetTest == null || "testDeviceLevelFiltering".equals(targetTest)) { + runTest("testDeviceLevelFiltering", ConsensusSubscriptionTest::testDeviceLevelFiltering); + } + if (targetTest == null || "testTimeseriesLevelFiltering".equals(targetTest)) { + runTest( + "testTimeseriesLevelFiltering", ConsensusSubscriptionTest::testTimeseriesLevelFiltering); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultipleDevicesAggregation".equals(targetTest)) { + runTest( + "testMultipleDevicesAggregation", + ConsensusSubscriptionTest::testMultipleDevicesAggregation); + } + if (targetTest == null || "testAlignedTimeseries".equals(targetTest)) { + runTest("testAlignedTimeseries", ConsensusSubscriptionTest::testAlignedTimeseries); + } + if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) { + runTest("testPollWithoutCommit", ConsensusSubscriptionTest::testPollWithoutCommit); + } + if 
(targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) { + runTest( + "testMultiConsumerGroupIndependent", + ConsensusSubscriptionTest::testMultiConsumerGroupIndependent); + } + if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) { + runTest("testMultiTopicSubscription", ConsensusSubscriptionTest::testMultiTopicSubscription); + } + if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) { + runTest("testFlushDataDelivery", ConsensusSubscriptionTest::testFlushDataDelivery); + } + if (targetTest == null || "testCrossPartitionAligned".equals(targetTest)) { + runTest("testCrossPartitionAligned", ConsensusSubscriptionTest::testCrossPartitionAligned); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + // ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "root.csub_test_" + 
testCounter; + } + + private static String nextTopic() { + return "topic_csub_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_csub_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_csub_" + testCounter; + } + + private static ISession openSession() throws Exception { + ISession session = + new Session.Builder().host(HOST).port(PORT).username(USER).password(PASSWORD).build(); + session.open(); + return session; + } + + private static void createDatabase(ISession session, String database) throws Exception { + try { + session.executeNonQueryStatement("CREATE DATABASE " + database); + } catch (Exception e) { + // ignore if already exists + } + } + + private static void deleteDatabase(String database) { + try (ISession session = openSession()) { + session.executeNonQueryStatement("DELETE DATABASE " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopic(String topicName) { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + subSession.open(); + subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopic(String topicName, String path) throws Exception { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + subSession.open(); + try { + subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put( + TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + topicConfig.put(TopicConstant.PATH_KEY, path); + subSession.createTopic(topicName, topicConfig); + System.out.println(" Created topic: " + topicName + " (path=" + path + ")"); + } + } + + private static SubscriptionTreePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + 
SubscriptionTreePullConsumer consumer = + new SubscriptionTreePullConsumer.Builder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .buildPullConsumer(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive + * empty rounds to verify no extra data arrives. + */ + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, int expectedRows, int maxPollAttempts) { + return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true); + } + + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); 
+        } catch (InterruptedException ignored) {
+        }
+        continue;
+      }
+
+      consecutiveEmpty = 0;
+
+      for (SubscriptionMessage message : messages) {
+        for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) {
+          String device = null;
+          List<String> columnNames = dataSet.getColumnNames();
+          if (columnNames.size() > 1) {
+            String fullPath = columnNames.get(1);
+            int lastDot = fullPath.lastIndexOf('.');
+            device = lastDot > 0 ? fullPath.substring(0, lastDot) : fullPath;
+          }
+
+          while (dataSet.hasNext()) {
+            org.apache.tsfile.read.common.RowRecord record = dataSet.next();
+            result.totalRows++;
+            if (device != null) {
+              result.rowsPerDevice.merge(device, 1, Integer::sum);
+            }
+            for (int i = 1; i < columnNames.size(); i++) {
+              result.seenColumns.add(columnNames.get(i));
+            }
+            if (result.totalRows <= 5) {
+              System.out.println(
+                  " Row: time="
+                      + record.getTimestamp()
+                      + ", values="
+                      + record.getFields()
+                      + ", device="
+                      + device);
+            }
+          }
+        }
+        if (commitMessages) {
+          consumer.commitSync(message);
+        }
+      }
+
+      System.out.println(
+          " Poll attempt "
+              + attempt
+              + ": totalRows="
+              + result.totalRows
+              + " / expected="
+              + expectedRows);
+
+      // Stop immediately if we exceeded the expected row count
+      if (expectedRows > 0 && result.totalRows > expectedRows) {
+        System.out.println(
+            " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows);
+        break;
+      }
+    }
+
+    return result;
+  }
+
+  // ============================
+  // Cleanup
+  // ============================
+
+  /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database.
+   */
+  private static void cleanup(
+      SubscriptionTreePullConsumer consumer, String topicName, String database) {
+    if (consumer != null) {
+      try {
+        consumer.unsubscribe(topicName);
+      } catch (Exception e) {
+        // ignore
+      }
+      try {
+        consumer.close();
+      } catch (Exception e) {
+        // ignore
+      }
+    }
+    dropTopic(topicName);
+    deleteDatabase(database);
+  }
+
+  // ============================
+  // Result & Assertions
+  // ============================
+
+  static class PollResult {
+    int totalRows = 0;
+    Map<String, Integer> rowsPerDevice = new HashMap<>();
+    Set<String> seenColumns = new HashSet<>();
+
+    @Override
+    public String toString() {
+      return "PollResult{totalRows="
+          + totalRows
+          + ", rowsPerDevice="
+          + rowsPerDevice
+          + ", seenColumns="
+          + seenColumns
+          + "}";
+    }
+  }
+
+  private static void assertEquals(String msg, int expected, int actual) {
+    if (expected != actual) {
+      throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual);
+    }
+  }
+
+  private static void assertTrue(String msg, boolean condition) {
+    if (!condition) {
+      throw new AssertionError(msg);
+    }
+  }
+
+  private static void assertAtLeast(String msg, int min, int actual) {
+    if (actual < min) {
+      throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual);
+    }
+  }
+
+  // ============================
+  // Test 1: Basic Data Delivery
+  // ============================
+  /**
+   * Verifies the basic consensus subscription flow: write before subscribe (not received), write
+   * after subscribe (received), and no extra data beyond expectation.
+ */ + private static void testBasicDataDelivery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Step 1: Write initial data to create DataRegion + System.out.println(" Step 1: Writing initial data (should NOT be received)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 50; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)", + database, i, i * 10, i * 1.5)); + } + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write new data AFTER subscription + System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)"); + try (ISession session = openSession()) { + for (int i = 100; i < 200; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)", + database, i, i * 10, i * 1.5)); + } + } + Thread.sleep(2000); + + // Step 4: Poll and verify exact count (also verifies no extra data) + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 2: Multiple Data Types (Non-Aligned) + // ============================ + /** + * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, 
BOOLEAN, TEXT) using + * separate INSERT statements per type (non-aligned), and verifies all types are delivered. + */ + private static void testMultipleDataTypes() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing data with 6 data types x 20 rows each"); + try (ISession session = openSession()) { + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_int64) VALUES (%d, %d)", + database, i, (long) i * 100000L)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_float) VALUES (%d, %f)", database, i, i * 1.1f)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_double) VALUES (%d, %f)", database, i, i * 2.2)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_bool) VALUES (%d, %s)", + database, i, i % 2 == 0 ? 
"true" : "false")); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling..."); + PollResult result = pollUntilComplete(consumer, 120, 120); + System.out.println(" Result: " + result); + + assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows); + System.out.println(" Seen columns: " + result.seenColumns); + assertTrue( + "Expected multiple column types in result, got: " + result.seenColumns, + result.seenColumns.size() > 1); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 3: Device-Level Filtering + // ============================ + /** + * Creates a topic that only matches root.db.d1.** and verifies that data written to d2 is NOT + * delivered. + */ + private static void testDeviceLevelFiltering() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + String filterPath = database + ".d1.**"; + createTopic(topicName, filterPath); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to both d1 and d2 (topic filter: d1.** only)"); + try (ISession session = openSession()) { + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO 
%s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only d1 data)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows from d1 only", 50, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); + assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0); + Integer d1Rows = result.rowsPerDevice.get(database + ".d1"); + assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0); + System.out.println( + " Device filtering verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows"); + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 4: Timeseries-Level Filtering + // ============================ + /** + * Creates a topic matching root.db.d1.s1 only. Tests whether the converter filters at measurement + * level. Lenient: if both s1 and s2 arrive, reports device-level-only filtering. 
+ */ + private static void testTimeseriesLevelFiltering() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + String filterPath = database + ".d1.s1"; + createTopic(topicName, filterPath); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to d1.s1 and d1.s2 (topic filter: d1.s1 only)"); + try (ISession session = openSession()) { + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)", + database, i, i * 10, i * 20)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only s1 data)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + System.out.println(" Seen columns: " + result.seenColumns); + boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2")); + if (hasS2) { + System.out.println( + " INFO: Both s1 and s2 received — converter uses device-level filtering only."); + assertAtLeast("Should have received some rows", 50, result.totalRows); + } else { + System.out.println(" Timeseries-level filtering verified: only s1 data received"); + assertEquals("Expected exactly 50 rows from s1 only", 50, result.totalRows); + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 5: Subscribe Before Region Creation + // ============================ + /** + * Subscribe 
BEFORE the database/region exists, then create database and write. Tests the + * IoTConsensus.onNewPeerCreated auto-binding path. + */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database and writing data (100 rows)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling (auto-binding should have picked up new region)..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. 
Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 6: Multiple Devices Aggregation + // ============================ + /** Writes to d1, d2, d3 and verifies all are received via a broad topic path. */ + private static void testMultipleDevicesAggregation() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to 3 devices (d1, d2, d3), 30 rows each"); + try (ISession session = openSession()) { + for (int i = 100; i < 130; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting 90 total from 3 devices)..."); + 
PollResult result = pollUntilComplete(consumer, 90, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 90 rows total (30 per device)", 90, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + System.out.println(" Rows per device: " + result.rowsPerDevice); + for (String dev : new String[] {"d1", "d2", "d3"}) { + Integer devRows = result.rowsPerDevice.get(database + "." + dev); + assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0); + } + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 7: Aligned Timeseries + // ============================ + /** + * Creates aligned timeseries with 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and + * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are + * delivered correctly. + */ + private static void testAlignedTimeseries() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Create aligned timeseries with multiple data types + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format( + "CREATE ALIGNED TIMESERIES %s.d_aligned" + + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," + + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", + database)); + // Write initial row to force DataRegion creation + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", + database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, 
consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 aligned rows, each with all 6 data types in a single INSERT + System.out.println(" Writing 50 aligned rows with 6 data types per row"); + try (ISession session = openSession()) { + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, %d, %d, %f, %f, %s, 'text_%d')", + database, + i, + i, + (long) i * 100000L, + i * 1.1f, + i * 2.2, + i % 2 == 0 ? "true" : "false", + i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling..."); + PollResult result = pollUntilComplete(consumer, 50, 70); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 aligned rows", 50, result.totalRows); + // Verify we see columns for multiple data types + System.out.println(" Seen columns: " + result.seenColumns); + assertAtLeast( + "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 8: Poll Without Commit (Re-delivery) + // ============================ + /** + * Tests at-least-once delivery with a mixed commit/no-commit pattern. + * + *

Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we + * track committed ROWS (not events). The state machine alternates: + * + *

+ * + *

This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal + * commit path in an interleaved fashion. + */ + private static void testPollWithoutCommit() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows (may be batched into fewer events by the prefetching thread) + final int totalRows = 50; + System.out.println(" Writing " + totalRows + " rows"); + try (ISession session = openSession()) { + for (int i = 1; i <= totalRows; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(3000); + + // State machine: alternate between skip-commit and direct-commit. + // Track committed ROWS (not events) because batching is unpredictable. 
+      int totalRowsCommitted = 0;
+      int roundNumber = 0; // counts distinct events seen (used for alternation)
+      boolean hasPending = false;
+      List<Long> pendingTimestamps = new ArrayList<>(); // timestamps from the uncommitted event
+      Set<Long> allCommittedTimestamps = new HashSet<>(); // all timestamps ever committed
+      int redeliveryCount = 0;
+
+      for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) {
+        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(5000));
+        if (msgs.isEmpty()) {
+          Thread.sleep(1000);
+          continue;
+        }
+
+        for (SubscriptionMessage msg : msgs) {
+          // Extract ALL timestamps from this event (may contain multiple rows)
+          List<Long> currentTimestamps = new ArrayList<>();
+          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+            while (ds.hasNext()) {
+              currentTimestamps.add(ds.next().getTimestamp());
+            }
+          }
+          assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0);
+
+          if (hasPending) {
+            // === Re-delivery round: verify EXACT same timestamps ===
+            assertTrue(
+                "Re-delivery timestamp list mismatch: expected="
+                    + pendingTimestamps
+                    + ", actual="
+                    + currentTimestamps,
+                currentTimestamps.equals(pendingTimestamps));
+            consumer.commitSync(msg);
+            totalRowsCommitted += currentTimestamps.size();
+            allCommittedTimestamps.addAll(currentTimestamps);
+            hasPending = false;
+            redeliveryCount++;
+            roundNumber++;
+            System.out.println(
+                " [rows="
+                    + totalRowsCommitted
+                    + "/"
+                    + totalRows
+                    + "] Re-delivered & committed: timestamps="
+                    + currentTimestamps);
+          } else {
+            // === New event round ===
+            // After a commit, verify this is DIFFERENT data (no overlap with committed set)
+            if (totalRowsCommitted > 0) {
+              boolean overlap = false;
+              for (Long ts : currentTimestamps) {
+                if (allCommittedTimestamps.contains(ts)) {
+                  overlap = true;
+                  break;
+                }
+              }
+              assertTrue(
+                  "After commit, should receive different data (timestamps="
+                      + currentTimestamps
+                      + " overlap with committed="
+                      + allCommittedTimestamps
+                      + ")",
+                  !overlap);
+            }
+
+            // Even-numbered rounds: skip commit (test re-delivery)
+            // Odd-numbered rounds: commit directly (test normal flow)
+            if (roundNumber % 2 == 0) {
+              pendingTimestamps = new ArrayList<>(currentTimestamps);
+              hasPending = true;
+              System.out.println(
+                  " [rows="
+                      + totalRowsCommitted
+                      + "/"
+                      + totalRows
+                      + "] New event (NOT committed): timestamps="
+                      + currentTimestamps);
+            } else {
+              consumer.commitSync(msg);
+              totalRowsCommitted += currentTimestamps.size();
+              allCommittedTimestamps.addAll(currentTimestamps);
+              roundNumber++;
+              System.out.println(
+                  " [rows="
+                      + totalRowsCommitted
+                      + "/"
+                      + totalRows
+                      + "] New event (committed directly): timestamps="
+                      + currentTimestamps);
+            }
+          }
+        }
+      }
+
+      assertEquals("Should have committed all rows", totalRows, totalRowsCommitted);
+      assertTrue(
+          "Should have at least 1 re-delivery round (got " + redeliveryCount + ")",
+          redeliveryCount > 0);
+
+      // Final poll: should be empty
+      System.out.println(" Final poll: expecting no data");
+      int extraRows = 0;
+      for (int i = 0; i < 3; i++) {
+        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
+        for (SubscriptionMessage msg : msgs) {
+          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
+            while (ds.hasNext()) {
+              ds.next();
+              extraRows++;
+            }
+          }
+        }
+      }
+      assertEquals("After all committed, should receive no more data", 0, extraRows);
+
+      System.out.println(
+          " At-least-once re-delivery verified: "
+              + totalRows
+              + " rows committed with "
+              + redeliveryCount
+              + " re-delivery rounds");
+    } finally {
+      cleanup(consumer, topicName, database);
+    }
+  }
+
+  // ============================
+  // Test 9: Multi Consumer Group Independent Consumption
+  // ============================
+  /**
+   * Two consumer groups subscribe to the same topic. Verifies that each group independently
+   * receives ALL data (data is not partitioned/split between groups).
+ */ + private static void testMultiConsumerGroupIndependent() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId1 = "cg_multi_" + testCounter + "_a"; + String consumerId1 = "consumer_multi_" + testCounter + "_a"; + String consumerGroupId2 = "cg_multi_" + testCounter + "_b"; + String consumerId2 = "consumer_multi_" + testCounter + "_b"; + SubscriptionTreePullConsumer consumer1 = null; + SubscriptionTreePullConsumer consumer2 = null; + + try { + // Create database and initial data + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + // Two consumers in different groups both subscribe to the same topic + consumer1 = createConsumer(consumerId1, consumerGroupId1); + consumer1.subscribe(topicName); + consumer2 = createConsumer(consumerId2, consumerGroupId2); + consumer2.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows + System.out.println(" Writing 50 rows"); + try (ISession session = openSession()) { + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(2000); + + // Poll from group 1 + System.out.println(" Polling from consumer group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 50, 70); + System.out.println(" Group 1 result: " + result1); + + // Poll from group 2 + System.out.println(" Polling from consumer group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 50, 70); + System.out.println(" Group 2 result: " + result2); + + // Both groups should have all 50 rows + assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows); + assertEquals("Group 2 
should receive all 50 rows", 50, result2.totalRows); + System.out.println( + " Independent consumption verified: group1=" + + result1.totalRows + + ", group2=" + + result2.totalRows); + } finally { + // Clean up both consumers + if (consumer1 != null) { + try { + consumer1.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer1.close(); + } catch (Exception e) { + // ignore + } + } + if (consumer2 != null) { + try { + consumer2.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer2.close(); + } catch (Exception e) { + // ignore + } + } + dropTopic(topicName); + deleteDatabase(database); + } + } + + // ============================ + // Test 10: Multi Topic Subscription + // ============================ + /** + * One consumer subscribes to two different topics with different path filters. Verifies that each + * topic delivers only its matching data, and no cross-contamination occurs. + */ + private static void testMultiTopicSubscription() throws Exception { + String database = nextDatabase(); + String topicName1 = "topic_multi_" + testCounter + "_a"; + String topicName2 = "topic_multi_" + testCounter + "_b"; + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Create database with two device groups + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic 1: covers d1 only + createTopic(topicName1, database + ".d1.**"); + // Topic 2: covers d2 only + createTopic(topicName2, database + ".d2.**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + 
consumer.subscribe(topicName1, topicName2); + Thread.sleep(3000); + + // Write 30 rows to d1 and 40 rows to d2 + System.out.println(" Writing 30 rows to d1, 40 rows to d2"); + try (ISession session = openSession()) { + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + } + } + Thread.sleep(2000); + + // Poll all data — should get d1 rows (via topic1) + d2 rows (via topic2) + System.out.println(" Polling (expecting 30 from d1 + 40 from d2 = 70 total)..."); + PollResult result = pollUntilComplete(consumer, 70, 80); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 70 rows total (30 d1 + 40 d2)", 70, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + Integer d1Rows = result.rowsPerDevice.get(database + ".d1"); + Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); + assertEquals("Expected 30 rows from d1", 30, d1Rows != null ? d1Rows : 0); + assertEquals("Expected 40 rows from d2", 40, d2Rows != null ? d2Rows : 0); + System.out.println( + " Multi-topic isolation verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows"); + } + } finally { + // Clean up consumer, both topics, and database + if (consumer != null) { + try { + consumer.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopic(topicName1); + dropTopic(topicName2); + deleteDatabase(database); + } + } + + // ============================ + // Test 11: Flush Data Delivery + // ============================ + /** + * Subscribes first, then writes data and flushes before polling. 
Verifies that flushing (memtable + * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps + * entries available until committed by the subscription consumer. + */ + private static void testFlushDataDelivery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write 50 rows, then flush before polling + System.out.println(" Writing 50 rows then flushing"); + try (ISession session = openSession()) { + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Poll — all 50 rows should be delivered despite flush + System.out.println(" Polling after flush..."); + PollResult result = pollUntilComplete(consumer, 50, 70); + System.out.println(" Result: " + result); + assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ============================ + // Test 12: Cross-Partition Aligned Timeseries (Multiple Write Methods) + // ============================ + /** + * Tests cross-partition aligned timeseries with 6 data types, written via six different aligned + * methods. 
Timestamps are spaced >1 week apart to force different time partitions, exercising the + * WAL merge path for multi-partition inserts. + * + *

Write methods (all aligned): + * + *

    + *
  1. SQL single row + *
  2. SQL multi-row (cross-partition) + *
  3. session.insertAlignedRecord (single row) + *
  4. session.insertAlignedRecordsOfOneDevice (cross-partition) + *
  5. session.insertAlignedTablet (cross-partition) + *
  6. session.insertAlignedTablets (cross-partition) + *
+ */ + private static void testCrossPartitionAligned() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + // Gap slightly over 1 week (default partition interval = 604,800,000ms) + final long GAP = 604_800_001L; + final String device = database + ".d_aligned"; + + try { + // Create aligned timeseries with 6 data types + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format( + "CREATE ALIGNED TIMESERIES %s.d_aligned" + + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," + + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", + database)); + // Init row to force DataRegion creation + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", + database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Shared measurement info for Session API calls + List measurements = + Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text"); + List types = + Arrays.asList( + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + TSDataType.BOOLEAN, + TSDataType.TEXT); + + // Shared schema for Tablet API calls + List schemas = new ArrayList<>(); + schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemas.add(new 
MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT)); + + System.out.println(" Writing cross-partition aligned data via 6 methods"); + int totalExpected = 0; + + try (ISession session = openSession()) { + + // --- Method 1: SQL single row --- + long t1 = 1; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')", + database, t1)); + totalExpected += 1; + System.out.println(" Method 1 (SQL single row): 1 row"); + + // --- Method 2: SQL multi-row (cross-partition, 2 rows >1 week apart) --- + long t2a = 1 + GAP; + long t2b = 1 + 2 * GAP; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a')," + + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')", + database, t2a, t2b)); + totalExpected += 2; + System.out.println(" Method 2 (SQL multi-row, cross-partition): 2 rows"); + + // --- Method 3: insertAlignedRecord (single row) --- + long t3 = 1 + 3 * GAP; + List values3 = Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single"); + session.insertAlignedRecord(device, t3, measurements, types, values3); + totalExpected += 1; + System.out.println(" Method 3 (insertAlignedRecord): 1 row"); + + // --- Method 4: insertAlignedRecordsOfOneDevice (cross-partition, 2 rows) --- + long t4a = 1 + 4 * GAP; + long t4b = 1 + 5 * GAP; + session.insertAlignedRecordsOfOneDevice( + device, + Arrays.asList(t4a, t4b), + Arrays.asList(measurements, measurements), + Arrays.asList(types, types), + Arrays.asList( + Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"), + Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b"))); + totalExpected += 2; + System.out.println( + " Method 4 (insertAlignedRecordsOfOneDevice, cross-partition): 2 rows"); + 
+ // --- Method 5: insertAlignedTablet (cross-partition, 2 rows) --- + long t5a = 1 + 6 * GAP; + long t5b = 1 + 7 * GAP; + Tablet tablet5 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a"); + addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b"); + session.insertAlignedTablet(tablet5); + totalExpected += 2; + System.out.println(" Method 5 (insertAlignedTablet, cross-partition): 2 rows"); + + // --- Method 6: insertAlignedTablets (cross-partition, 2 rows) --- + long t6a = 1 + 8 * GAP; + long t6b = 1 + 9 * GAP; + Tablet tablet6 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a"); + addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b"); + Map tabletMap = new HashMap<>(); + tabletMap.put(device, tablet6); + session.insertAlignedTablets(tabletMap); + totalExpected += 2; + System.out.println(" Method 6 (insertAlignedTablets, cross-partition): 2 rows"); + } + + System.out.println(" Total expected rows: " + totalExpected); + Thread.sleep(2000); + + System.out.println(" Polling..."); + PollResult result = pollUntilComplete(consumer, totalExpected, 100); + System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + totalExpected + " cross-partition aligned rows", + totalExpected, + result.totalRows); + assertAtLeast( + "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + /** Helper: populate one row of an aligned Tablet with all 6 data types. 
*/ + private static void addAlignedTabletRow( + Tablet tablet, + int rowIndex, + long timestamp, + int intVal, + long longVal, + float floatVal, + double doubleVal, + boolean boolVal, + String textVal) { + tablet.addTimestamp(rowIndex, timestamp); + tablet.addValue("s_int32", rowIndex, intVal); + tablet.addValue("s_int64", rowIndex, longVal); + tablet.addValue("s_float", rowIndex, floatVal); + tablet.addValue("s_double", rowIndex, doubleVal); + tablet.addValue("s_bool", rowIndex, boolVal); + tablet.addValue("s_text", rowIndex, new Binary(textVal, TSFileConfig.STRING_CHARSET)); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java index cb5edd8cd91a3..6b71d5b16f79a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java @@ -39,6 +39,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -52,6 +53,7 @@ import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure { @@ -66,6 +68,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP private AlterConsumerGroupProcedure alterConsumerGroupProcedure; 
private List createPipeProcedures = new ArrayList<>(); + private Set consensusTopicNames = new HashSet<>(); + // TODO: remove this variable later private final List alterTopicProcedures = new ArrayList<>(); // unused now @@ -103,15 +107,41 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) alterConsumerGroupProcedure = new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo); - // Construct CreatePipeProcedureV2s + // Construct CreatePipeProcedureV2s (for non-consensus topics) for (final String topicName : subscribeReq.getTopicNames()) { + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); + + // Check if this topic should use consensus subscription: mode is live, format is Tablet + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + // skip pipe creation + consensusTopicNames.add(topicName); + LOGGER.info( + "CreateSubscriptionProcedure: topic [{}] uses consensus-based subscription " + + "(mode={}, format={}), skipping pipe creation", + topicName, + topicMode, + topicFormat); + continue; + } + final String pipeName = PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId); if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId) // even if there existed subscription meta, if there is no corresponding pipe meta, it // will try to create the pipe || !pipeTaskInfo.get().isPipeExisted(pipeName)) { - final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); createPipeProcedures.add( new 
CreatePipeProcedureV2( new TCreatePipeReq() @@ -177,20 +207,29 @@ protected void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) // Push consumer group meta to data nodes alterConsumerGroupProcedure.executeFromOperateOnDataNodes(env); - // Push pipe meta to data nodes - final List pipeNames = - createPipeProcedures.stream() - .map(CreatePipeProcedureV2::getPipeName) - .collect(Collectors.toList()); - final String exceptionMessage = - AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( - null, pushMultiPipeMetaToDataNodes(pipeNames, env)); - if (!exceptionMessage.isEmpty()) { - // throw exception instead of logging warn, do not rely on metadata synchronization - throw new SubscriptionException( - String.format( - "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", - pipeNames, subscribeReq, exceptionMessage)); + if (!consensusTopicNames.isEmpty()) { + LOGGER.info( + "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode " + + "via consumer group meta push (no pipe creation needed)", + consensusTopicNames); + } + + // Push pipe meta to data nodes (only for non-consensus pipe-based topics) + if (!createPipeProcedures.isEmpty()) { + final List pipeNames = + createPipeProcedures.stream() + .map(CreatePipeProcedureV2::getPipeName) + .collect(Collectors.toList()); + final String exceptionMessage = + AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( + null, pushMultiPipeMetaToDataNodes(pipeNames, env)); + if (!exceptionMessage.isEmpty()) { + // throw exception instead of logging warn, do not rely on metadata synchronization + throw new SubscriptionException( + String.format( + "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", + pipeNames, subscribeReq, exceptionMessage)); + } } } @@ -297,6 +336,12 @@ public void serialize(final DataOutputStream stream) 
throws IOException { } else { ReadWriteIOUtils.write(false, stream); } + + // Serialize consensus topic names + ReadWriteIOUtils.write(consensusTopicNames.size(), stream); + for (final String consensusTopicName : consensusTopicNames) { + ReadWriteIOUtils.write(consensusTopicName, stream); + } } @Override @@ -348,6 +393,14 @@ public void deserialize(final ByteBuffer byteBuffer) { } } } + + // Deserialize consensus topic names + if (byteBuffer.hasRemaining()) { + size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer)); + } + } } @Override @@ -364,7 +417,8 @@ public boolean equals(final Object o) { && getCycles() == that.getCycles() && Objects.equals(subscribeReq, that.subscribeReq) && Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure) - && Objects.equals(createPipeProcedures, that.createPipeProcedures); + && Objects.equals(createPipeProcedures, that.createPipeProcedures) + && Objects.equals(consensusTopicNames, that.consensusTopicNames); } @Override @@ -375,7 +429,8 @@ public int hashCode() { getCycles(), subscribeReq, alterConsumerGroupProcedure, - createPipeProcedures); + createPipeProcedures, + consensusTopicNames); } @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java index 6741a6c1e2a84..99f8ed649d852 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java @@ -22,6 +22,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import 
org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -100,6 +102,31 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) for (final String topic : unsubscribeReq.getTopicNames()) { if (topicsUnsubByGroup.contains(topic)) { + // Check if this topic uses consensus-based subscription (same detection as + // CreateSubscriptionProcedure). Consensus topics have no pipe to drop. 
+ final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic); + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + LOGGER.info( + "DropSubscriptionProcedure: topic [{}] is consensus-based (mode={}, format={}), " + + "skipping pipe removal", + topic, + topicMode, + topicFormat); + continue; + } + // Topic will be subscribed by no consumers in this group dropPipeProcedures.add( new DropPipeProcedureV2( diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index 959191ca2d6d3..c494ae05d01b0 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -98,6 +98,13 @@ public class IoTConsensus implements IConsensus { private final IoTConsensusRPCService service; private final RegisterManager registerManager = new RegisterManager(); private IoTConsensusConfig config; + + /** + * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used + * by the subscription system to auto-bind prefetching queues to new DataRegions. 
+ */ + public static volatile BiConsumer onNewPeerCreated; + private final IClientManager clientManager; private final IClientManager syncClientManager; private final ScheduledExecutorService backgroundTaskService; @@ -299,6 +306,16 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) if (exist.get()) { throw new ConsensusGroupAlreadyExistException(groupId); } + + // Notify subscription system about new peer creation for auto-binding + final BiConsumer callback = onNewPeerCreated; + if (callback != null) { + try { + callback.accept(groupId, stateMachineMap.get(groupId)); + } catch (final Exception e) { + logger.warn("onNewPeerCreated callback failed for group {}", groupId, e); + } + } } @Override diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java index 567261efffffa..bb5d4aa603417 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java @@ -89,13 +89,16 @@ import java.util.PriorityQueue; import java.util.TreeSet; import java.util.UUID; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; +import java.util.function.LongSupplier; import java.util.regex.Pattern; import static org.apache.iotdb.commons.utils.FileUtils.humanReadableByteCountSI; @@ -128,6 +131,14 @@ public class IoTConsensusServerImpl { IoTConsensusRateLimiter.getInstance(); private IndexedConsensusRequest lastConsensusRequest; + // 
Subscription queues receive IndexedConsensusRequest in real-time from write(), + // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush. + private final List> subscriptionQueues = + new CopyOnWriteArrayList<>(); + // Suppliers that report each subscription consumer's acknowledged search index. + // Used to pin WAL files: entries >= min(suppliers) cannot be deleted. + private final List subscriptionSyncIndexSuppliers = new CopyOnWriteArrayList<>(); + public IoTConsensusServerImpl( String storageDir, Peer thisNode, @@ -236,6 +247,44 @@ public TSStatus write(IConsensusRequest request) { // in one transaction. synchronized (searchIndex) { logDispatcher.offer(indexedConsensusRequest); + // Deliver to subscription queues for real-time in-memory consumption. + // Offer AFTER stateMachine.write() so that InsertNode has inferred types + // and properly typed values (same timing as LogDispatcher). + final int sqCount = subscriptionQueues.size(); + if (sqCount > 0) { + logger.debug( + "write() offering to {} subscription queue(s), " + + "group={}, searchIndex={}, requestType={}", + sqCount, + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + indexedConsensusRequest.getRequests().isEmpty() + ? 
"EMPTY" + : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName()); + for (final BlockingQueue sq : subscriptionQueues) { + final boolean offered = sq.offer(indexedConsensusRequest); + logger.debug( + "offer result={}, queueSize={}, queueRemaining={}", + offered, + sq.size(), + sq.remainingCapacity()); + if (!offered) { + logger.warn( + "Subscription queue full, dropped entry searchIndex={}", + indexedConsensusRequest.getSearchIndex()); + } + } + } else { + // Log periodically when no subscription queues are registered + if (indexedConsensusRequest.getSearchIndex() % 50 == 0) { + logger.debug( + "write() no subscription queues registered, " + + "group={}, searchIndex={}, this={}", + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + System.identityHashCode(this)); + } + } searchIndex.incrementAndGet(); } // statistic the time of offering request into queue @@ -243,10 +292,13 @@ public TSStatus write(IConsensusRequest request) { System.nanoTime() - writeToStateMachineEndTime); } else { logger.debug( - "{}: write operation failed. searchIndex: {}. Code: {}", + "write operation FAILED. group={}, searchIndex={}, code={}, " + + "subscriptionQueues={}, this={}", thisNode.getGroupId(), indexedConsensusRequest.getSearchIndex(), - result.getCode()); + result.getCode(), + subscriptionQueues.size(), + System.identityHashCode(this)); } // statistic the time of total write process ioTConsensusServerMetrics.recordConsensusWriteTime( @@ -757,6 +809,47 @@ public long getSearchIndex() { return searchIndex.get(); } + public ConsensusReqReader getConsensusReqReader() { + return consensusReqReader; + } + + /** + * Registers a subscription pending queue for real-time in-memory data delivery. When {@link + * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered + * subscription queues, enabling subscription consumers to receive data without waiting for WAL + * flush. 
+ * + * @param queue the blocking queue to receive IndexedConsensusRequest entries + * @param syncIndexSupplier supplies the subscription consumer's current acknowledged search + * index, used by WAL pinning to prevent deletion of unacknowledged entries + */ + public void registerSubscriptionQueue( + final BlockingQueue queue, final LongSupplier syncIndexSupplier) { + subscriptionQueues.add(queue); + subscriptionSyncIndexSuppliers.add(syncIndexSupplier); + // Immediately re-evaluate the safe delete index to protect WAL for this subscriber + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Registered subscription queue for group {}, " + + "total subscription queues: {}, currentSearchIndex={}, this={}", + consensusGroupId, + subscriptionQueues.size(), + searchIndex.get(), + System.identityHashCode(this)); + } + + public void unregisterSubscriptionQueue( + final BlockingQueue queue, final LongSupplier syncIndexSupplier) { + subscriptionQueues.remove(queue); + subscriptionSyncIndexSuppliers.remove(syncIndexSupplier); + // Re-evaluate: with fewer subscribers, more WAL may be deletable + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Unregistered subscription queue for group {}, remaining subscription queues: {}", + consensusGroupId, + subscriptionQueues.size()); + } + public long getSyncLag() { long minSyncIndex = getMinSyncIndex(); return getSearchIndex() - minSyncIndex; @@ -879,10 +972,25 @@ void checkAndUpdateSafeDeletedSearchIndex() { if (configuration.isEmpty()) { logger.error( "Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time."); - } else if (configuration.size() == 1) { + return; + } + + // Compute the minimum search index that subscription consumers still need. + // WAL entries at or after this index must be preserved. 
+ long minSubscriptionIndex = Long.MAX_VALUE; + for (final LongSupplier supplier : subscriptionSyncIndexSuppliers) { + minSubscriptionIndex = Math.min(minSubscriptionIndex, supplier.getAsLong()); + } + + if (configuration.size() == 1 && subscriptionSyncIndexSuppliers.isEmpty()) { + // Single replica, no subscription consumers => delete all WAL freely consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE); } else { - consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex()); + // min(replication progress, subscription progress) — preserve WAL for both + final long replicationIndex = + configuration.size() > 1 ? getMinFlushedSyncIndex() : Long.MAX_VALUE; + consensusReqReader.setSafelyDeletedSearchIndex( + Math.min(replicationIndex, minSubscriptionIndex)); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java index 510f8559bc147..220ad3e449951 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java @@ -19,7 +19,11 @@ package org.apache.iotdb.db.subscription.agent; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker; import org.apache.iotdb.db.subscription.broker.SubscriptionBroker; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager; import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask; @@ -30,6 +34,8 @@ import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -43,7 +49,12 @@ public class SubscriptionBrokerAgent { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class); - private final Map consumerGroupIdToSubscriptionBroker = + /** Pipe-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToPipeBroker = + new ConcurrentHashMap<>(); + + /** Consensus-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToConsensusBroker = new ConcurrentHashMap<>(); private final Cache prefetchingQueueCount = @@ -54,17 +65,54 @@ public class SubscriptionBrokerAgent { public List poll( final ConsumerConfig consumerConfig, final Set topicNames, final long maxBytes) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allEvents = new ArrayList<>(); + long remainingBytes = maxBytes; + + // Poll from pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.nonNull(pipeBroker)) { + final List pipeEvents = + pipeBroker.poll(consumerId, topicNames, remainingBytes); + allEvents.addAll(pipeEvents); + for (final SubscriptionEvent event : pipeEvents) { + try { + remainingBytes -= event.getCurrentResponseSize(); + } catch (final IOException ignored) { + // best effort + } + } + } + + // Poll from consensus-based broker + if (remainingBytes > 0) { + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker)) { + LOGGER.debug( + "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], " + + 
"topicNames={}, remainingBytes={}", + consumerGroupId, + topicNames, + remainingBytes); + allEvents.addAll(consensusBroker.poll(consumerId, topicNames, remainingBytes)); + } else { + LOGGER.debug( + "SubscriptionBrokerAgent: no consensus broker for consumer group [{}]", + consumerGroupId); + } + } + + if (allEvents.isEmpty() + && Objects.isNull(pipeBroker) + && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) { final String errorMessage = - String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - // TODO: currently we fetch messages from all topics - final String consumerId = consumerConfig.getConsumerId(); - return broker.poll(consumerId, topicNames, maxBytes); + + return allEvents; } public List pollTsFile( @@ -72,16 +120,18 @@ public List pollTsFile( final SubscriptionCommitContext commitContext, final long writingOffset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // TsFile polling can only be called by pipe-based subscriptions + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + "Subscription: pipe broker bound to consumer group [%s] does not exist", + consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTsFile(consumerId, commitContext, writingOffset); + return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset); } public List pollTablets( @@ 
-89,16 +139,26 @@ public List pollTablets( final SubscriptionCommitContext commitContext, final int offset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final String topicName = commitContext.getTopicName(); + + // Try consensus-based broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.pollTablets(consumerId, commitContext, offset); + } + + // Fall back to pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTablets(consumerId, commitContext, offset); + return pipeBroker.pollTablets(consumerId, commitContext, offset); } /** @@ -109,46 +169,98 @@ public List commit( final List commitContexts, final boolean nack) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allSuccessful = new ArrayList<>(); + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + + if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) { final String errorMessage = - 
String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.commit(consumerId, commitContexts, nack); + + // Partition commit contexts by which broker owns the topic. + final List pipeContexts = new ArrayList<>(); + final List consensusContexts = new ArrayList<>(); + for (final SubscriptionCommitContext ctx : commitContexts) { + final String topicName = ctx.getTopicName(); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusContexts.add(ctx); + } else { + pipeContexts.add(ctx); + } + } + + if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) { + allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack)); + } + if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) { + allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack)); + } + + return allSuccessful; } public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String consumerGroupId = commitContext.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String topicName = commitContext.getTopicName(); + + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.isCommitContextOutdated(commitContext); + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return true; } - return 
broker.isCommitContextOutdated(commitContext); + return pipeBroker.isCommitContextOutdated(commitContext); } public List fetchTopicNamesToUnsubscribe( final ConsumerConfig consumerConfig, final Set topicNames) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + + // Consensus-based subscription topics are unbounded streams, so they do not trigger + // auto-unsubscribe. + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + final Set pipeOnlyTopicNames; + if (Objects.nonNull(consensusBroker)) { + pipeOnlyTopicNames = new java.util.HashSet<>(topicNames); + pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue); + } else { + pipeOnlyTopicNames = topicNames; + } + + if (pipeOnlyTopicNames.isEmpty()) { + return Collections.emptyList(); + } + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return Collections.emptyList(); } - return broker.fetchTopicNamesToUnsubscribe(topicNames); + return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames); } /////////////////////////////// broker /////////////////////////////// public boolean isBrokerExist(final String consumerGroupId) { - return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId); + return consumerGroupIdToPipeBroker.containsKey(consumerGroupId) + || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId); } public void createBrokerIfNotExist(final String consumerGroupId) { - consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); - LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId); + consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); + LOGGER.info("Subscription: create pipe broker bound to consumer 
group [{}]", consumerGroupId); } /** @@ -156,26 +268,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) { */ public boolean dropBroker(final String consumerGroupId) { final AtomicBoolean dropped = new AtomicBoolean(false); - consumerGroupIdToSubscriptionBroker.compute( + + // Drop pipe broker + consumerGroupIdToPipeBroker.compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { + dropped.set(true); + return null; + } + if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not exist", + "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); - dropped.set(true); + return broker; + } + dropped.set(true); + LOGGER.info( + "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId); + return null; + }); + + // Drop consensus broker + consumerGroupIdToConsensusBroker.compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { return null; } if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] is not empty when dropping", + "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); return broker; } dropped.set(true); - LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId); - return null; // remove this entry + LOGGER.info( + "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId); + return null; }); + return dropped.get(); } @@ -183,15 +315,14 @@ public boolean dropBroker(final String consumerGroupId) { public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { final String consumerGroupId = subtask.getConsumerGroupId(); - consumerGroupIdToSubscriptionBroker + consumerGroupIdToPipeBroker .compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { LOGGER.info( - "Subscription: broker bound to consumer group [{}] does not exist, 
create new for binding prefetching queue", + "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", consumerGroupId); - // TODO: consider more robust metadata semantics return new SubscriptionBroker(consumerGroupId); } return broker; @@ -200,41 +331,105 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { prefetchingQueueCount.invalidate(); } - public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); + public void bindConsensusPrefetchingQueue( + final String consumerGroupId, + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex) { + consumerGroupIdToConsensusBroker + .compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { + LOGGER.info( + "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue", + consumerGroupId); + return new ConsensusSubscriptionBroker(consumerGroupId); + } + return broker; + }) + .bindConsensusPrefetchingQueue( + topicName, consensusGroupId, serverImpl, converter, commitManager, startSearchIndex); + prefetchingQueueCount.invalidate(); + } + + public void unbindConsensusPrefetchingQueue( + final String consumerGroupId, final String topicName) { + final ConsensusSubscriptionBroker broker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); if (Objects.isNull(broker)) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); + "Subscription: consensus broker bound to consumer group [{}] does not exist", + consumerGroupId); return; } - broker.updateCompletedTopicNames(topicName); + 
broker.unbindConsensusPrefetchingQueue(topicName); + prefetchingQueueCount.invalidate(); + } + + public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { + LOGGER.warn( + "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId); + return; + } + pipeBroker.updateCompletedTopicNames(topicName); } public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.unbindPrefetchingQueue(topicName); + pipeBroker.unbindPrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public void removePrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final 
SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.removePrefetchingQueue(topicName); + pipeBroker.removePrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public boolean executePrefetch(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.executePrefetch(topicName); + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { SubscriptionDataNodeResourceManager.log() .schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName) .ifPresent( @@ -244,17 +439,24 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN consumerGroupId)); return false; } - return broker.executePrefetch(topicName); + return pipeBroker.executePrefetch(topicName); } public int getPipeEventCount(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.getEventCount(topicName); + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if 
(Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return 0; } - return broker.getPipeEventCount(topicName); + return pipeBroker.getPipeEventCount(topicName); } public int getPrefetchingQueueCount() { @@ -262,9 +464,15 @@ public int getPrefetchingQueueCount() { } private int getPrefetchingQueueCountInternal() { - return consumerGroupIdToSubscriptionBroker.values().stream() - .map(SubscriptionBroker::getPrefetchingQueueCount) - .reduce(0, Integer::sum); + int count = + consumerGroupIdToPipeBroker.values().stream() + .map(SubscriptionBroker::getPrefetchingQueueCount) + .reduce(0, Integer::sum); + count += + consumerGroupIdToConsensusBroker.values().stream() + .map(ConsensusSubscriptionBroker::getQueueCount) + .reduce(0, Integer::sum); + return count; } /////////////////////////////// Cache /////////////////////////////// @@ -272,14 +480,15 @@ private int getPrefetchingQueueCountInternal() { /** * A simple generic cache that computes and stores a value on demand. * - *

Note that since the get() and invalidate() methods are not modified with synchronized, the - * value obtained may not be entirely accurate. + *

Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The + * {@code get()} method uses a local snapshot of {@code valid} to avoid double-read reordering. + * Concurrent recomputation by multiple threads is benign (idempotent supplier). * * @param the type of the cached value */ private static class Cache { - private T value; + private volatile T value; private volatile boolean valid = false; private final Supplier supplier; @@ -304,8 +513,10 @@ private void invalidate() { */ private T get() { if (!valid) { - value = supplier.get(); + final T computed = supplier.get(); + value = computed; valid = true; + return computed; } return value; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java index fee23cf6af4cb..9c54497b6f468 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; @@ -132,11 +133,34 @@ private void handleSingleConsumerGroupMetaChangesInternal( for (final String topicName : topicsUnsubByGroup) { SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName); } + // Tear down consensus-based subscriptions for unsubscribed topics + if (!topicsUnsubByGroup.isEmpty()) { + ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions( + 
consumerGroupId, topicsUnsubByGroup); + } + + // Detect newly subscribed topics (present in new meta but not in old meta) + final Set newlySubscribedTopics = + ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator); + + LOGGER.info( + "Subscription: consumer group [{}] meta change detected, " + + "topicsUnsubByGroup={}, newlySubscribedTopics={}", + consumerGroupId, + topicsUnsubByGroup, + newlySubscribedTopics); // TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the // changes in its fields. consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId); consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator); + + // Set up consensus-based subscription for newly subscribed live-mode topics. + // This must happen after the meta is updated so that the broker can find the topic config. + if (!newlySubscribedTopics.isEmpty()) { + ConsensusSubscriptionSetupHandler.handleNewSubscriptions( + consumerGroupId, newlySubscribedTopics); + } } public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges( @@ -222,4 +246,24 @@ public Set getTopicNamesSubscribedByConsumer( releaseReadLock(); } } + + /** + * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by + * consensus subscription auto-binding when a new DataRegion is created. 
+ */ + public java.util.Map> getAllSubscriptions() { + acquireReadLock(); + try { + final java.util.Map> result = new java.util.HashMap<>(); + for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) { + final Set topics = meta.getSubscribedTopicNames(); + if (!topics.isEmpty()) { + result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics)); + } + } + return result; + } finally { + releaseReadLock(); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java new file mode 100644 index 0000000000000..84d89ef9a8f39 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +/** + * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance + * manages consensus prefetching queues for a single consumer group. + */ +public class ConsensusSubscriptionBroker implements ISubscriptionBroker { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class); + + private final String brokerId; // consumer group id + + /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */ + private final Map> topicNameToConsensusPrefetchingQueues; + + /** Shared commit ID generators per topic. 
*/ + private final Map topicNameToCommitIdGenerator; + + public ConsensusSubscriptionBroker(final String brokerId) { + this.brokerId = brokerId; + this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>(); + this.topicNameToCommitIdGenerator = new ConcurrentHashMap<>(); + } + + @Override + public boolean isEmpty() { + return topicNameToConsensusPrefetchingQueues.isEmpty(); + } + + @Override + public boolean hasQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + return Objects.nonNull(queues) + && !queues.isEmpty() + && queues.stream().anyMatch(q -> !q.isClosed()); + } + + //////////////////////////// poll //////////////////////////// + + @Override + public List poll( + final String consumerId, final Set topicNames, final long maxBytes) { + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, " + + "queueCount={}, maxBytes={}", + brokerId, + consumerId, + topicNames, + topicNameToConsensusPrefetchingQueues.size(), + maxBytes); + + final List eventsToPoll = new ArrayList<>(); + final List eventsToNack = new ArrayList<>(); + long totalSize = 0; + + for (final String topicName : topicNames) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + continue; + } + + // Poll from all region queues for this topic + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + + final SubscriptionEvent event = consensusQueue.poll(consumerId); + if (Objects.isNull(event)) { + continue; + } + + final long currentSize; + try { + currentSize = event.getCurrentResponseSize(); + } catch (final IOException e) { + eventsToNack.add(event); + continue; + } + + eventsToPoll.add(event); + totalSize += currentSize; + + if (totalSize + currentSize > maxBytes) { + break; + } + } + + if (totalSize > maxBytes) { + break; + } + } + + // Nack any 
events that had errors + if (!eventsToNack.isEmpty()) { + commit( + consumerId, + eventsToNack.stream() + .map(SubscriptionEvent::getCommitContext) + .collect(Collectors.toList()), + true); + } + + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}", + brokerId, + consumerId, + eventsToPoll.size(), + eventsToNack.size()); + + return eventsToPoll; + } + + @Override + public List pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return Collections.emptyList(); + } + + // Try each region queue until one returns a match + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + final SubscriptionEvent event = consensusQueue.pollTablets(consumerId, commitContext, offset); + if (Objects.nonNull(event)) { + return Collections.singletonList(event); + } + } + return Collections.emptyList(); + } + + //////////////////////////// commit //////////////////////////// + + @Override + public List commit( + final String consumerId, + final List commitContexts, + final boolean nack) { + final List successfulCommitContexts = new ArrayList<>(); + for (final SubscriptionCommitContext commitContext : commitContexts) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit", + brokerId, + topicName); + continue; + } + + // Try each region queue for this topic (the event belongs to exactly one region). + // Don't warn per-queue miss — only warn if NO queue handled the commit. 
+ boolean handled = false; + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + final boolean success; + if (!nack) { + success = consensusQueue.ackSilent(consumerId, commitContext); + } else { + success = consensusQueue.nackSilent(consumerId, commitContext); + } + if (success) { + successfulCommitContexts.add(commitContext); + handled = true; + break; // committed in the right queue, no need to try others + } + } + if (!handled) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]", + brokerId, + commitContext, + queues.size(), + topicName); + } + } + return successfulCommitContexts; + } + + @Override + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return true; + } + // Any queue that considers it NOT outdated means it's not outdated + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isCommitContextOutdated(commitContext)) { + return false; + } + } + return true; + } + + //////////////////////////// prefetching //////////////////////////// + + @Override + public boolean executePrefetch(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return false; + } + boolean anyPrefetched = false; + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isClosed() && q.executePrefetch()) { + anyPrefetched = true; + } + } + return anyPrefetched; + } + + @Override + public int getEventCount(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues)) { + return 0; + } + return 
queues.stream().mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount).sum(); + } + + @Override + public int getQueueCount() { + return topicNameToConsensusPrefetchingQueues.size(); + } + + //////////////////////////// queue management //////////////////////////// + + public void bindConsensusPrefetchingQueue( + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex) { + // Get or create the list of queues for this topic + final List queues = + topicNameToConsensusPrefetchingQueues.computeIfAbsent( + topicName, k -> new CopyOnWriteArrayList<>()); + + // Check for duplicate region binding + for (final ConsensusPrefetchingQueue existing : queues) { + if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) { + LOGGER.info( + "Subscription: consensus prefetching queue for topic [{}], region [{}] " + + "in consumer group [{}] already exists, skipping", + topicName, + consensusGroupId, + brokerId); + return; + } + } + + // Get or create the shared commit ID generator for this topic + final AtomicLong sharedCommitIdGenerator = + topicNameToCommitIdGenerator.computeIfAbsent(topicName, k -> new AtomicLong(0)); + + final ConsensusPrefetchingQueue consensusQueue = + new ConsensusPrefetchingQueue( + brokerId, + topicName, + consensusGroupId, + serverImpl, + converter, + commitManager, + startSearchIndex, + sharedCommitIdGenerator); + queues.add(consensusQueue); + LOGGER.info( + "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], " + + "consensusGroupId={}, startSearchIndex={}, totalRegionQueues={}", + topicName, + brokerId, + consensusGroupId, + startSearchIndex, + queues.size()); + } + + public void unbindConsensusPrefetchingQueue(final String topicName) { + final List queues = + 
topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist", + topicName, + brokerId); + return; + } + + for (final ConsensusPrefetchingQueue q : queues) { + q.close(); + } + topicNameToConsensusPrefetchingQueues.remove(topicName); + topicNameToCommitIdGenerator.remove(topicName); + LOGGER.info( + "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]", + queues.size(), + topicName, + brokerId); + } + + @Override + public void removeQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.nonNull(queues) && !queues.isEmpty()) { + LOGGER.info( + "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing", + topicName, + brokerId); + unbindConsensusPrefetchingQueue(topicName); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java new file mode 100644 index 0000000000000..aaa88a5f84777 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; + +import java.util.List; +import java.util.Set; + +public interface ISubscriptionBroker { + + List poll(String consumerId, Set topicNames, long maxBytes); + + List pollTablets( + String consumerId, SubscriptionCommitContext commitContext, int offset); + + List commit( + String consumerId, List commitContexts, boolean nack); + + boolean isCommitContextOutdated(SubscriptionCommitContext commitContext); + + boolean executePrefetch(String topicName); + + int getEventCount(String topicName); + + int getQueueCount(); + + void removeQueue(String topicName); + + boolean isEmpty(); + + boolean hasQueue(String topicName); +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java index cc03f7261419b..8f9d05324e905 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java @@ -56,7 +56,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; -public class SubscriptionBroker { +public class SubscriptionBroker implements ISubscriptionBroker { private static final Logger LOGGER = 
LoggerFactory.getLogger(SubscriptionBroker.class); @@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) { .build(consumerId -> new SubscriptionStates()); } + @Override public boolean isEmpty() { return topicNameToPrefetchingQueue.isEmpty() && completedTopicNames.isEmpty() && topicNameToCommitIdGenerator.isEmpty(); } + @Override + public boolean hasQueue(final String topicName) { + final SubscriptionPrefetchingQueue prefetchingQueue = + topicNameToPrefetchingQueue.get(topicName); + return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed(); + } + //////////////////////////// provided for SubscriptionBrokerAgent //////////////////////////// + @Override public List poll( final String consumerId, final Set topicNames, final long maxBytes) { final List eventsToPoll = new ArrayList<>(); @@ -112,9 +121,10 @@ public List poll( // Iterate over each sorted topic name and poll the corresponding events int remainingTopicSize = sortedTopicNames.size(); for (final String topicName : sortedTopicNames) { + remainingTopicSize -= 1; + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); - remainingTopicSize -= 1; // Recheck if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) { @@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames( final List eventsToPoll /* output parameter */) { final Set candidateTopicNames = new HashSet<>(); for (final String topicName : topicNames) { + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); // If there is no prefetching queue for the topic, check if it's completed @@ -271,6 +282,7 @@ public List pollTsFile( return Collections.emptyList(); } + @Override public List pollTablets( final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { final String topicName = commitContext.getTopicName(); @@ -312,6 +324,7 @@ public List 
pollTablets( /** * @return list of successful commit contexts */ + @Override public List commit( final String consumerId, final List commitContexts, @@ -348,6 +361,7 @@ public List commit( return successfulCommitContexts; } + @Override public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String topicName = commitContext.getTopicName(); final SubscriptionPrefetchingQueue prefetchingQueue = @@ -457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) { brokerId); } + @Override + public void removeQueue(final String topicName) { + removePrefetchingQueue(topicName); + } + public void removePrefetchingQueue(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) { topicNameToCommitIdGenerator.remove(topicName); } + @Override public boolean executePrefetch(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) { : prefetchingQueue.executePrefetchV2(); } + @Override + public int getEventCount(final String topicName) { + return getPipeEventCount(topicName); + } + public int getPipeEventCount(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -525,6 +550,11 @@ public int getPipeEventCount(final String topicName) { return prefetchingQueue.getPipeEventCount(); } + @Override + public int getQueueCount() { + return getPrefetchingQueueCount(); + } + public int getPrefetchingQueueCount() { return topicNameToPrefetchingQueue.size(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java new file mode 100644 index 0000000000000..fbde6cee8c2fe --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. 
*/ +public class ConsensusLogToTabletConverter { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class); + + private final TreePattern treePattern; + private final TablePattern tablePattern; + + /** + * The actual database name of the DataRegion this converter processes (table-model format without + * "root." prefix). Null for tree-model topics. + */ + private final String databaseName; + + public ConsensusLogToTabletConverter( + final TreePattern treePattern, final TablePattern tablePattern, final String databaseName) { + this.treePattern = treePattern; + this.tablePattern = tablePattern; + this.databaseName = databaseName; + } + + public String getDatabaseName() { + return databaseName; + } + + static String safeDeviceIdForLog(final InsertNode node) { + try { + final Object deviceId = node.getDeviceID(); + return deviceId != null ? deviceId.toString() : "null"; + } catch (final Exception e) { + return "N/A(" + node.getType() + ")"; + } + } + + public List convert(final InsertNode insertNode) { + if (Objects.isNull(insertNode)) { + return Collections.emptyList(); + } + + final PlanNodeType nodeType = insertNode.getType(); + if (nodeType == null) { + LOGGER.warn("InsertNode type is null, skipping conversion"); + return Collections.emptyList(); + } + + LOGGER.debug( + "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}", + nodeType, + safeDeviceIdForLog(insertNode)); + + switch (nodeType) { + case INSERT_ROW: + return convertInsertRowNode((InsertRowNode) insertNode); + case INSERT_TABLET: + return convertInsertTabletNode((InsertTabletNode) insertNode); + case INSERT_ROWS: + return convertInsertRowsNode((InsertRowsNode) insertNode); + case INSERT_ROWS_OF_ONE_DEVICE: + return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode); + case INSERT_MULTI_TABLET: + return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode); + case RELATIONAL_INSERT_ROW: + return 
convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode); + case RELATIONAL_INSERT_TABLET: + return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode); + case RELATIONAL_INSERT_ROWS: + return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode); + default: + LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType); + return Collections.emptyList(); + } + } + + // ======================== Tree Model Conversion ======================== + + private List convertInsertRowNode(final InsertRowNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final long time = node.getTime(); + + // Determine which columns match the pattern + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + // Build Tablet with matched columns + final int columnCount = matchedColumnIndices.size(); + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = matchedColumnIndices.get(i); + final Object value = values[originalColIdx]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value); + } + } + tablet.setRowSize(1); + + return 
Collections.singletonList(tablet); + } + + private List convertInsertTabletNode(final InsertTabletNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + // Column filtering + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + // Build Tablet with all rows + final int columnCount = matchedColumnIndices.size(); + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + final Tablet tablet = new Tablet(deviceId.toString(), schemas, rowCount); + + for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) { + tablet.addTimestamp(rowIdx, times[rowIdx]); + + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + final int originalColIdx = matchedColumnIndices.get(colIdx); + final boolean isNull = + (bitMaps != null + && bitMaps[originalColIdx] != null + && bitMaps[originalColIdx].isMarked(rowIdx)); + + if (isNull) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[colIdx].mark(rowIdx); + } else { + copyColumnValue( + tablet, rowIdx, colIdx, dataTypes[originalColIdx], columns[originalColIdx], rowIdx); + } + } + } + tablet.setRowSize(rowCount); + + return Collections.singletonList(tablet); + } + + private List convertInsertRowsNode(final InsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : 
node.getInsertRowNodeList()) { + // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden, + // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode + // children. Dispatch correctly by checking the actual child type. + if (rowNode instanceof RelationalInsertRowNode) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } else { + tablets.addAll(convertInsertRowNode(rowNode)); + } + } + return tablets; + } + + private List convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertInsertRowNode(rowNode)); + } + return tablets; + } + + private List convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) { + tablets.addAll(convertInsertTabletNode(tabletNode)); + } + return tablets; + } + + // ======================== Table Model Conversion ======================== + + private List convertRelationalInsertRowNode(final RelationalInsertRowNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final long time = node.getTime(); + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); + } + + final Tablet 
tablet = new Tablet(tableName != null ? tableName : "", schemas, 1); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final Object value = values[i]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[i], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); + } + + final Tablet tablet = new Tablet(tableName != null ? 
tableName : "", schemas, rowCount); + + for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) { + tablet.addTimestamp(rowIdx, times[rowIdx]); + + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + final boolean isNull = + (bitMaps != null && bitMaps[colIdx] != null && bitMaps[colIdx].isMarked(rowIdx)); + + if (isNull) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[colIdx].mark(rowIdx); + } else { + copyColumnValue(tablet, rowIdx, colIdx, dataTypes[colIdx], columns[colIdx], rowIdx); + } + } + } + tablet.setRowSize(rowCount); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } + return tablets; + } + + // ======================== Helper Methods ======================== + + /** + * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all + * column indices are returned. + */ + private List getMatchedTreeColumnIndices( + final IDeviceID deviceId, final String[] measurements) { + if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) { + // All columns match + final List allIndices = new ArrayList<>(measurements.length); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null) { + allIndices.add(i); + } + } + return allIndices; + } + + final List matchedIndices = new ArrayList<>(); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) { + matchedIndices.add(i); + } + } + return matchedIndices; + } + + /** + * Adds a single value to the tablet at the specified position. + * + *

IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which + * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly + * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call + * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is + * NOT null. + */ + private void addValueToTablet( + final Tablet tablet, + final int rowIndex, + final int columnIndex, + final TSDataType dataType, + final Object value) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value; + break; + case FLOAT: + ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value; + break; + case DOUBLE: + ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value; + break; + default: + LOGGER.warn("Unsupported data type: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. + // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null. + final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[columnIndex] != null) { + bitMaps[columnIndex].unmark(rowIndex); + } + } + + /** Copies a single column value from the source column array to the tablet. 
*/ + private void copyColumnValue( + final Tablet tablet, + final int targetRowIndex, + final int targetColumnIndex, + final TSDataType dataType, + final Object sourceColumn, + final int sourceRowIndex) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((boolean[]) sourceColumn)[sourceRowIndex]; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((int[]) sourceColumn)[sourceRowIndex]; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((long[]) sourceColumn)[sourceRowIndex]; + break; + case FLOAT: + ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((float[]) sourceColumn)[sourceRowIndex]; + break; + case DOUBLE: + ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((double[]) sourceColumn)[sourceRowIndex]; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((Binary[]) sourceColumn)[sourceRowIndex]; + break; + default: + LOGGER.warn("Unsupported data type for copy: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. 
+ final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[targetColumnIndex] != null) { + bitMaps[targetColumnIndex].unmark(targetRowIndex); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java new file mode 100644 index 0000000000000..28743d1aae73c --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java @@ -0,0 +1,1179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.write.record.Tablet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import 
java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.LongSupplier; + +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; + +/** + * A prefetching queue that reads data from IoTConsensus using a hybrid approach: + * + *

    + *
  1. In-memory pending queue: Registered with {@link IoTConsensusServerImpl}, receives + * {@link IndexedConsensusRequest} in real-time from the write path (same mechanism as + * LogDispatcher). This avoids waiting for WAL flush to disk. + *
  2. WAL fallback: Uses {@link ConsensusReqReader.ReqIterator} to read from WAL files for + * gap-filling (pending queue overflow) or catch-up scenarios. + *
  3. WAL pinning: Supplies the earliest outstanding (uncommitted) search index to {@link + * IoTConsensusServerImpl}, preventing WAL deletion of entries not yet consumed by the + * subscription. + *
+ * + *

A background prefetch thread continuously drains the pending queue, converts InsertNode + * entries to Tablets via {@link ConsensusLogToTabletConverter}, and enqueues {@link + * SubscriptionEvent} objects into the prefetchingQueue for consumer polling. + * + *

This design mirrors LogDispatcher's dual-path (pendingEntries + WAL reader) but targets + * subscription delivery instead of replication. + * + *

Thread safety: Uses a fair {@link ReentrantReadWriteLock} to ensure mutual exclusion between + * cleanup and other operations (poll, ack, nack), consistent with the existing prefetching queue + * design. + */ +public class ConsensusPrefetchingQueue { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class); + + private final String brokerId; // consumer group id + private final String topicName; + private final String consensusGroupId; + + private final IoTConsensusServerImpl serverImpl; + + private final ConsensusReqReader consensusReqReader; + + private volatile ConsensusReqReader.ReqIterator reqIterator; + + /** + * In-memory pending queue registered with {@link IoTConsensusServerImpl#write}. Receives + * IndexedConsensusRequest in real-time without waiting for WAL flush. Capacity is bounded to + * apply back-pressure; overflows are filled from WAL. + */ + private final BlockingQueue pendingEntries; + + private static final int PENDING_QUEUE_CAPACITY = 4096; + + private final ConsensusLogToTabletConverter converter; + + private final ConsensusSubscriptionCommitManager commitManager; + + /** + * Cached LongSupplier instance for WAL pinning registration. Must be the SAME object reference + * for both registerSubscriptionQueue and unregisterSubscriptionQueue, because + * CopyOnWriteArrayList.remove() uses equals() which defaults to reference equality for lambdas. + * Using this::method would create a new lambda instance each time, causing remove() to fail and + * WAL to be pinned indefinitely. + */ + private final LongSupplier walPinSupplier; + + /** Commit ID generator, monotonically increasing within this queue's lifetime. */ + private final AtomicLong commitIdGenerator; + + /** Records the initial commit ID for outdated event detection. 
*/ + private final long initialCommitId; + + private final AtomicLong nextExpectedSearchIndex; + + private final PriorityBlockingQueue prefetchingQueue; + + /** + * Tracks in-flight events that have been polled but not yet committed. Key: (consumerId, + * commitContext) -> event. + */ + private final Map, SubscriptionEvent> inFlightEvents; + + /** + * Tracks outstanding (uncommitted) events for WAL pinning. Maps commitId to the startSearchIndex + * of that event batch. The earliest entry's value is supplied to IoTConsensusServerImpl to pin + * WAL files from deletion. + */ + private final ConcurrentSkipListMap outstandingCommitIdToStartIndex; + + private static final int MAX_TABLETS_PER_EVENT = 64; + + private static final int MAX_WAL_ENTRIES_PER_PREFETCH = 128; + + private static final int MAX_PREFETCHING_QUEUE_SIZE = 256; + + private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); + + private volatile boolean isClosed = false; + + /** + * Background thread that drains pendingEntries and fills prefetchingQueue. 
TODO: manage thread + * count + */ + private final Thread prefetchThread; + + public ConsensusPrefetchingQueue( + final String brokerId, + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex, + final AtomicLong sharedCommitIdGenerator) { + this.brokerId = brokerId; + this.topicName = topicName; + this.consensusGroupId = consensusGroupId; + this.serverImpl = serverImpl; + this.consensusReqReader = serverImpl.getConsensusReqReader(); + this.converter = converter; + this.commitManager = commitManager; + + this.commitIdGenerator = sharedCommitIdGenerator; + this.initialCommitId = commitIdGenerator.get(); + this.nextExpectedSearchIndex = new AtomicLong(startSearchIndex); + this.reqIterator = consensusReqReader.getReqIterator(startSearchIndex); + + this.prefetchingQueue = new PriorityBlockingQueue<>(); + this.inFlightEvents = new ConcurrentHashMap<>(); + this.outstandingCommitIdToStartIndex = new ConcurrentSkipListMap<>(); + + // Create and register the in-memory pending queue with IoTConsensusServerImpl. + // IMPORTANT: walPinSupplier is stored as a field (not a method reference) to ensure the + // same object reference is used for both register and unregister. 
+ this.pendingEntries = new ArrayBlockingQueue<>(PENDING_QUEUE_CAPACITY); + this.walPinSupplier = this::getEarliestOutstandingSearchIndex; + serverImpl.registerSubscriptionQueue(pendingEntries, walPinSupplier); + + // Start background prefetch thread + this.prefetchThread = + new Thread(this::prefetchLoop, "ConsensusPrefetch-" + brokerId + "-" + topicName); + this.prefetchThread.setDaemon(true); + this.prefetchThread.start(); + + LOGGER.info( + "ConsensusPrefetchingQueue created: brokerId={}, topicName={}, consensusGroupId={}, " + + "startSearchIndex={}", + brokerId, + topicName, + consensusGroupId, + startSearchIndex); + } + + /** + * Returns the earliest outstanding (uncommitted) search index for WAL pinning. If there are no + * outstanding events, returns the next expected search index (nothing to pin beyond what we've + * already processed). + */ + private long getEarliestOutstandingSearchIndex() { + final Map.Entry first = outstandingCommitIdToStartIndex.firstEntry(); + if (first != null) { + return first.getValue(); + } + return nextExpectedSearchIndex.get(); + } + + // ======================== Lock Operations ======================== + + private void acquireReadLock() { + lock.readLock().lock(); + } + + private void releaseReadLock() { + lock.readLock().unlock(); + } + + private void acquireWriteLock() { + lock.writeLock().lock(); + } + + private void releaseWriteLock() { + lock.writeLock().unlock(); + } + + // ======================== Poll ======================== + + public SubscriptionEvent poll(final String consumerId) { + acquireReadLock(); + try { + return isClosed ? null : pollInternal(consumerId); + } finally { + releaseReadLock(); + } + } + + private SubscriptionEvent pollInternal(final String consumerId) { + // Recycle any uncommitted in-flight events for this consumer before serving new data. 
+ final int recycled = recycleInFlightEventsForConsumer(consumerId); + if (recycled > 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: recycled {} uncommitted in-flight events for " + + "consumer {} back to prefetching queue", + this, + recycled, + consumerId); + } + + final long size = prefetchingQueue.size(); + if (size == 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, " + + "pendingEntriesSize={}, nextExpected={}, isClosed={}, threadAlive={}", + this, + consumerId, + pendingEntries.size(), + nextExpectedSearchIndex.get(), + isClosed, + prefetchThread.isAlive()); + return null; + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}", + this, + size, + consumerId); + long count = 0; + + SubscriptionEvent event; + try { + while (count++ < size + && Objects.nonNull( + event = + prefetchingQueue.poll( + SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(), + TimeUnit.MILLISECONDS))) { + if (event.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it", + this, + event); + continue; + } + + if (!event.pollable()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it", + this, + event); + event.nack(); + continue; + } + + // Mark as polled before updating inFlightEvents + event.recordLastPolledTimestamp(); + inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event); + event.recordLastPolledConsumerId(consumerId); + return event; + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e); + } + + return null; + } + + public SubscriptionEvent pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + acquireReadLock(); + try { + if (isClosed) { + return null; + } + final 
SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext)); + if (Objects.isNull(event)) { + if (isCommitContextOutdated(commitContext)) { + return generateOutdatedErrorResponse(); + } + return generateErrorResponse( + String.format( + "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s", + this, consumerId, commitContext)); + } + return event; + } finally { + releaseReadLock(); + } + } + + // ======================== Background Prefetch ======================== + + public boolean executePrefetch() { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + // Recycle pollable events from inFlightEvents back to prefetchingQueue + recycleInFlightEvents(); + return !prefetchingQueue.isEmpty(); + } finally { + releaseReadLock(); + } + } + + private static final long PENDING_DRAIN_TIMEOUT_MS = 200; + + private static final long WAL_WAIT_TIMEOUT_SECONDS = 2; + + /** + * Background prefetch loop. Continuously drains from pendingEntries (in-memory, real-time), + * detects gaps and fills from WAL reader, converts to Tablets, and enqueues SubscriptionEvents. 
+ */ + private void prefetchLoop() { + LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread started", this); + try { + while (!isClosed && !Thread.currentThread().isInterrupted()) { + try { + // Back-pressure: wait if prefetchingQueue is full + if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + Thread.sleep(50); + continue; + } + + // Try to drain from pending entries (in-memory, fast path) + final List batch = new ArrayList<>(); + // Block briefly for first entry + final IndexedConsensusRequest first = + pendingEntries.poll(PENDING_DRAIN_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (first != null) { + batch.add(first); + // Drain more non-blocking + int drained = 0; + IndexedConsensusRequest next; + while (drained < MAX_WAL_ENTRIES_PER_PREFETCH - 1 + && (next = pendingEntries.poll()) != null) { + batch.add(next); + drained++; + } + } + + if (!batch.isEmpty()) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, " + + "first searchIndex={}, last searchIndex={}, nextExpected={}, " + + "prefetchingQueueSize={}", + this, + batch.size(), + batch.get(0).getSearchIndex(), + batch.get(batch.size() - 1).getSearchIndex(), + nextExpectedSearchIndex.get(), + prefetchingQueue.size()); + processBatchFromPending(batch); + } else { + // Pending queue was empty - try catch-up from WAL for any gaps + // (entries may have been dropped due to pending queue overflow) + tryCatchUpFromWAL(); + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } catch (final Throwable t) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: CRITICAL error in prefetch loop " + + "(type={}, message={})", + this, + t.getClass().getName(), + t.getMessage(), + t); + if (t instanceof Error) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: caught Error in prefetch loop, " + + "will attempt to continue", + this); + } + try { + Thread.sleep(100); + } catch (final InterruptedException ie) { + 
Thread.currentThread().interrupt(); + break; + } + } + } + } catch (final Throwable fatal) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: FATAL uncaught throwable escaped prefetch loop " + + "(type={}, message={})", + this, + fatal.getClass().getName(), + fatal.getMessage(), + fatal); + } + LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread stopped", this); + } + + private void processBatchFromPending(final List batch) { + final List batchedTablets = new ArrayList<>(); + long batchStartSearchIndex = nextExpectedSearchIndex.get(); + long batchEndSearchIndex = batchStartSearchIndex; + int processedCount = 0; + int skippedCount = 0; + int nullDeserCount = 0; + int emptyConvertCount = 0; + + for (final IndexedConsensusRequest request : batch) { + final long searchIndex = request.getSearchIndex(); + + // Detect gap: if searchIndex > nextExpected, entries were dropped from pending queue. + // Fill the gap from WAL. + final long expected = nextExpectedSearchIndex.get(); + if (searchIndex > expected) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: gap detected, expected={}, got={}. 
" + + "Filling {} entries from WAL.", + this, + expected, + searchIndex, + searchIndex - expected); + fillGapFromWAL(expected, searchIndex, batchedTablets); + } + + if (searchIndex < nextExpectedSearchIndex.get()) { + // Already processed (e.g., gap fill covered this entry), skip + skippedCount++; + continue; + } + + // Process this entry + final InsertNode insertNode = deserializeToInsertNode(request); + if (insertNode != null) { + final List tablets = converter.convert(insertNode); + if (!tablets.isEmpty()) { + batchedTablets.addAll(tablets); + batchEndSearchIndex = searchIndex; + processedCount++; + } else { + emptyConvertCount++; + LOGGER.debug( + "ConsensusPrefetchingQueue {}: converter returned empty tablets for " + + "searchIndex={}, insertNodeType={}, deviceId={}", + this, + searchIndex, + insertNode.getType(), + ConsensusLogToTabletConverter.safeDeviceIdForLog(insertNode)); + } + } else { + nullDeserCount++; + LOGGER.warn( + "ConsensusPrefetchingQueue {}: deserializeToInsertNode returned null for " + + "searchIndex={}, requestType={}", + this, + searchIndex, + request.getRequests().isEmpty() + ? 
"EMPTY" + : request.getRequests().get(0).getClass().getSimpleName()); + } + nextExpectedSearchIndex.set(searchIndex + 1); + + // Flush batch if large enough + if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) { + createAndEnqueueEvent( + new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex); + batchedTablets.clear(); + // Reset start index for the next sub-batch so that + // outstandingCommitIdToStartIndex records the correct WAL pin position + batchStartSearchIndex = nextExpectedSearchIndex.get(); + } + } + + // Update WAL reader position to stay in sync + syncReqIteratorPosition(); + + // Flush remaining tablets + if (!batchedTablets.isEmpty()) { + createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex); + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: batch processing complete, " + + "batchSize={}, processed={}, skipped={}, nullDeser={}, emptyConvert={}, " + + "tabletsCreated={}, nextExpected={}, prefetchQueueSize={}", + this, + batch.size(), + processedCount, + skippedCount, + nullDeserCount, + emptyConvertCount, + batchedTablets.size(), + nextExpectedSearchIndex.get(), + prefetchingQueue.size()); + } + + /** + * Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected + * between nextExpectedSearchIndex and an incoming entry's searchIndex. 
+ */ + private void fillGapFromWAL( + final long fromIndex, final long toIndex, final List batchedTablets) { + // Re-position WAL reader to the gap start + reqIterator = consensusReqReader.getReqIterator(fromIndex); + + while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { + try { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + if (walIndex < nextExpectedSearchIndex.get()) { + continue; // already processed + } + + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + final List tablets = converter.convert(insertNode); + batchedTablets.addAll(tablets); + } + nextExpectedSearchIndex.set(walIndex + 1); + } catch (final Exception e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}", + this, + nextExpectedSearchIndex.get(), + e); + break; + } + } + + // If WAL doesn't have the gap entries yet (still in memory buffer), wait briefly + if (nextExpectedSearchIndex.get() < toIndex) { + try { + reqIterator.waitForNextReady(WAL_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS); + while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + if (walIndex < nextExpectedSearchIndex.get()) { + continue; + } + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + final List tablets = converter.convert(insertNode); + batchedTablets.addAll(tablets); + } + nextExpectedSearchIndex.set(walIndex + 1); + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (final TimeoutException e) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: timeout waiting for WAL gap fill [{}, {})", + this, + nextExpectedSearchIndex.get(), + toIndex); + } + } + } + + /** + * Try catch-up from WAL when the pending queue was empty. 
This handles cold-start or scenarios + * where the subscription started after data was already written. + */ + private void tryCatchUpFromWAL() { + // Re-position WAL reader + syncReqIteratorPosition(); + + if (!reqIterator.hasNext()) { + // No data on disk either - nothing to do + return; + } + + final List batchedTablets = new ArrayList<>(); + long batchStartSearchIndex = nextExpectedSearchIndex.get(); + long batchEndSearchIndex = batchStartSearchIndex; + int entriesRead = 0; + + while (entriesRead < MAX_WAL_ENTRIES_PER_PREFETCH + && reqIterator.hasNext() + && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) { + try { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + entriesRead++; + + if (walIndex < nextExpectedSearchIndex.get()) { + continue; + } + + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + final List tablets = converter.convert(insertNode); + if (!tablets.isEmpty()) { + batchedTablets.addAll(tablets); + batchEndSearchIndex = walIndex; + } + } + nextExpectedSearchIndex.set(walIndex + 1); + + if (batchedTablets.size() >= MAX_TABLETS_PER_EVENT) { + createAndEnqueueEvent( + new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex); + batchedTablets.clear(); + // Reset start index for the next sub-batch + batchStartSearchIndex = nextExpectedSearchIndex.get(); + } + } catch (final Exception e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error reading WAL for catch-up", this, e); + break; + } + } + + if (!batchedTablets.isEmpty()) { + createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex); + } + + if (entriesRead > 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: WAL catch-up read {} entries, " + + "nextExpectedSearchIndex={}", + this, + entriesRead, + nextExpectedSearchIndex.get()); + } + } + + /** + * Re-positions the WAL reader to the current nextExpectedSearchIndex. 
Called before reading from + * WAL to ensure the iterator is in sync with tracking position. + */ + private void syncReqIteratorPosition() { + reqIterator = consensusReqReader.getReqIterator(nextExpectedSearchIndex.get()); + } + + /** + * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an + * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers), + * and a single logical write may be split across multiple fragments (SearchNode). This method + * handles both cases. + * + *

The deserialization follows the same pattern as {@code + * DataRegionStateMachine.grabPlanNode()}. + */ + private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) { + final List searchNodes = new ArrayList<>(); + PlanNode nonSearchNode = null; + + for (final IConsensusRequest req : indexedRequest.getRequests()) { + PlanNode planNode; + try { + if (req instanceof IoTConsensusRequest) { + // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer) + planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer()); + } else if (req instanceof InsertNode) { + // In-memory entries (not yet flushed to WAL file) may already be PlanNode + planNode = (PlanNode) req; + } else { + // ByteBufferConsensusRequest or unknown + planNode = PlanNodeType.deserialize(req.serializeToByteBuffer()); + } + } catch (final Exception e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest " + + "(type={}) in searchIndex={}: {}", + this, + req.getClass().getSimpleName(), + indexedRequest.getSearchIndex(), + e.getMessage(), + e); + continue; + } + + if (planNode instanceof SearchNode) { + ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex()); + searchNodes.add((SearchNode) planNode); + } else { + nonSearchNode = planNode; + } + } + + // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode) + if (!searchNodes.isEmpty()) { + final PlanNode merged = searchNodes.get(0).merge(searchNodes); + if (merged instanceof InsertNode) { + final InsertNode mergedInsert = (InsertNode) merged; + LOGGER.debug( + "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, " + + "type={}, deviceId={}, searchNodeCount={}", + this, + indexedRequest.getSearchIndex(), + mergedInsert.getType(), + ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert), + searchNodes.size()); + + return mergedInsert; + } + } + + if (nonSearchNode != null) { + 
LOGGER.debug( + "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}", + this, + indexedRequest.getSearchIndex(), + nonSearchNode.getClass().getSimpleName()); + } + + return null; + } + + private void createAndEnqueueEvent( + final List tablets, final long startSearchIndex, final long endSearchIndex) { + if (tablets.isEmpty()) { + return; + } + + final long commitId = commitIdGenerator.getAndIncrement(); + + // Record the mapping from commitId to the end searchIndex + // so that when the client commits, we know which WAL position has been consumed + commitManager.recordCommitMapping( + brokerId, topicName, consensusGroupId, commitId, endSearchIndex); + + // Track outstanding event for WAL pinning + outstandingCommitIdToStartIndex.put(commitId, startSearchIndex); + + final SubscriptionCommitContext commitContext = + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + commitId); + + // nextOffset <= 0 means all tablets delivered in single batch + // -tablets.size() indicates total count + // Use Map> constructor with actual database name for table model; + final TabletsPayload payload = + new TabletsPayload( + Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size()); + + final SubscriptionEvent event = + new SubscriptionEvent( + SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext); + + prefetchingQueue.add(event); + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, " + + "searchIndex range [{}, {}], commitId={}, prefetchQueueSize={}", + this, + tablets.size(), + startSearchIndex, + endSearchIndex, + commitId, + prefetchingQueue.size()); + } + + // ======================== Commit (Ack/Nack) ======================== + + public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return 
!isClosed && ackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + private boolean ackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final AtomicBoolean acked = new AtomicBoolean(false); + final long commitId = commitContext.getCommitId(); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack", + this, + commitContext); + return null; + } + + if (ev.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext); + ev.cleanUp(false); + return null; + } + + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + + ev.cleanUp(false); + return null; + }); + + if (acked.get()) { + commitManager.commit(brokerId, topicName, consensusGroupId, commitId); + outstandingCommitIdToStartIndex.remove(commitId); + } + + return acked.get(); + } + + public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return !isClosed && nackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + /** + * Silent version of ack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. 
+ */ + public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + final AtomicBoolean acked = new AtomicBoolean(false); + final long commitId = commitContext.getCommitId(); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + ev.cleanUp(false); + return null; + }); + if (acked.get()) { + commitManager.commit(brokerId, topicName, consensusGroupId, commitId); + outstandingCommitIdToStartIndex.remove(commitId); + } + return acked.get(); + } finally { + releaseReadLock(); + } + } + + /** + * Silent version of nack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. + */ + public boolean nackSilent( + final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + ev.nack(); + nacked.set(true); + prefetchingQueue.add(ev); + return null; + }); + return nacked.get(); + } finally { + releaseReadLock(); + } + } + + private boolean nackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack", + this, + commitContext); + return null; + } + + ev.nack(); + nacked.set(true); + prefetchingQueue.add(ev); + return null; + }); + + return 
nacked.get(); + } + + // ======================== Recycle ======================== + + /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue. */ + private void recycleInFlightEvents() { + for (final Pair key : + new ArrayList<>(inFlightEvents.keySet())) { + inFlightEvents.compute( + key, + (k, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + if (ev.pollable()) { + ev.nack(); + prefetchingQueue.add(ev); + LOGGER.debug( + "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue", + this, + ev); + return null; + } + return ev; + }); + } + } + + /** + * Maximum number of nack cycles before an in-flight event is kept in place rather than + * re-enqueued. Prevents infinite re-delivery loops when a consumer repeatedly polls without + * committing. Beyond this threshold, the event stays in inFlightEvents and will eventually be + * recycled by the timeout-based {@link #recycleInFlightEvents()} when it becomes pollable. + */ + private static final long MAX_CONSUMER_RECYCLE_NACK_COUNT = 10; + + /** + * Recycles uncommitted in-flight events belonging to the given consumer back to the prefetching + * queue. This provides at-least-once delivery: when a consumer polls again without committing, + * the previously delivered events are nacked and re-queued for re-delivery. + * + *

Events that have been nacked more than {@link #MAX_CONSUMER_RECYCLE_NACK_COUNT} times are + * left in-flight to avoid infinite re-delivery loops. They will be cleaned up by the periodic + * timeout-based recycler instead. + * + * @return the number of events recycled + */ + private int recycleInFlightEventsForConsumer(final String consumerId) { + final AtomicInteger count = new AtomicInteger(0); + for (final Pair key : + new ArrayList<>(inFlightEvents.keySet())) { + if (!key.getLeft().equals(consumerId)) { + continue; + } + inFlightEvents.compute( + key, + (k, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + // If the event has been nacked too many times, leave it and let the timeout recycler + // handle it. + if (ev.getNackCount() >= MAX_CONSUMER_RECYCLE_NACK_COUNT) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: event {} for consumer {} exceeded max nack " + + "count ({}), skipping recycle to prevent infinite loop", + this, + ev, + consumerId, + MAX_CONSUMER_RECYCLE_NACK_COUNT); + return ev; // keep in inFlightEvents + } + ev.nack(); + prefetchingQueue.add(ev); + count.incrementAndGet(); + LOGGER.debug( + "ConsensusPrefetchingQueue {}: recycled uncommitted event {} for consumer {} " + + "back to prefetching queue", + this, + ev, + consumerId); + return null; + }); + } + return count.get(); + } + + // ======================== Cleanup ======================== + + public void cleanUp() { + acquireWriteLock(); + try { + prefetchingQueue.forEach(event -> event.cleanUp(true)); + prefetchingQueue.clear(); + + inFlightEvents.values().forEach(event -> event.cleanUp(true)); + inFlightEvents.clear(); + } finally { + releaseWriteLock(); + } + } + + public void close() { + markClosed(); + // Stop background prefetch thread + prefetchThread.interrupt(); + try { + prefetchThread.join(5000); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } + // Unregister from 
IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL). + serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier); + cleanUp(); + // Persist progress before closing + commitManager.persistAll(); + } + + private SubscriptionEvent generateErrorResponse(final String errorMessage) { + return new SubscriptionEvent( + SubscriptionPollResponseType.ERROR.getType(), + new ErrorPayload(errorMessage, false), + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + INVALID_COMMIT_ID)); + } + + private SubscriptionEvent generateOutdatedErrorResponse() { + return new SubscriptionEvent( + SubscriptionPollResponseType.ERROR.getType(), + ErrorPayload.OUTDATED_ERROR_PAYLOAD, + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + INVALID_COMMIT_ID)); + } + + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + return PipeDataNodeAgent.runtime().getRebootTimes() > commitContext.getRebootTimes() + || initialCommitId > commitContext.getCommitId(); + } + + // ======================== Status ======================== + + public boolean isClosed() { + return isClosed; + } + + public void markClosed() { + isClosed = true; + } + + public String getPrefetchingQueueId() { + return brokerId + "_" + topicName; + } + + public long getSubscriptionUncommittedEventCount() { + return inFlightEvents.size(); + } + + public long getCurrentCommitId() { + return commitIdGenerator.get(); + } + + public int getPrefetchedEventCount() { + return prefetchingQueue.size(); + } + + public long getCurrentReadSearchIndex() { + return nextExpectedSearchIndex.get(); + } + + public String getBrokerId() { + return brokerId; + } + + public String getTopicName() { + return topicName; + } + + public String getConsensusGroupId() { + return 
consensusGroupId; + } + + // ======================== Stringify ======================== + + public Map coreReportMessage() { + final Map result = new HashMap<>(); + result.put("brokerId", brokerId); + result.put("topicName", topicName); + result.put("consensusGroupId", consensusGroupId); + result.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get())); + result.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size())); + result.put("inFlightEventsSize", String.valueOf(inFlightEvents.size())); + result.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size())); + result.put("pendingEntriesSize", String.valueOf(pendingEntries.size())); + result.put("commitIdGenerator", commitIdGenerator.toString()); + result.put("isClosed", String.valueOf(isClosed)); + return result; + } + + @Override + public String toString() { + return "ConsensusPrefetchingQueue" + coreReportMessage(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java new file mode 100644 index 0000000000000..4096394ad6a33 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Manages commit state for consensus-based subscriptions. + * + *

This manager tracks which events have been committed by consumers and maps commit IDs back to + * WAL search indices. It maintains the progress for each (consumerGroup, topic, region) triple and + * supports persistence and recovery. + * + *

Progress is tracked per-region because searchIndex is region-local — each DataRegion + * has its own independent WAL with its own searchIndex namespace. Using a single state per topic + * would cause TreeSet deduplication bugs when different regions emit the same searchIndex value. + * + *

Key responsibilities: + * + *

    + *
  • Track the mapping from commitId to searchIndex + *
  • Handle commit/ack from consumers + *
  • Persist and recover progress state + *
+ */ +public class ConsensusSubscriptionCommitManager { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class); + + private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_"; + private static final String PROGRESS_FILE_SUFFIX = ".dat"; + + /** Key: "consumerGroupId_topicName_regionId" -> progress tracking state */ + private final Map commitStates = + new ConcurrentHashMap<>(); + + private final String persistDir; + + private ConsensusSubscriptionCommitManager() { + this.persistDir = + IoTDBDescriptor.getInstance().getConfig().getSystemDir() + + File.separator + + "subscription" + + File.separator + + "consensus_progress"; + final File dir = new File(persistDir); + if (!dir.exists()) { + dir.mkdirs(); + } + } + + /** + * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @return the commit state + */ + public ConsensusSubscriptionCommitState getOrCreateState( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + return commitStates.computeIfAbsent( + key, + k -> { + // Try to recover from persisted state + final ConsensusSubscriptionCommitState recovered = tryRecover(key); + if (recovered != null) { + return recovered; + } + return new ConsensusSubscriptionCommitState(new SubscriptionConsensusProgress(0L, 0L)); + }); + } + + /** + * Records commitId to searchIndex mapping for later commit handling. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @param commitId the assigned commit ID + * @param searchIndex the WAL search index corresponding to this event + */ + public void recordCommitMapping( + final String consumerGroupId, + final String topicName, + final String regionId, + final long commitId, + final long searchIndex) { + final ConsensusSubscriptionCommitState state = + getOrCreateState(consumerGroupId, topicName, regionId); + state.recordMapping(commitId, searchIndex); + } + + /** + * Handles commit (ack) for an event. Updates the progress and potentially advances the committed + * search index. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @param commitId the committed event's commit ID + * @return true if commit handled successfully + */ + public boolean commit( + final String consumerGroupId, + final String topicName, + final String regionId, + final long commitId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, commitId={}", + consumerGroupId, + topicName, + regionId, + commitId); + return false; + } + final boolean success = state.commit(commitId); + if (success) { + // Periodically persist progress + persistProgressIfNeeded(key, state); + } + return success; + } + + /** + * Gets the current committed search index for a specific region's state. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @return the committed search index, or -1 if no state exists + */ + public long getCommittedSearchIndex( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + return -1; + } + return state.getCommittedSearchIndex(); + } + + /** + * Removes state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + */ + public void removeState( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + commitStates.remove(key); + // Clean up persisted file + final File file = getProgressFile(key); + if (file.exists()) { + file.delete(); + } + } + + /** + * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during + * subscription teardown when the individual regionIds may not be readily available. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + */ + public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) { + final String prefix = consumerGroupId + "_" + topicName + "_"; + final Iterator> it = + commitStates.entrySet().iterator(); + while (it.hasNext()) { + final Map.Entry entry = it.next(); + if (entry.getKey().startsWith(prefix)) { + it.remove(); + final File file = getProgressFile(entry.getKey()); + if (file.exists()) { + file.delete(); + } + } + } + } + + /** Persists all states. Should be called during graceful shutdown. 
*/ + public void persistAll() { + for (final Map.Entry entry : + commitStates.entrySet()) { + persistProgress(entry.getKey(), entry.getValue()); + } + } + + // ======================== Helper Methods ======================== + + private String generateKey( + final String consumerGroupId, final String topicName, final String regionId) { + return consumerGroupId + "_" + topicName + "_" + regionId; + } + + private File getProgressFile(final String key) { + return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX); + } + + private ConsensusSubscriptionCommitState tryRecover(final String key) { + final File file = getProgressFile(key); + if (!file.exists()) { + return null; + } + try (final FileInputStream fis = new FileInputStream(file)) { + final byte[] bytes = new byte[(int) file.length()]; + fis.read(bytes); + final ByteBuffer buffer = ByteBuffer.wrap(bytes); + return ConsensusSubscriptionCommitState.deserialize(buffer); + } catch (final IOException e) { + LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e); + return null; + } + } + + private void persistProgressIfNeeded( + final String key, final ConsensusSubscriptionCommitState state) { + // Persist every 100 commits to reduce disk IO + if (state.getProgress().getCommitIndex() % 100 == 0) { + persistProgress(key, state); + } + } + + private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) { + final File file = getProgressFile(key); + try (final FileOutputStream fos = new FileOutputStream(file); + final DataOutputStream dos = new DataOutputStream(fos)) { + state.serialize(dos); + dos.flush(); + } catch (final IOException e) { + LOGGER.warn("Failed to persist consensus subscription progress to {}", file, e); + } + } + + // ======================== Inner State Class ======================== + + /** + * Tracks commit state for a single (consumerGroup, topic, region) triple. 
Maintains the mapping + * from commitId to searchIndex and tracks committed progress within one region's WAL. + */ + public static class ConsensusSubscriptionCommitState { + + private final SubscriptionConsensusProgress progress; + + /** + * Maps commitId -> searchIndex. Records which WAL search index corresponds to each committed + * event. Entries are removed once committed. + */ + private final Map commitIdToSearchIndex = new ConcurrentHashMap<>(); + + /** + * Tracks the safe recovery position: the highest search index where all prior dispatched events + * have been committed. Only advances contiguously — never jumps over uncommitted gaps. + */ + private volatile long committedSearchIndex; + + /** + * Tracks the maximum search index among all committed events (may be ahead of + * committedSearchIndex when out-of-order commits exist). Used to update committedSearchIndex + * once all outstanding events are committed. + */ + private long maxCommittedSearchIndex; + + /** + * Tracks search indices of dispatched but not-yet-committed events. Used to prevent + * committedSearchIndex from jumping over uncommitted gaps. On commit, the frontier advances to + * min(outstanding) - 1 (or maxCommittedSearchIndex if empty). + * + *

Since state is now per-region, searchIndex values within this set are guaranteed unique + * (they come from a single region's monotonically increasing WAL searchIndex). + */ + private final TreeSet outstandingSearchIndices = new TreeSet<>(); + + public ConsensusSubscriptionCommitState(final SubscriptionConsensusProgress progress) { + this.progress = progress; + this.committedSearchIndex = progress.getSearchIndex(); + this.maxCommittedSearchIndex = progress.getSearchIndex(); + } + + public SubscriptionConsensusProgress getProgress() { + return progress; + } + + public long getCommittedSearchIndex() { + return committedSearchIndex; + } + + /** Threshold for warning about outstanding (uncommitted) search indices accumulation. */ + private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000; + + public void recordMapping(final long commitId, final long searchIndex) { + commitIdToSearchIndex.put(commitId, searchIndex); + synchronized (this) { + outstandingSearchIndices.add(searchIndex); + final int size = outstandingSearchIndices.size(); + if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: outstandingSearchIndices size ({}) exceeds " + + "threshold ({}), consumers may not be committing. committedSearchIndex={}, " + + "maxCommittedSearchIndex={}, commitIdToSearchIndex size={}", + size, + OUTSTANDING_SIZE_WARN_THRESHOLD, + committedSearchIndex, + maxCommittedSearchIndex, + commitIdToSearchIndex.size()); + } + } + } + + /** + * Commits the specified event and advances the committed search index contiguously. + * + *

The committed search index only advances to a position where all prior dispatched events + * have been committed. This prevents the recovery position from jumping over uncommitted gaps, + * ensuring at-least-once delivery even after crash recovery. + * + * @param commitId the commit ID to commit + * @return true if successfully committed + */ + public boolean commit(final long commitId) { + final Long searchIndex = commitIdToSearchIndex.remove(commitId); + if (searchIndex == null) { + LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId); + return false; + } + + progress.incrementCommitIndex(); + + // Advance committed search index contiguously (gap-aware) + synchronized (this) { + outstandingSearchIndices.remove(searchIndex); + if (searchIndex > maxCommittedSearchIndex) { + maxCommittedSearchIndex = searchIndex; + } + + if (outstandingSearchIndices.isEmpty()) { + // All dispatched events have been committed — advance to the max + committedSearchIndex = maxCommittedSearchIndex; + } else { + // Advance to just below the earliest uncommitted event + // (never go backward) + committedSearchIndex = + Math.max(committedSearchIndex, outstandingSearchIndices.first() - 1); + } + progress.setSearchIndex(committedSearchIndex); + } + + return true; + } + + public void serialize(final DataOutputStream stream) throws IOException { + progress.serialize(stream); + stream.writeLong(committedSearchIndex); + } + + public static ConsensusSubscriptionCommitState deserialize(final ByteBuffer buffer) { + final SubscriptionConsensusProgress progress = + SubscriptionConsensusProgress.deserialize(buffer); + final ConsensusSubscriptionCommitState state = new ConsensusSubscriptionCommitState(progress); + state.committedSearchIndex = buffer.getLong(); + state.maxCommittedSearchIndex = state.committedSearchIndex; + return state; + } + } + + // ======================== Singleton ======================== + + private static class Holder { + private static final 
ConsensusSubscriptionCommitManager INSTANCE = + new ConsensusSubscriptionCommitManager(); + } + + public static ConsensusSubscriptionCommitManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java new file mode 100644 index 0000000000000..b138dbceef1a2 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.consensus.IConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.consensus.DataRegionConsensusImpl; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Handles the setup and teardown of consensus-based subscription queues on DataNode. When a + * real-time subscription is detected, this handler finds the local IoTConsensus data regions, + * creates the appropriate converter, and binds prefetching queues to the subscription broker. 
+ */ +public class ConsensusSubscriptionSetupHandler { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class); + + private static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig(); + + private ConsensusSubscriptionSetupHandler() { + // utility class + } + + /** + * Ensures that the IoTConsensus new-peer callback is set, so that when a new DataRegion is + * created, all active consensus subscriptions are automatically bound to the new region. + */ + public static void ensureNewRegionListenerRegistered() { + if (IoTConsensus.onNewPeerCreated != null) { + return; + } + IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; + LOGGER.info( + "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); + } + + /** + * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries + * existing subscription metadata to find all active consensus subscriptions and binds prefetching + * queues to the new region. 
+ */ + private static void onNewRegionCreated( + final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) { + if (!(groupId instanceof DataRegionId)) { + return; + } + + // Query existing metadata keepers for all active subscriptions + final Map> allSubscriptions = + SubscriptionAgent.consumer().getAllSubscriptions(); + if (allSubscriptions.isEmpty()) { + return; + } + + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + final long startSearchIndex = serverImpl.getSearchIndex() + 1; + + LOGGER.info( + "New DataRegion {} created, checking {} consumer group(s) for auto-binding, " + + "startSearchIndex={}", + groupId, + allSubscriptions.size(), + startSearchIndex); + + for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) { + final String consumerGroupId = groupEntry.getKey(); + for (final String topicName : groupEntry.getValue()) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + try { + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + continue; + } + + // Resolve the new DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? 
dbRaw.substring(5) : dbRaw; + + // For table topics, skip if this region's database doesn't match the topic filter + if (topicConfig.isTableTopic()) { + final String topicDb = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + if (topicDb != null + && !topicDb.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDb) + && !topicDb.equalsIgnoreCase(dbTableModel)) { + continue; + } + } + + final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + LOGGER.info( + "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} (database={})", + topicName, + consumerGroupId, + groupId, + dbTableModel); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + groupId.toString(), + serverImpl, + converter, + commitManager, + startSearchIndex); + } catch (final Exception e) { + LOGGER.error( + "Failed to auto-bind topic [{}] in group [{}] to new region {}", + topicName, + consumerGroupId, + groupId, + e); + } + } + } + } + + public static boolean isConsensusBasedTopic(final String topicName) { + try { + final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName); + final String topicFormat = SubscriptionAgent.topic().getTopicFormat(topicName); + final boolean result = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + LOGGER.info( + "isConsensusBasedTopic check for topic [{}]: mode={}, format={}, result={}", + topicName, + topicMode, + topicFormat, + result); + return result; + } catch (final Exception e) { + LOGGER.warn( + "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e); + return false; + } + } + + public static void setupConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) 
{ + final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance(); + if (!(dataRegionConsensus instanceof IoTConsensus)) { + LOGGER.warn( + "Data region consensus is not IoTConsensus (actual: {}), " + + "cannot set up consensus-based subscription for consumer group [{}]", + dataRegionConsensus.getClass().getSimpleName(), + consumerGroupId); + return; + } + + // Ensure the new-region listener is registered (idempotent) + ensureNewRegionListenerRegistered(); + + final IoTConsensus ioTConsensus = (IoTConsensus) dataRegionConsensus; + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "Setting up consensus subscriptions for consumer group [{}], topics={}, " + + "total consensus groups={}", + consumerGroupId, + topicNames, + ioTConsensus.getAllConsensusGroupIds().size()); + + for (final String topicName : topicNames) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + + try { + setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager); + } catch (final Exception e) { + LOGGER.error( + "Failed to set up consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + /** + * Set up consensus queue for a single topic. Discovers all local data region consensus groups and + * binds a ConsensusReqReader-based prefetching queue to every matching region. + * + *

For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY} + * filter are bound. For tree-model topics, all local data regions are bound. Additionally, the + * {@link #onNewRegionCreated} callback ensures that regions created after this method runs are + * also automatically bound. + */ + private static void setupConsensusQueueForTopic( + final String consumerGroupId, + final String topicName, + final IoTConsensus ioTConsensus, + final ConsensusSubscriptionCommitManager commitManager) { + + // Get topic config for building the converter + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + LOGGER.warn( + "Topic config not found for topic [{}], cannot set up consensus queue", topicName); + return; + } + + // Build the converter based on topic config (path pattern, time range, tree/table model) + LOGGER.info( + "Setting up consensus queue for topic [{}]: isTableTopic={}, config={}", + topicName, + topicConfig.isTableTopic(), + topicConfig.getAttribute()); + + // For table topics, extract the database filter from topic config + final String topicDatabaseFilter = + topicConfig.isTableTopic() + ? 
topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE) + : null; + + final List allGroupIds = ioTConsensus.getAllConsensusGroupIds(); + LOGGER.info( + "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}", + allGroupIds.size(), + topicName, + consumerGroupId, + allGroupIds); + boolean bound = false; + + for (final ConsensusGroupId groupId : allGroupIds) { + if (!(groupId instanceof DataRegionId)) { + continue; + } + + final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId); + if (serverImpl == null) { + continue; + } + + // Resolve the DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + if (topicDatabaseFilter != null + && !topicDatabaseFilter.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDatabaseFilter) + && !topicDatabaseFilter.equalsIgnoreCase(dbTableModel)) { + LOGGER.info( + "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})", + groupId, + dbTableModel, + topicName, + topicDatabaseFilter); + continue; + } + + final String actualDbName = topicConfig.isTableTopic() ? 
dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + final long startSearchIndex = serverImpl.getSearchIndex() + 1; + + LOGGER.info( + "Binding consensus prefetching queue for topic [{}] in consumer group [{}] " + + "to data region consensus group [{}] (database={}), startSearchIndex={}", + topicName, + consumerGroupId, + groupId, + dbTableModel, + startSearchIndex); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + groupId.toString(), + serverImpl, + converter, + commitManager, + startSearchIndex); + + bound = true; + } + + if (!bound) { + LOGGER.warn( + "No local IoTConsensus data region found for topic [{}] in consumer group [{}]. " + + "Consensus subscription will be set up when a matching data region becomes available.", + topicName, + consumerGroupId); + } + } + + private static ConsensusLogToTabletConverter buildConverter( + final TopicConfig topicConfig, final String actualDatabaseName) { + // Determine tree or table model + final boolean isTableTopic = topicConfig.isTableTopic(); + + TreePattern treePattern = null; + TablePattern tablePattern = null; + + if (isTableTopic) { + // Table model: database + table name pattern + final String database = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + final String table = + topicConfig.getStringOrDefault( + TopicConstant.TABLE_KEY, TopicConstant.TABLE_DEFAULT_VALUE); + tablePattern = new TablePattern(true, database, table); + } else { + // Tree model: path or pattern + if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) { + final String pattern = topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY); + treePattern = new PrefixTreePattern(pattern); + } else { + final String path = + topicConfig.getStringOrDefault( + TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE); + treePattern = new IoTDBTreePattern(path); + } + } 
+ + return new ConsensusLogToTabletConverter(treePattern, tablePattern, actualDatabaseName); + } + + public static void teardownConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + for (final String topicName : topicNames) { + try { + SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName); + + // Clean up commit state for all regions of this topic + ConsensusSubscriptionCommitManager.getInstance() + .removeAllStatesForTopic(consumerGroupId, topicName); + + LOGGER.info( + "Tore down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId); + } catch (final Exception e) { + LOGGER.warn( + "Failed to tear down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + public static void handleNewSubscriptions( + final String consumerGroupId, final Set newTopicNames) { + if (newTopicNames == null || newTopicNames.isEmpty()) { + return; + } + + LOGGER.info( + "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}", + consumerGroupId, + newTopicNames); + + setupConsensusSubscriptions(consumerGroupId, newTopicNames); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java new file mode 100644 index 0000000000000..0bd526e8dbaa0 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +/** + * Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region) + * combination. + * + *

Since searchIndex is region-local (each DataRegion has its own independent WAL and searchIndex + * namespace), progress is tracked per-region: + * + *

    + *
  • searchIndex: The committed WAL search index — the highest position where all prior + * dispatched events have been acknowledged. Used as the recovery start point after crash. + *
  • commitIndex: Monotonically increasing count of committed events. Used for + * persistence throttling and diagnostics. + *
+ */ +public class SubscriptionConsensusProgress { + + private long searchIndex; + + private long commitIndex; + + public SubscriptionConsensusProgress() { + this(0L, 0L); + } + + public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) { + this.searchIndex = searchIndex; + this.commitIndex = commitIndex; + } + + public long getSearchIndex() { + return searchIndex; + } + + public void setSearchIndex(final long searchIndex) { + this.searchIndex = searchIndex; + } + + public long getCommitIndex() { + return commitIndex; + } + + public void setCommitIndex(final long commitIndex) { + this.commitIndex = commitIndex; + } + + public void incrementCommitIndex() { + this.commitIndex++; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(searchIndex, stream); + ReadWriteIOUtils.write(commitIndex, stream); + } + + public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) { + final long searchIndex = ReadWriteIOUtils.readLong(buffer); + final long commitIndex = ReadWriteIOUtils.readLong(buffer); + return new SubscriptionConsensusProgress(searchIndex, commitIndex); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o; + return searchIndex == that.searchIndex && commitIndex == that.commitIndex; + } + + @Override + public int hashCode() { + return Objects.hash(searchIndex, commitIndex); + } + + @Override + public String toString() { + return "SubscriptionConsensusProgress{" + + "searchIndex=" + + searchIndex + + ", commitIndex=" + + commitIndex + + '}'; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java index 
dfadee5908fa5..9ede61fbffe74 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java @@ -248,6 +248,11 @@ public void nack() { } } + /** Returns the current nack count for this event. */ + public long getNackCount() { + return nackCount.get(); + } + public void recordLastPolledConsumerId(final String consumerId) { lastPolledConsumerId = consumerId; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java index c7e7fea8d12f8..9e9c898e3c064 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java @@ -30,7 +30,7 @@ public class SubscriptionConfig { private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); public boolean getSubscriptionEnabled() { - return false; + return true; // TODO: make it configurable after subscription is stable } public float getSubscriptionCacheMemoryUsagePercentage() { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java index 4393ef8a6cf61..9f66b48210bc2 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java @@ -115,6 +115,26 @@ private boolean shouldRecordSubscriptionCreationTime() { return unsubscribedTopicNames; } + public static Set 
getTopicsNewlySubByGroup( + final ConsumerGroupMeta currentMeta, final ConsumerGroupMeta updatedMeta) { + if (!Objects.equals(currentMeta.consumerGroupId, updatedMeta.consumerGroupId) + || !Objects.equals(currentMeta.creationTime, updatedMeta.creationTime)) { + return Collections.emptySet(); + } + + final Set newlySubscribedTopicNames = new HashSet<>(); + updatedMeta + .topicNameToSubscribedConsumerIdSet + .keySet() + .forEach( + topicName -> { + if (!currentMeta.topicNameToSubscribedConsumerIdSet.containsKey(topicName)) { + newlySubscribedTopicNames.add(topicName); + } + }); + return newlySubscribedTopicNames; + } + /////////////////////////////// consumer /////////////////////////////// public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta) @@ -171,6 +191,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) { ////////////////////////// subscription ////////////////////////// + /** Get all topic names subscribed by this consumer group. */ + public Set getSubscribedTopicNames() { + return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet()); + } + /** * Get the consumers subscribing the given topic in this group. 
* From 36e3491dbce10884c570bef2fa7bc902aff938a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E5=AD=90=E5=9D=A4?= <55695098+DanielWang2035@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:59:10 +0800 Subject: [PATCH 2/2] fix some issues --- .../iotdb/ConsensusSubscriptionTableTest.java | 985 +++++++-------- .../iotdb/ConsensusSubscriptionTest.java | 1062 +++++++---------- .../iotdb/consensus/iot/IoTConsensus.java | 19 + .../consensus/iot/IoTConsensusServerImpl.java | 2 +- .../iot/logdispatcher/LogDispatcher.java | 12 +- .../agent/SubscriptionBrokerAgent.java | 18 +- .../broker/ConsensusSubscriptionBroker.java | 29 +- .../ConsensusLogToTabletConverter.java | 135 ++- .../consensus/ConsensusPrefetchingQueue.java | 122 +- .../ConsensusSubscriptionCommitManager.java | 29 +- .../ConsensusSubscriptionSetupHandler.java | 70 +- .../SubscriptionConsensusProgress.java | 32 +- 12 files changed, 1221 insertions(+), 1294 deletions(-) diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java index 6c1da0199f663..ade06c96e6f8d 100644 --- a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java @@ -44,6 +44,10 @@ import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; /** TODO: Move these manual tests into ITs */ public class ConsensusSubscriptionTableTest { @@ -63,50 +67,32 @@ public static void main(String[] args) throws Exception { String targetTest = args.length > 0 ? 
args[0] : null; - if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) { - runTest("testBasicDataDelivery", ConsensusSubscriptionTableTest::testBasicDataDelivery); + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTableTest::testBasicFlow); } - if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) { - runTest("testMultipleDataTypes", ConsensusSubscriptionTableTest::testMultipleDataTypes); + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTableTest::testDataTypes); } - if (targetTest == null || "testTableLevelFiltering".equals(targetTest)) { - runTest("testTableLevelFiltering", ConsensusSubscriptionTableTest::testTableLevelFiltering); - } - if (targetTest == null || "testDatabaseLevelFiltering".equals(targetTest)) { - runTest( - "testDatabaseLevelFiltering", ConsensusSubscriptionTableTest::testDatabaseLevelFiltering); + if (targetTest == null || "testPathFiltering".equals(targetTest)) { + runTest("testPathFiltering", ConsensusSubscriptionTableTest::testPathFiltering); } if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { runTest( "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion); } - if (targetTest == null || "testMultipleTablesAggregation".equals(targetTest)) { - runTest( - "testMultipleTablesAggregation", - ConsensusSubscriptionTableTest::testMultipleTablesAggregation); - } - if (targetTest == null || "testMultiColumnTypes".equals(targetTest)) { - runTest("testMultiColumnTypes", ConsensusSubscriptionTableTest::testMultiColumnTypes); + if (targetTest == null || "testRedelivery".equals(targetTest)) { + runTest("testRedelivery", ConsensusSubscriptionTableTest::testRedelivery); } - if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) { - runTest("testPollWithoutCommit", ConsensusSubscriptionTableTest::testPollWithoutCommit); + if 
(targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTableTest::testMultiEntityIsolation); } - if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) { + if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) { runTest( - "testMultiConsumerGroupIndependent", - ConsensusSubscriptionTableTest::testMultiConsumerGroupIndependent); + "testBurstWriteGapRecovery", ConsensusSubscriptionTableTest::testBurstWriteGapRecovery); } - if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) { + if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) { runTest( - "testMultiTopicSubscription", ConsensusSubscriptionTableTest::testMultiTopicSubscription); - } - if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) { - runTest("testFlushDataDelivery", ConsensusSubscriptionTableTest::testFlushDataDelivery); - } - if (targetTest == null || "testCrossPartitionMultiWrite".equals(targetTest)) { - runTest( - "testCrossPartitionMultiWrite", - ConsensusSubscriptionTableTest::testCrossPartitionMultiWrite); + "testCommitAfterUnsubscribe", ConsensusSubscriptionTableTest::testCommitAfterUnsubscribe); } // Summary @@ -459,14 +445,20 @@ private static void assertAtLeast(String msg, int min, int actual) { } } - // ============================ - // Test 1: Basic Data Delivery - // ============================ + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiTables + Flush) + // ====================================================================== /** - * Verifies the basic consensus subscription flow with table model: write before subscribe (not - * received), write after subscribe (received), and no extra data beyond expectation. + * Verifies: + * + *
    + *
  • Data written BEFORE subscribe is NOT received + *
  • Multiple tables (t1, t2, t3) written AFTER subscribe are all received + *
  • Flush does not cause data loss (WAL pinning keeps entries available) + *
  • Exact row count matches expectation + *
*/ - private static void testBasicDataDelivery() throws Exception { + private static void testBasicFlow() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -474,18 +466,19 @@ private static void testBasicDataDelivery() throws Exception { ISubscriptionTablePullConsumer consumer = null; try { - // Step 1: Write initial data to create DataRegion + // Step 1: Write initial data to create DataRegion (should NOT be received) System.out.println(" Step 1: Writing initial data (should NOT be received)"); try (ITableSession session = openTableSession()) { - createDatabaseAndTable( - session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD, s2 DOUBLE FIELD"); + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)"); for (int i = 0; i < 50; i++) { session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)", - i * 10, i * 1.5, i)); + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); } + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -499,44 +492,60 @@ private static void testBasicDataDelivery() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - // Step 3: Write new data AFTER subscription - System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)"); + // Step 3: Write to 3 tables (30 rows each = 90 total), then flush + System.out.println(" Step 3: Writing 30 rows x 3 tables AFTER subscribe, then flush"); try 
(ITableSession session = openTableSession()) { session.executeNonQueryStatement("USE " + database); - for (int i = 100; i < 200; i++) { + for (int i = 100; i < 130; i++) { session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s1, s2, time) VALUES ('d1', %d, %f, %d)", - i * 10, i * 1.5, i)); + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Step 4: Poll and verify exact count + // Step 4: Poll and verify System.out.println(" Step 4: Polling..."); - PollResult result = pollUntilComplete(consumer, 100, 100); + PollResult result = pollUntilComplete(consumer, 90, 100); System.out.println(" Result: " + result); - assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows); + assertEquals("Expected exactly 90 rows (30 per table)", 90, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + System.out.println(" Rows per table: " + result.rowsPerTable); + for (String tbl : new String[] {"t1", "t2", "t3"}) { + Integer tblRows = result.rowsPerTable.get(tbl); + assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? 
tblRows : 0); + } + } } finally { cleanup(consumer, topicName, database); } } - // ============================ - // Test 2: Multiple Data Types - // ============================ + // ====================================================================== + // Test 2: Data Types (merged: MultipleDataTypes + MultiColumnTypes + CrossPartition) + // ====================================================================== /** - * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using - * separate INSERT statements per type (one field per INSERT), and verifies all types are - * delivered. + * Verifies: + * + *
    + *
  • Non-aligned: 6 data types via separate INSERTs + *
  • All-column: 6 fields in a single INSERT + *
  • Cross-partition: timestamps >1 week apart via SQL, Tablet methods + *
*/ - private static void testMultipleDataTypes() throws Exception { + private static void testDataTypes() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); String consumerId = nextConsumerId(); ISubscriptionTablePullConsumer consumer = null; + final long GAP = 604_800_001L; // slightly over 1 week try { try (ITableSession session = openTableSession()) { @@ -548,9 +557,10 @@ private static void testMultipleDataTypes() throws Exception { + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " + "s_text TEXT FIELD"); session.executeNonQueryStatement("USE " + database); - // Write initial row to create DataRegion + // Init row to force DataRegion creation session.executeNonQueryStatement( - "INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', 0, 0)"); + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -562,9 +572,12 @@ private static void testMultipleDataTypes() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing data with 6 data types x 20 rows each"); + int totalExpected = 0; try (ITableSession session = openTableSession()) { session.executeNonQueryStatement("USE " + database); + + // --- Part A: 6 data types x 20 rows, separate INSERTs --- + System.out.println(" Part A: 6 data types x 20 rows (separate INSERTs)"); for (int i = 1; i <= 20; i++) { session.executeNonQueryStatement( String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i)); @@ -586,94 +599,115 @@ private static void testMultipleDataTypes() throws Exception { String.format( "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i)); } - } - Thread.sleep(2000); + totalExpected += 120; // 6 types x 20 rows - System.out.println(" Polling..."); - PollResult result = 
pollUntilComplete(consumer, 120, 120); - System.out.println(" Result: " + result); + // --- Part B: All-column rows (50 rows) --- + System.out.println(" Part B: 50 all-column rows"); + for (int i = 21; i <= 70; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)" + + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)", + i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i)); + } + totalExpected += 50; - assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows); - System.out.println(" Seen columns: " + result.seenColumns); - assertTrue( - "Expected multiple column types in result, got: " + result.seenColumns, - result.seenColumns.size() > 1); - } finally { - cleanup(consumer, topicName, database); - } - } + // --- Part C: Cross-partition writes --- + System.out.println(" Part C: Cross-partition (SQL single, multi, Tablet)"); + long baseTs = 1_000_000_000L; - // ============================ - // Test 3: Table-Level Filtering - // ============================ - /** - * Creates a topic that only matches table "t1" via TABLE_KEY. Verifies that data written to t2 is - * NOT delivered. 
- */ - private static void testTableLevelFiltering() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - ISubscriptionTablePullConsumer consumer = null; + // SQL single-row x2 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'xp_single_1', %d)", + baseTs)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'xp_single_2', %d)", + baseTs + GAP)); + totalExpected += 2; - try { - try (ITableSession session = openTableSession()) { - createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); - session.executeNonQueryStatement("USE " + database); - session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); - session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); + // SQL multi-row x3 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'xp_multi_1', %d), " + + "('d1', 4, 400, 4.4, 4.44, false, 'xp_multi_2', %d), " + + "('d1', 5, 500, 5.5, 5.55, true, 'xp_multi_3', %d)", + baseTs + GAP * 2, baseTs + GAP * 3, baseTs + GAP * 4)); + totalExpected += 3; - // Topic matches only table t1 - createTopicTable(topicName, database, "t1"); - Thread.sleep(1000); + // Tablet x4 + List schemaList = new ArrayList<>(); + schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING)); + schemaList.add(new 
MeasurementSchema("s_int32", TSDataType.INT32)); + schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING)); - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); + List categories = + java.util.Arrays.asList( + ColumnCategory.TAG, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD); - System.out.println(" Writing to both t1 and t2 (topic filter: t1 only)"); - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - for (int i = 100; i < 150; i++) { - session.executeNonQueryStatement( - String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); - session.executeNonQueryStatement( - String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + Tablet tablet = + new Tablet( + "t1", + IMeasurementSchema.getMeasurementNameList(schemaList), + IMeasurementSchema.getDataTypeList(schemaList), + categories, + 10); + for (int i = 0; i < 4; i++) { + int row = tablet.getRowSize(); + long ts = baseTs + GAP * (5 + i); + tablet.addTimestamp(row, ts); + tablet.addValue("tag1", row, "d1"); + tablet.addValue("s_int32", row, 6 + i); + tablet.addValue("s_int64", row, (long) (600 + i * 100)); + tablet.addValue("s_float", row, (6 + i) * 1.1f); + tablet.addValue("s_double", row, (6 + i) * 2.22); + tablet.addValue("s_bool", row, i % 2 == 0); + tablet.addValue("s_text", row, "xp_tablet_" + (i + 1)); } + session.insert(tablet); + totalExpected += 4; } + + System.out.println(" Total expected rows: " + totalExpected); Thread.sleep(2000); - 
System.out.println(" Polling (expecting only t1 data)..."); - PollResult result = pollUntilComplete(consumer, 50, 60); + PollResult result = pollUntilComplete(consumer, totalExpected, 200); System.out.println(" Result: " + result); - assertEquals("Expected exactly 50 rows from t1 only", 50, result.totalRows); - if (!result.rowsPerTable.isEmpty()) { - Integer t2Rows = result.rowsPerTable.get("t2"); - assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0); - Integer t1Rows = result.rowsPerTable.get("t1"); - assertAtLeast("Expected t1 rows", 1, t1Rows != null ? t1Rows : 0); - System.out.println( - " Table filtering verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows"); - } + assertAtLeast( + "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows); + assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size()); } finally { cleanup(consumer, topicName, database); } } - // ============================ - // Test 4: Database-Level Filtering - // ============================ + // ====================================================================== + // Test 3: Path Filtering (merged: TableLevel + DatabaseLevel) + // ====================================================================== /** - * Creates a topic that only matches database db1 via DATABASE_KEY. Verifies that data written to - * db2 is NOT delivered. + * Verifies: + * + *
    + *
  • Table-level: topic on table=t1 does NOT deliver t2 data + *
  • Database-level: topic on db1 does NOT deliver db2 data + *
*/ - private static void testDatabaseLevelFiltering() throws Exception { + private static void testPathFiltering() throws Exception { String database1 = nextDatabase(); String database2 = database1 + "_other"; String topicName = nextTopic(); @@ -683,77 +717,68 @@ private static void testDatabaseLevelFiltering() throws Exception { try { try (ITableSession session = openTableSession()) { + // db1 with t1 and t2 createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); - createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database1); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + // db2 with t1 + createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database2); session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Topic matches only database1 - createTopicTable(topicName, database1, ".*"); + // Topic: only db1, only table t1 + createTopicTable(topicName, database1, "t1"); Thread.sleep(1000); consumer = createConsumer(consumerId, consumerGroupId); consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println( - " Writing to both " - + database1 - + " and " - + database2 - + " (topic filter: " - + database1 - + " only)"); + System.out.println(" Writing to db1.t1, db1.t2, db2.t1 (topic filter: db1.t1 only)"); try (ITableSession session = openTableSession()) { session.executeNonQueryStatement("USE " + database1); for (int i = 100; i < 150; i++) { session.executeNonQueryStatement( String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + 
session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); } session.executeNonQueryStatement("USE " + database2); for (int i = 100; i < 150; i++) { session.executeNonQueryStatement( - String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); } } Thread.sleep(2000); - System.out.println(" Polling (expecting only " + database1 + " data)..."); + System.out.println(" Polling (expecting only db1.t1 data = 50 rows)..."); PollResult result = pollUntilComplete(consumer, 50, 60); System.out.println(" Result: " + result); - assertEquals("Expected exactly 50 rows from " + database1 + " only", 50, result.totalRows); + assertEquals("Expected exactly 50 rows from db1.t1 only", 50, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + Integer t2Rows = result.rowsPerTable.get("t2"); + assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0); + System.out.println(" Table filtering verified: t1 only"); + } if (!result.rowsPerDatabase.isEmpty()) { Integer db2Rows = result.rowsPerDatabase.get(database2); - assertTrue( - "Expected NO rows from " + database2 + ", but got " + db2Rows, - db2Rows == null || db2Rows == 0); - Integer db1Rows = result.rowsPerDatabase.get(database1); - assertAtLeast("Expected " + database1 + " rows", 1, db1Rows != null ? 
db1Rows : 0); - System.out.println( - " Database filtering verified: " - + database1 - + "=" - + db1Rows - + " rows, " - + database2 - + "=" - + db2Rows - + " rows"); + assertTrue("Expected NO rows from " + database2, db2Rows == null || db2Rows == 0); + System.out.println(" Database filtering verified: " + database1 + " only"); } } finally { cleanup(consumer, topicName, database1, database2); } } - // ============================ - // Test 5: Subscribe Before Region Creation - // ============================ + // ====================================================================== + // Test 4: Subscribe Before Region Creation (kept as-is) + // ====================================================================== /** * Subscribe BEFORE the database/region exists, then create database and write. Tests the * IoTConsensus.onNewPeerCreated auto-binding path with table model. @@ -786,7 +811,7 @@ private static void testSubscribeBeforeRegion() throws Exception { } Thread.sleep(5000); - System.out.println(" Step 4: Polling (auto-binding should have picked up new region)..."); + System.out.println(" Step 4: Polling..."); PollResult result = pollUntilComplete(consumer, 100, 100); System.out.println(" Result: " + result); @@ -805,11 +830,11 @@ private static void testSubscribeBeforeRegion() throws Exception { } } - // ============================ - // Test 6: Multiple Tables Aggregation - // ============================ - /** Writes to t1, t2, t3 and verifies all are received via a broad topic TABLE_KEY. */ - private static void testMultipleTablesAggregation() throws Exception { + // ====================================================================== + // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit) + // ====================================================================== + /** Tests at-least-once delivery with a mixed commit/no-commit pattern. 
*/ + private static void testRedelivery() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -820,11 +845,7 @@ private static void testMultipleTablesAggregation() throws Exception { try (ITableSession session = openTableSession()) { createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database); - session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); - session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)"); session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -836,148 +857,6 @@ private static void testMultipleTablesAggregation() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing to 3 tables (t1, t2, t3), 30 rows each"); - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - for (int i = 100; i < 130; i++) { - session.executeNonQueryStatement( - String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); - session.executeNonQueryStatement( - String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); - session.executeNonQueryStatement( - String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); - } - } - Thread.sleep(2000); - - System.out.println(" Polling (expecting 90 total from 3 tables)..."); - PollResult result = pollUntilComplete(consumer, 90, 100); - System.out.println(" Result: " + result); - - assertEquals("Expected exactly 90 rows total (30 per table)", 90, result.totalRows); - if 
(!result.rowsPerTable.isEmpty()) { - System.out.println(" Rows per table: " + result.rowsPerTable); - for (String tbl : new String[] {"t1", "t2", "t3"}) { - Integer tblRows = result.rowsPerTable.get(tbl); - assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0); - } - } - } finally { - cleanup(consumer, topicName, database); - } - } - - // ============================ - // Test 7: Multi Column Types (Table Model Equivalent of Aligned Timeseries) - // ============================ - /** - * Creates a table with 6 different FIELD types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and - * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are - * delivered correctly. This is the table model equivalent of the aligned timeseries test. - */ - private static void testMultiColumnTypes() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - ISubscriptionTablePullConsumer consumer = null; - - try { - // Create table with multiple field types - try (ITableSession session = openTableSession()) { - createDatabaseAndTable( - session, - database, - "t1", - "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " - + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " - + "s_text TEXT FIELD"); - session.executeNonQueryStatement("USE " + database); - // Write initial row to force DataRegion creation - session.executeNonQueryStatement( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " - + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); - - createTopicTable(topicName, database, ".*"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); - - // Write 50 rows, each with all 6 data types in 
a single INSERT - System.out.println(" Writing 50 rows with 6 data types per row"); - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - for (int i = 1; i <= 50; i++) { - session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)" - + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)", - i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? "true" : "false", i, i)); - } - } - Thread.sleep(2000); - - System.out.println(" Polling..."); - PollResult result = pollUntilComplete(consumer, 50, 70); - System.out.println(" Result: " + result); - - assertEquals("Expected exactly 50 rows with all field types", 50, result.totalRows); - // Verify we see columns for multiple data types - System.out.println(" Seen columns: " + result.seenColumns); - assertAtLeast( - "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); - } finally { - cleanup(consumer, topicName, database); - } - } - - // ============================ - // Test 8: Poll Without Commit (Re-delivery) - // ============================ - /** - * Tests at-least-once delivery with a mixed commit/no-commit pattern. - * - *

Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we - * track committed ROWS (not events). The state machine alternates: - * - *

    - *
  • Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next - * poll verifies the EXACT SAME timestamps are re-delivered, then commit. - *
  • Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data. - *
- * - *

This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal - * commit path in an interleaved fashion. - */ - private static void testPollWithoutCommit() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - ISubscriptionTablePullConsumer consumer = null; - - try { - try (ITableSession session = openTableSession()) { - createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); - session.executeNonQueryStatement("USE " + database); - session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); - - createTopicTable(topicName, database, ".*"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); - - // Write 50 rows final int totalRows = 50; System.out.println(" Writing " + totalRows + " rows"); try (ITableSession session = openTableSession()) { @@ -989,7 +868,6 @@ private static void testPollWithoutCommit() throws Exception { } Thread.sleep(3000); - // State machine: alternate between skip-commit and direct-commit. 
int totalRowsCommitted = 0; int roundNumber = 0; boolean hasPending = false; @@ -1005,7 +883,6 @@ private static void testPollWithoutCommit() throws Exception { } for (SubscriptionMessage msg : msgs) { - // Extract ALL timestamps from this event List currentTimestamps = new ArrayList<>(); for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { while (ds.hasNext()) { @@ -1015,7 +892,6 @@ private static void testPollWithoutCommit() throws Exception { assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0); if (hasPending) { - // === Re-delivery round: verify EXACT same timestamps === assertTrue( "Re-delivery timestamp list mismatch: expected=" + pendingTimestamps @@ -1036,7 +912,6 @@ private static void testPollWithoutCommit() throws Exception { + "] Re-delivered & committed: timestamps=" + currentTimestamps); } else { - // === New event round === if (totalRowsCommitted > 0) { boolean overlap = false; for (Long ts : currentTimestamps) { @@ -1046,12 +921,7 @@ private static void testPollWithoutCommit() throws Exception { } } assertTrue( - "After commit, should receive different data (timestamps=" - + currentTimestamps - + " overlap with committed=" - + allCommittedTimestamps - + ")", - !overlap); + "After commit, should receive different data (overlap detected)", !overlap); } if (roundNumber % 2 == 0) { @@ -1086,7 +956,6 @@ private static void testPollWithoutCommit() throws Exception { "Should have at least 1 re-delivery round (got " + redeliveryCount + ")", redeliveryCount > 0); - // Final poll: should be empty System.out.println(" Final poll: expecting no data"); int extraRows = 0; for (int i = 0; i < 3; i++) { @@ -1101,7 +970,6 @@ private static void testPollWithoutCommit() throws Exception { } } assertEquals("After all committed, should receive no more data", 0, extraRows); - System.out.println( " At-least-once re-delivery verified: " + totalRows @@ -1113,16 +981,22 @@ private static void testPollWithoutCommit() 
throws Exception { } } - // ============================ - // Test 9: Multi Consumer Group Independent Consumption - // ============================ + // ====================================================================== + // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic) + // ====================================================================== /** - * Two consumer groups subscribe to the same topic. Verifies that each group independently - * receives ALL data (data is not partitioned/split between groups). + * Verifies: + * + *

    + *
  • Two consumer groups on same topic: each group gets ALL data independently + *
  • One consumer subscribes to two topics with different TABLE_KEY filters: each topic + * delivers only matching data + *
*/ - private static void testMultiConsumerGroupIndependent() throws Exception { + private static void testMultiEntityIsolation() throws Exception { String database = nextDatabase(); - String topicName = nextTopic(); + String topicName1 = "topic_tbl_multi_" + testCounter + "_a"; + String topicName2 = "topic_tbl_multi_" + testCounter + "_b"; String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a"; String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a"; String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b"; @@ -1131,163 +1005,94 @@ private static void testMultiConsumerGroupIndependent() throws Exception { ISubscriptionTablePullConsumer consumer2 = null; try { - // Create database and initial data + // Setup: database with t1 and t2 try (ITableSession session = openTableSession()) { createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - createTopicTable(topicName, database, ".*"); + // Topic 1: covers t1 only, Topic 2: covers t2 only + createTopicTable(topicName1, database, "t1"); + createTopicTable(topicName2, database, "t2"); Thread.sleep(1000); - // Two consumers in different groups both subscribe to the same topic + // Consumer 1 (group A): subscribes to BOTH topics consumer1 = createConsumer(consumerId1, consumerGroupId1); - consumer1.subscribe(topicName); + consumer1.subscribe(topicName1, topicName2); + // Consumer 2 (group B): subscribes to BOTH topics consumer2 = createConsumer(consumerId2, consumerGroupId2); - consumer2.subscribe(topicName); + consumer2.subscribe(topicName1, topicName2); Thread.sleep(3000); - // Write 50 rows 
- System.out.println(" Writing 50 rows"); + // Write 30 rows to t1, 40 rows to t2 + System.out.println(" Writing 30 rows to t1, 40 rows to t2"); try (ITableSession session = openTableSession()) { session.executeNonQueryStatement("USE " + database); - for (int i = 1; i <= 50; i++) { + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } session.executeNonQueryStatement( - String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); } } Thread.sleep(2000); - // Poll from group 1 - System.out.println(" Polling from consumer group 1..."); - PollResult result1 = pollUntilComplete(consumer1, 50, 70); + // Part A: Both groups should get 70 rows independently + System.out.println(" Part A: Multi-group isolation"); + System.out.println(" Polling from group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 70, 80); System.out.println(" Group 1 result: " + result1); - // Poll from group 2 - System.out.println(" Polling from consumer group 2..."); - PollResult result2 = pollUntilComplete(consumer2, 50, 70); + System.out.println(" Polling from group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 70, 80); System.out.println(" Group 2 result: " + result2); - // Both groups should have all 50 rows - assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows); - assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows); + assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows); + assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows); + + // Part B: Verify per-topic table isolation + if (!result1.rowsPerTable.isEmpty()) { + Integer t1Rows = result1.rowsPerTable.get("t1"); + Integer t2Rows = result1.rowsPerTable.get("t2"); + assertEquals("Expected 30 rows from t1 
(topic1)", 30, t1Rows != null ? t1Rows : 0); + assertEquals("Expected 40 rows from t2 (topic2)", 40, t2Rows != null ? t2Rows : 0); + System.out.println(" Multi-topic isolation verified: t1=" + t1Rows + ", t2=" + t2Rows); + } System.out.println( - " Independent consumption verified: group1=" + " Multi-group isolation verified: group1=" + result1.totalRows + ", group2=" + result2.totalRows); } finally { - // Clean up both consumers if (consumer1 != null) { try { - consumer1.unsubscribe(topicName); + consumer1.unsubscribe(topicName1, topicName2); } catch (Exception e) { - // ignore + /* ignore */ } try { consumer1.close(); } catch (Exception e) { - // ignore + /* ignore */ } } if (consumer2 != null) { try { - consumer2.unsubscribe(topicName); + consumer2.unsubscribe(topicName1, topicName2); } catch (Exception e) { - // ignore + /* ignore */ } try { consumer2.close(); } catch (Exception e) { - // ignore - } - } - dropTopicTable(topicName); - deleteDatabase(database); - } - } - - // ============================ - // Test 10: Multi Topic Subscription - // ============================ - /** - * One consumer subscribes to two different topics with different TABLE_KEY filters. Verifies that - * each topic delivers only its matching data, and no cross-contamination occurs. 
- */ - private static void testMultiTopicSubscription() throws Exception { - String database = nextDatabase(); - String topicName1 = "topic_tbl_multi_" + testCounter + "_a"; - String topicName2 = "topic_tbl_multi_" + testCounter + "_b"; - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - ISubscriptionTablePullConsumer consumer = null; - - try { - // Create database with two tables - try (ITableSession session = openTableSession()) { - createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); - session.executeNonQueryStatement("USE " + database); - session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); - session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); - - // Topic 1: covers t1 only - createTopicTable(topicName1, database, "t1"); - // Topic 2: covers t2 only - createTopicTable(topicName2, database, "t2"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName1, topicName2); - Thread.sleep(3000); - - // Write 30 rows to t1 and 40 rows to t2 - System.out.println(" Writing 30 rows to t1, 40 rows to t2"); - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - for (int i = 1; i <= 40; i++) { - if (i <= 30) { - session.executeNonQueryStatement( - String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); - } - session.executeNonQueryStatement( - String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); - } - } - Thread.sleep(2000); - - // Poll all data — should get t1 rows (via topic1) + t2 rows (via topic2) - System.out.println(" Polling (expecting 30 from t1 + 40 from t2 = 70 total)..."); - PollResult result = 
pollUntilComplete(consumer, 70, 80); - System.out.println(" Result: " + result); - - assertEquals("Expected exactly 70 rows total (30 t1 + 40 t2)", 70, result.totalRows); - if (!result.rowsPerTable.isEmpty()) { - Integer t1Rows = result.rowsPerTable.get("t1"); - Integer t2Rows = result.rowsPerTable.get("t2"); - assertEquals("Expected 30 rows from t1", 30, t1Rows != null ? t1Rows : 0); - assertEquals("Expected 40 rows from t2", 40, t2Rows != null ? t2Rows : 0); - System.out.println( - " Multi-topic isolation verified: t1=" + t1Rows + " rows, t2=" + t2Rows + " rows"); - } - } finally { - // Clean up consumer, both topics, and database - if (consumer != null) { - try { - consumer.unsubscribe(topicName1, topicName2); - } catch (Exception e) { - // ignore - } - try { - consumer.close(); - } catch (Exception e) { - // ignore + /* ignore */ } } dropTopicTable(topicName1); @@ -1296,51 +1101,40 @@ private static void testMultiTopicSubscription() throws Exception { } } - // ============================ - // Test 12: Cross-Partition Multi-Write - // ============================ + // ====================================================================== + // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix) + // ====================================================================== /** - * Tests that cross-partition writes via all table model write methods are correctly delivered. + * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The + * pending queue overflow triggers gaps, which should be recovered from WAL. * - *

Uses timestamps spaced >1 week apart (default partition interval = 604,800,000ms) to force - * cross-partition distribution. Exercises three write paths: + *

Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one + * {@code pendingEntries.offer()}. A single {@code session.insert(tablet)} with N rows in one time + * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually + * overflow, we need 4096+ individual write() calls arriving faster than the prefetch + * thread can drain. We achieve this with multiple concurrent writer threads, each performing + * individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate. * - *

    - *
  • Method 1: SQL single-row INSERT (2 rows, separate partitions) - *
  • Method 2: SQL multi-row INSERT (3 rows spanning 3 partitions in one statement) - *
  • Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions - *
+ *

Note: Gap occurrence is inherently timing-dependent (race between writers and the + * prefetch drain loop). This test maximizes the probability by using concurrent threads, but + * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling + * from WAL" messages to confirm the gap path was exercised. * - *

The table has 6 FIELD columns (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) plus 1 TAG. Total - * expected rows: 2 + 3 + 4 = 9. - * - *

This test verifies that when a SQL multi-row INSERT or Tablet write spans multiple time - * partitions (causing the plan node to be split into sub-nodes for each partition), all sub-nodes - * are correctly converted by the consensus subscription pipeline. + *

Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to + * the next prefetch iteration. */ - private static void testCrossPartitionMultiWrite() throws Exception { + private static void testBurstWriteGapRecovery() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); String consumerId = nextConsumerId(); ISubscriptionTablePullConsumer consumer = null; - // Gap > default time partition interval (7 days = 604,800,000ms) - final long GAP = 604_800_001L; - final String TABLE = "t1"; - final String SCHEMA = - "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " - + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " - + "s_text TEXT FIELD"; - try { - // Create database and table, write init row to force DataRegion creation try (ITableSession session = openTableSession()) { - createDatabaseAndTable(session, database, TABLE, SCHEMA); + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); session.executeNonQueryStatement("USE " + database); - session.executeNonQueryStatement( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " - + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -1352,123 +1146,92 @@ private static void testCrossPartitionMultiWrite() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing cross-partition data via 3 methods..."); + // Use multiple concurrent writer threads with individual SQL INSERTs. + // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer(). + // With N threads writing concurrently, aggregate rate should exceed drain rate + // and overflow the 4096-capacity queue, creating gaps. 
+ final int writerThreads = 4; + final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096 + final int totalRows = writerThreads * rowsPerThread; + final AtomicInteger errorCount = new AtomicInteger(0); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(writerThreads); - // --- Method 1: SQL single-row INSERT (2 rows, each in its own partition) --- - long baseTs = 1_000_000_000L; - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - long ts1 = baseTs; - long ts2 = baseTs + GAP; - System.out.println(" Method 1: SQL single-row x2 (ts=" + ts1 + ", " + ts2 + ")"); - session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " - + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'sql_single_1', %d)", - ts1)); - session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " - + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'sql_single_2', %d)", - ts2)); - } - - // --- Method 2: SQL multi-row INSERT (3 rows spanning 3 different partitions) --- - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - long t1 = baseTs + GAP * 2; - long t2 = baseTs + GAP * 3; - long t3 = baseTs + GAP * 4; - System.out.println( - " Method 2: SQL multi-row x3 (ts=" + t1 + ", " + t2 + ", " + t3 + ")"); - session.executeNonQueryStatement( - String.format( - "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " - + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'sql_multi_1', %d), " - + "('d1', 4, 400, 4.4, 4.44, false, 'sql_multi_2', %d), " - + "('d1', 5, 500, 5.5, 5.55, true, 'sql_multi_3', %d)", - t1, t2, t3)); + System.out.println( + " Burst writing " + + totalRows + + " rows via " + + writerThreads + + " concurrent threads (" + + rowsPerThread + + 
" individual SQL INSERTs each)"); + System.out.println( + " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)"); + + ExecutorService executor = Executors.newFixedThreadPool(writerThreads); + for (int t = 0; t < writerThreads; t++) { + final int threadId = t; + final int startTs = threadId * rowsPerThread + 1; + executor.submit( + () -> { + try { + startLatch.await(); // all threads start at the same time + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < rowsPerThread; i++) { + int ts = startTs + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", + (long) ts * 10, ts)); + } + } + } catch (Exception e) { + System.out.println(" Writer thread " + threadId + " error: " + e.getMessage()); + errorCount.incrementAndGet(); + } finally { + doneLatch.countDown(); + } + }); } - // --- Method 3: session.insert(Tablet) with 4 rows spanning 4 partitions --- - try (ITableSession session = openTableSession()) { - session.executeNonQueryStatement("USE " + database); - - List schemaList = new ArrayList<>(); - schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING)); - schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32)); - schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64)); - schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); - schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); - schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); - schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING)); - - List categories = - java.util.Arrays.asList( - ColumnCategory.TAG, - ColumnCategory.FIELD, - ColumnCategory.FIELD, - ColumnCategory.FIELD, - ColumnCategory.FIELD, - ColumnCategory.FIELD, - ColumnCategory.FIELD); - - Tablet tablet = - new Tablet( - TABLE, - IMeasurementSchema.getMeasurementNameList(schemaList), - 
IMeasurementSchema.getDataTypeList(schemaList), - categories, - 10); + // Fire all threads simultaneously + startLatch.countDown(); + doneLatch.await(); + executor.shutdown(); - for (int i = 0; i < 4; i++) { - int row = tablet.getRowSize(); - long ts = baseTs + GAP * (5 + i); // partitions 5, 6, 7, 8 - tablet.addTimestamp(row, ts); - tablet.addValue("tag1", row, "d1"); - tablet.addValue("s_int32", row, 6 + i); - tablet.addValue("s_int64", row, (long) (600 + i * 100)); - tablet.addValue("s_float", row, (6 + i) * 1.1f); - tablet.addValue("s_double", row, (6 + i) * 2.22); - tablet.addValue("s_bool", row, i % 2 == 0); - tablet.addValue("s_text", row, "tablet_" + (i + 1)); - } - System.out.println( - " Method 3: Tablet x4 (ts=" + (baseTs + GAP * 5) + ".." + (baseTs + GAP * 8) + ")"); - session.insert(tablet); + if (errorCount.get() > 0) { + System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors"); } - Thread.sleep(2000); - - // Poll — expect 9 rows total (2 + 3 + 4) - final int expectedRows = 9; - System.out.println(" Polling (expecting " + expectedRows + " rows)..."); - PollResult result = pollUntilComplete(consumer, expectedRows, 80); + // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes + System.out.println( + " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)..."); + System.out.println( + " (Check server logs for 'gap detected' to confirm gap recovery was triggered)"); + PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true); System.out.println(" Result: " + result); assertEquals( - "Expected exactly " + expectedRows + " cross-partition rows", - expectedRows, + "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)", + totalRows, result.totalRows); - // Verify we see all 6 FIELD columns plus tag - assertAtLeast( - "Expected at least 6 data columns in cross-partition result", - 6, - result.seenColumns.size()); } finally { 
cleanup(consumer, topicName, database); } } - // ============================ - // Test 11: Flush Data Delivery - // ============================ + // ====================================================================== + // Test 8: Commit After Unsubscribe (NEW — tests H7 fix) + // ====================================================================== /** - * Subscribes first, then writes data and flushes before polling. Verifies that flushing (memtable - * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps - * entries available until committed by the subscription consumer. + * Tests that commit still works correctly after the consumer has unsubscribed (queue has been + * torn down). The commit routing should use metadata-based topic config check instead of runtime + * queue state. + * + *

Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue(). */ - private static void testFlushDataDelivery() throws Exception { + private static void testCommitAfterUnsubscribe() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -1491,26 +1254,76 @@ private static void testFlushDataDelivery() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - // Write 50 rows, then flush before polling - System.out.println(" Writing 50 rows then flushing"); + // Write data + System.out.println(" Writing 50 rows"); try (ITableSession session = openTableSession()) { session.executeNonQueryStatement("USE " + database); for (int i = 1; i <= 50; i++) { session.executeNonQueryStatement( String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); } - System.out.println(" Flushing..."); - session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Poll — all 50 rows should be delivered despite flush - System.out.println(" Polling after flush..."); - PollResult result = pollUntilComplete(consumer, 50, 70); - System.out.println(" Result: " + result); - assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows); + // Poll WITHOUT commit + System.out.println(" Polling WITHOUT commit..."); + List uncommittedMessages = new ArrayList<>(); + int polledRows = 0; + for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (polledRows > 0) break; + Thread.sleep(500); + continue; + } + for (SubscriptionMessage msg : msgs) { + uncommittedMessages.add(msg); + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + polledRows++; + } + } + } + } + System.out.println( + " Polled " + + polledRows + + " rows, holding " + + uncommittedMessages.size() + + " uncommitted 
messages"); + assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows); + + // Unsubscribe (tears down the consensus queue) + System.out.println(" Unsubscribing (queue teardown)..."); + consumer.unsubscribe(topicName); + Thread.sleep(2000); + + // Now commit the previously polled messages — should NOT throw + System.out.println( + " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe..."); + boolean commitSucceeded = true; + for (SubscriptionMessage msg : uncommittedMessages) { + try { + consumer.commitSync(msg); + } catch (Exception e) { + System.out.println(" Commit threw exception: " + e.getMessage()); + commitSucceeded = false; + } + } + + System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded); + System.out.println(" (Key: no exception crash, routing handled gracefully)"); } finally { - cleanup(consumer, topicName, database); + if (consumer != null) { + try { + consumer.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopicTable(topicName); + deleteDatabase(database); } } } diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java index 1ab7a910c0324..501b789edd738 100644 --- a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java @@ -43,6 +43,10 @@ import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; /** TODO: move these manual tests into ITs */ public class ConsensusSubscriptionTest { @@ -62,46 +66,29 @@ public static void main(String[] args) throws Exception { String targetTest = args.length > 0 ? 
args[0] : null; - if (targetTest == null || "testBasicDataDelivery".equals(targetTest)) { - runTest("testBasicDataDelivery", ConsensusSubscriptionTest::testBasicDataDelivery); + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTest::testBasicFlow); } - if (targetTest == null || "testMultipleDataTypes".equals(targetTest)) { - runTest("testMultipleDataTypes", ConsensusSubscriptionTest::testMultipleDataTypes); + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTest::testDataTypes); } - if (targetTest == null || "testDeviceLevelFiltering".equals(targetTest)) { - runTest("testDeviceLevelFiltering", ConsensusSubscriptionTest::testDeviceLevelFiltering); - } - if (targetTest == null || "testTimeseriesLevelFiltering".equals(targetTest)) { - runTest( - "testTimeseriesLevelFiltering", ConsensusSubscriptionTest::testTimeseriesLevelFiltering); + if (targetTest == null || "testPathFiltering".equals(targetTest)) { + runTest("testPathFiltering", ConsensusSubscriptionTest::testPathFiltering); } if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion); } - if (targetTest == null || "testMultipleDevicesAggregation".equals(targetTest)) { - runTest( - "testMultipleDevicesAggregation", - ConsensusSubscriptionTest::testMultipleDevicesAggregation); - } - if (targetTest == null || "testAlignedTimeseries".equals(targetTest)) { - runTest("testAlignedTimeseries", ConsensusSubscriptionTest::testAlignedTimeseries); - } - if (targetTest == null || "testPollWithoutCommit".equals(targetTest)) { - runTest("testPollWithoutCommit", ConsensusSubscriptionTest::testPollWithoutCommit); - } - if (targetTest == null || "testMultiConsumerGroupIndependent".equals(targetTest)) { - runTest( - "testMultiConsumerGroupIndependent", - 
ConsensusSubscriptionTest::testMultiConsumerGroupIndependent); + if (targetTest == null || "testRedelivery".equals(targetTest)) { + runTest("testRedelivery", ConsensusSubscriptionTest::testRedelivery); } - if (targetTest == null || "testMultiTopicSubscription".equals(targetTest)) { - runTest("testMultiTopicSubscription", ConsensusSubscriptionTest::testMultiTopicSubscription); + if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTest::testMultiEntityIsolation); } - if (targetTest == null || "testFlushDataDelivery".equals(targetTest)) { - runTest("testFlushDataDelivery", ConsensusSubscriptionTest::testFlushDataDelivery); + if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) { + runTest("testBurstWriteGapRecovery", ConsensusSubscriptionTest::testBurstWriteGapRecovery); } - if (targetTest == null || "testCrossPartitionAligned".equals(targetTest)) { - runTest("testCrossPartitionAligned", ConsensusSubscriptionTest::testCrossPartitionAligned); + if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) { + runTest("testCommitAfterUnsubscribe", ConsensusSubscriptionTest::testCommitAfterUnsubscribe); } // Summary @@ -407,14 +394,20 @@ private static void assertAtLeast(String msg, int min, int actual) { } } - // ============================ - // Test 1: Basic Data Delivery - // ============================ + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiDevices + Flush) + // ====================================================================== /** - * Verifies the basic consensus subscription flow: write before subscribe (not received), write - * after subscribe (received), and no extra data beyond expectation. + * Verifies: + * + *

    + *
  • Data written BEFORE subscribe is NOT received + *
  • Multiple devices (d1, d2, d3) written AFTER subscribe are all received + *
  • Flush does not cause data loss (WAL pinning keeps entries available) + *
  • Exact row count matches expectation + *
*/ - private static void testBasicDataDelivery() throws Exception { + private static void testBasicFlow() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -422,16 +415,19 @@ private static void testBasicDataDelivery() throws Exception { SubscriptionTreePullConsumer consumer = null; try { - // Step 1: Write initial data to create DataRegion + // Step 1: Write initial data to create DataRegion (should NOT be received) System.out.println(" Step 1: Writing initial data (should NOT be received)"); try (ISession session = openSession()) { createDatabase(session, database); for (int i = 0; i < 50; i++) { session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)", - database, i, i * 10, i * 1.5)); + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); } + // Also write to d2, d3 for multi-device readiness + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -445,48 +441,79 @@ private static void testBasicDataDelivery() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - // Step 3: Write new data AFTER subscription - System.out.println(" Step 3: Writing new data AFTER subscription (100 rows)"); + // Step 3: Write to 3 devices (30 rows each = 90 total), then flush + System.out.println(" Step 3: Writing 30 rows x 3 devices AFTER subscribe, then flush"); try (ISession session = openSession()) { - for (int i = 100; i < 200; i++) { + for (int i = 100; i < 130; i++) { session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %f)", - database, i, i * 10, i * 1.5)); + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", 
database, i, i * 10)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30)); } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Step 4: Poll and verify exact count (also verifies no extra data) + // Step 4: Poll and verify System.out.println(" Step 4: Polling..."); - PollResult result = pollUntilComplete(consumer, 100, 100); + PollResult result = pollUntilComplete(consumer, 90, 100); System.out.println(" Result: " + result); - assertEquals("Expected exactly 100 rows from post-subscribe writes", 100, result.totalRows); + assertEquals("Expected exactly 90 rows (30 per device)", 90, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + System.out.println(" Rows per device: " + result.rowsPerDevice); + for (String dev : new String[] {"d1", "d2", "d3"}) { + Integer devRows = result.rowsPerDevice.get(database + "." + dev); + assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0); + } + } } finally { cleanup(consumer, topicName, database); } } - // ============================ - // Test 2: Multiple Data Types (Non-Aligned) - // ============================ + // ====================================================================== + // Test 2: Data Types (merged: MultipleDataTypes + Aligned + CrossPartition) + // ====================================================================== /** - * Writes data with multiple data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) using - * separate INSERT statements per type (non-aligned), and verifies all types are delivered. + * Verifies: + * + *
    + *
  • Non-aligned: 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) + *
  • Aligned: 6 data types, cross-partition timestamps (>1 week apart) + *
  • 6 write methods: SQL single/multi-row, insertAlignedRecord/Records/Tablet/Tablets + *
*/ - private static void testMultipleDataTypes() throws Exception { + private static void testDataTypes() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); String consumerId = nextConsumerId(); SubscriptionTreePullConsumer consumer = null; + final long GAP = 604_800_001L; // slightly over 1 week try { try (ISession session = openSession()) { createDatabase(session, database); + // Create aligned timeseries + session.executeNonQueryStatement( + String.format( + "CREATE ALIGNED TIMESERIES %s.d_aligned" + + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," + + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", + database)); + // Init rows to force DataRegion creation session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", + database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -498,8 +525,29 @@ private static void testMultipleDataTypes() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing data with 6 data types x 20 rows each"); + int totalExpected = 0; + final String device = database + ".d_aligned"; + List measurements = + Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text"); + List types = + Arrays.asList( + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + TSDataType.BOOLEAN, + TSDataType.TEXT); + List schemas = new ArrayList<>(); + schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + 
schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT)); + try (ISession session = openSession()) { + // --- Part A: Non-aligned, 6 types x 20 rows --- + System.out.println(" Part A: Non-aligned 6 data types x 20 rows"); for (int i = 1; i <= 20; i++) { session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i)); @@ -521,93 +569,103 @@ private static void testMultipleDataTypes() throws Exception { String.format( "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i)); } - } - Thread.sleep(2000); - - System.out.println(" Polling..."); - PollResult result = pollUntilComplete(consumer, 120, 120); - System.out.println(" Result: " + result); - - assertAtLeast("Expected at least 20 rows with multiple data types", 20, result.totalRows); - System.out.println(" Seen columns: " + result.seenColumns); - assertTrue( - "Expected multiple column types in result, got: " + result.seenColumns, - result.seenColumns.size() > 1); - } finally { - cleanup(consumer, topicName, database); - } - } + totalExpected += 120; // 6 types x 20 rows - // ============================ - // Test 3: Device-Level Filtering - // ============================ - /** - * Creates a topic that only matches root.db.d1.** and verifies that data written to d2 is NOT - * delivered. 
- */ - private static void testDeviceLevelFiltering() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - SubscriptionTreePullConsumer consumer = null; + // --- Part B: Aligned cross-partition, 6 write methods --- + System.out.println(" Part B: Aligned cross-partition, 6 write methods"); - try { - try (ISession session = openSession()) { - createDatabase(session, database); + // Method 1: SQL single row + long t1 = 1; session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')", + database, t1)); + totalExpected += 1; + + // Method 2: SQL multi-row (cross-partition) + long t2a = 1 + GAP; + long t2b = 1 + 2 * GAP; session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a')," + + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')", + database, t2a, t2b)); + totalExpected += 2; - String filterPath = database + ".d1.**"; - createTopic(topicName, filterPath); - Thread.sleep(1000); + // Method 3: insertAlignedRecord + long t3 = 1 + 3 * GAP; + session.insertAlignedRecord( + device, + t3, + measurements, + types, + Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single")); + totalExpected += 1; - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); + // Method 4: insertAlignedRecordsOfOneDevice (cross-partition) + long t4a = 1 + 4 * GAP; + long t4b = 1 + 5 * GAP; + 
session.insertAlignedRecordsOfOneDevice( + device, + Arrays.asList(t4a, t4b), + Arrays.asList(measurements, measurements), + Arrays.asList(types, types), + Arrays.asList( + Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"), + Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b"))); + totalExpected += 2; - System.out.println(" Writing to both d1 and d2 (topic filter: d1.** only)"); - try (ISession session = openSession()) { - for (int i = 100; i < 150; i++) { - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); - } + // Method 5: insertAlignedTablet (cross-partition) + long t5a = 1 + 6 * GAP; + long t5b = 1 + 7 * GAP; + Tablet tablet5 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a"); + addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b"); + session.insertAlignedTablet(tablet5); + totalExpected += 2; + + // Method 6: insertAlignedTablets (cross-partition) + long t6a = 1 + 8 * GAP; + long t6b = 1 + 9 * GAP; + Tablet tablet6 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a"); + addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b"); + Map tabletMap = new HashMap<>(); + tabletMap.put(device, tablet6); + session.insertAlignedTablets(tabletMap); + totalExpected += 2; } + + System.out.println(" Total expected rows: " + totalExpected); Thread.sleep(2000); - System.out.println(" Polling (expecting only d1 data)..."); - PollResult result = pollUntilComplete(consumer, 50, 60); + PollResult result = pollUntilComplete(consumer, totalExpected, 150); System.out.println(" Result: " + result); - assertEquals("Expected exactly 50 rows from d1 only", 50, result.totalRows); - if (!result.rowsPerDevice.isEmpty()) { - 
Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); - assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0); - Integer d1Rows = result.rowsPerDevice.get(database + ".d1"); - assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0); - System.out.println( - " Device filtering verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows"); - } + assertAtLeast( + "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows); + assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size()); } finally { cleanup(consumer, topicName, database); } } - // ============================ - // Test 4: Timeseries-Level Filtering - // ============================ + // ====================================================================== + // Test 3: Path Filtering (merged: DeviceLevel + TimeseriesLevel) + // ====================================================================== /** - * Creates a topic matching root.db.d1.s1 only. Tests whether the converter filters at measurement - * level. Lenient: if both s1 and s2 arrive, reports device-level-only filtering. + * Verifies: + * + *
    + *
  • Device-level: topic on d1.** does NOT deliver d2 data + *
  • Timeseries-level: topic on d1.s1 — lenient check for s2 filtering + *
*/ - private static void testTimeseriesLevelFiltering() throws Exception { + private static void testPathFiltering() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -619,10 +677,13 @@ private static void testTimeseriesLevelFiltering() throws Exception { createDatabase(session, database); session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); + // Topic filters d1.s1 only (timeseries-level) String filterPath = database + ".d1.s1"; createTopic(topicName, filterPath); Thread.sleep(1000); @@ -631,39 +692,50 @@ private static void testTimeseriesLevelFiltering() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing to d1.s1 and d1.s2 (topic filter: d1.s1 only)"); + System.out.println(" Writing to d1 (s1 + s2) and d2 (s1)"); try (ISession session = openSession()) { for (int i = 100; i < 150; i++) { session.executeNonQueryStatement( String.format( "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)", database, i, i * 10, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 30)); } } Thread.sleep(2000); - System.out.println(" Polling (expecting only s1 data)..."); + System.out.println(" Polling (expecting d1 data only, ideally s1 only)..."); PollResult result = pollUntilComplete(consumer, 50, 60); System.out.println(" Result: " + result); - System.out.println(" Seen columns: " + result.seenColumns); + // Device-level: d2 must NOT appear + if (!result.rowsPerDevice.isEmpty()) { + Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); + assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0); + Integer 
d1Rows = result.rowsPerDevice.get(database + ".d1"); + assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0); + System.out.println(" Device filtering verified: d1=" + d1Rows + ", d2=" + d2Rows); + } + + // Timeseries-level: lenient check boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2")); if (hasS2) { System.out.println( " INFO: Both s1 and s2 received — converter uses device-level filtering only."); - assertAtLeast("Should have received some rows", 50, result.totalRows); + assertAtLeast("Should have received d1 rows", 50, result.totalRows); } else { System.out.println(" Timeseries-level filtering verified: only s1 data received"); - assertEquals("Expected exactly 50 rows from s1 only", 50, result.totalRows); + assertEquals("Expected exactly 50 rows from d1.s1 only", 50, result.totalRows); } } finally { cleanup(consumer, topicName, database); } } - // ============================ - // Test 5: Subscribe Before Region Creation - // ============================ + // ====================================================================== + // Test 4: Subscribe Before Region Creation (kept as-is) + // ====================================================================== /** * Subscribe BEFORE the database/region exists, then create database and write. Tests the * IoTConsensus.onNewPeerCreated auto-binding path. @@ -695,7 +767,7 @@ private static void testSubscribeBeforeRegion() throws Exception { } Thread.sleep(5000); - System.out.println(" Step 4: Polling (auto-binding should have picked up new region)..."); + System.out.println(" Step 4: Polling..."); PollResult result = pollUntilComplete(consumer, 100, 100); System.out.println(" Result: " + result); @@ -714,11 +786,20 @@ private static void testSubscribeBeforeRegion() throws Exception { } } - // ============================ - // Test 6: Multiple Devices Aggregation - // ============================ - /** Writes to d1, d2, d3 and verifies all are received via a broad topic path. 
*/ - private static void testMultipleDevicesAggregation() throws Exception { + // ====================================================================== + // Test 5: Redelivery / At-Least-Once (kept as-is from testPollWithoutCommit) + // ====================================================================== + /** + * Tests at-least-once delivery with a mixed commit/no-commit pattern. + * + *

Writes 50 rows. Alternates between: + * + *

    + *
  • Even rounds: poll WITHOUT commit → next poll verifies same timestamps → commit + *
  • Odd rounds: poll and commit directly → next poll should deliver DIFFERENT data + *
+ */ + private static void testRedelivery() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -730,10 +811,6 @@ private static void testMultipleDevicesAggregation() throws Exception { createDatabase(session, database); session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); @@ -745,194 +822,41 @@ private static void testMultipleDevicesAggregation() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - System.out.println(" Writing to 3 devices (d1, d2, d3), 30 rows each"); + final int totalRows = 50; + System.out.println(" Writing " + totalRows + " rows"); try (ISession session = openSession()) { - for (int i = 100; i < 130; i++) { + for (int i = 1; i <= totalRows; i++) { session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30)); } } - Thread.sleep(2000); + Thread.sleep(3000); - System.out.println(" Polling (expecting 90 total from 3 devices)..."); - PollResult result = pollUntilComplete(consumer, 90, 100); - System.out.println(" Result: " + result); + int totalRowsCommitted = 0; + int roundNumber = 0; + boolean hasPending = false; + List pendingTimestamps = new ArrayList<>(); + Set allCommittedTimestamps = new HashSet<>(); + int redeliveryCount = 0; - assertEquals("Expected exactly 90 rows total (30 per device)", 90, result.totalRows); - if 
(!result.rowsPerDevice.isEmpty()) { - System.out.println(" Rows per device: " + result.rowsPerDevice); - for (String dev : new String[] {"d1", "d2", "d3"}) { - Integer devRows = result.rowsPerDevice.get(database + "." + dev); - assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0); + for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(5000)); + if (msgs.isEmpty()) { + Thread.sleep(1000); + continue; } - } - } finally { - cleanup(consumer, topicName, database); - } - } - // ============================ - // Test 7: Aligned Timeseries - // ============================ - /** - * Creates aligned timeseries with 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) and - * writes rows where each INSERT contains ALL columns. Verifies all rows and all column types are - * delivered correctly. - */ - private static void testAlignedTimeseries() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - SubscriptionTreePullConsumer consumer = null; - - try { - // Create aligned timeseries with multiple data types - try (ISession session = openSession()) { - createDatabase(session, database); - session.executeNonQueryStatement( - String.format( - "CREATE ALIGNED TIMESERIES %s.d_aligned" - + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," - + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", - database)); - // Write initial row to force DataRegion creation - session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," - + " s_double, s_bool, s_text)" - + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", - database)); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); - - createTopic(topicName, database + ".**"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, 
consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); - - // Write 50 aligned rows, each with all 6 data types in a single INSERT - System.out.println(" Writing 50 aligned rows with 6 data types per row"); - try (ISession session = openSession()) { - for (int i = 1; i <= 50; i++) { - session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," - + " s_double, s_bool, s_text)" - + " VALUES (%d, %d, %d, %f, %f, %s, 'text_%d')", - database, - i, - i, - (long) i * 100000L, - i * 1.1f, - i * 2.2, - i % 2 == 0 ? "true" : "false", - i)); - } - } - Thread.sleep(2000); - - System.out.println(" Polling..."); - PollResult result = pollUntilComplete(consumer, 50, 70); - System.out.println(" Result: " + result); - - assertEquals("Expected exactly 50 aligned rows", 50, result.totalRows); - // Verify we see columns for multiple data types - System.out.println(" Seen columns: " + result.seenColumns); - assertAtLeast( - "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); - } finally { - cleanup(consumer, topicName, database); - } - } - - // ============================ - // Test 8: Poll Without Commit (Re-delivery) - // ============================ - /** - * Tests at-least-once delivery with a mixed commit/no-commit pattern. - * - *

Writes 50 rows. The prefetching thread may batch multiple INSERTs into a single event, so we - * track committed ROWS (not events). The state machine alternates: - * - *

    - *
  • Even-numbered rounds: poll WITHOUT commit, record ALL timestamps from the event; next - * poll verifies the EXACT SAME timestamps are re-delivered, then commit. - *
  • Odd-numbered rounds: poll and commit directly; next poll should deliver DIFFERENT data. - *
- * - *

This exercises both the re-delivery path (recycleInFlightEventsForConsumer) and the normal - * commit path in an interleaved fashion. - */ - private static void testPollWithoutCommit() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - SubscriptionTreePullConsumer consumer = null; - - try { - try (ISession session = openSession()) { - createDatabase(session, database); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); - session.executeNonQueryStatement("flush"); - } - Thread.sleep(2000); - - createTopic(topicName, database + ".**"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); - - // Write 50 rows (may be batched into fewer events by the prefetching thread) - final int totalRows = 50; - System.out.println(" Writing " + totalRows + " rows"); - try (ISession session = openSession()) { - for (int i = 1; i <= totalRows; i++) { - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); - } - } - Thread.sleep(3000); - - // State machine: alternate between skip-commit and direct-commit. - // Track committed ROWS (not events) because batching is unpredictable. 
- int totalRowsCommitted = 0; - int roundNumber = 0; // counts distinct events seen (used for alternation) - boolean hasPending = false; - List pendingTimestamps = new ArrayList<>(); // timestamps from the uncommitted event - Set allCommittedTimestamps = new HashSet<>(); // all timestamps ever committed - int redeliveryCount = 0; - - for (int attempt = 0; attempt < 200 && totalRowsCommitted < totalRows; attempt++) { - List msgs = consumer.poll(Duration.ofMillis(5000)); - if (msgs.isEmpty()) { - Thread.sleep(1000); - continue; - } - - for (SubscriptionMessage msg : msgs) { - // Extract ALL timestamps from this event (may contain multiple rows) - List currentTimestamps = new ArrayList<>(); - for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { - while (ds.hasNext()) { - currentTimestamps.add(ds.next().getTimestamp()); - } - } - assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0); + for (SubscriptionMessage msg : msgs) { + List currentTimestamps = new ArrayList<>(); + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + currentTimestamps.add(ds.next().getTimestamp()); + } + } + assertTrue("Poll should return data with at least 1 row", currentTimestamps.size() > 0); if (hasPending) { - // === Re-delivery round: verify EXACT same timestamps === + // Re-delivery round: verify EXACT same timestamps assertTrue( "Re-delivery timestamp list mismatch: expected=" + pendingTimestamps @@ -953,8 +877,7 @@ private static void testPollWithoutCommit() throws Exception { + "] Re-delivered & committed: timestamps=" + currentTimestamps); } else { - // === New event round === - // After a commit, verify this is DIFFERENT data (no overlap with committed set) + // New event round if (totalRowsCommitted > 0) { boolean overlap = false; for (Long ts : currentTimestamps) { @@ -964,16 +887,9 @@ private static void testPollWithoutCommit() throws Exception { } } assertTrue( - "After commit, 
should receive different data (timestamps=" - + currentTimestamps - + " overlap with committed=" - + allCommittedTimestamps - + ")", - !overlap); + "After commit, should receive different data (overlap detected)", !overlap); } - // Even-numbered rounds: skip commit (test re-delivery) - // Odd-numbered rounds: commit directly (test normal flow) if (roundNumber % 2 == 0) { pendingTimestamps = new ArrayList<>(currentTimestamps); hasPending = true; @@ -1021,7 +937,6 @@ private static void testPollWithoutCommit() throws Exception { } } assertEquals("After all committed, should receive no more data", 0, extraRows); - System.out.println( " At-least-once re-delivery verified: " + totalRows @@ -1033,16 +948,22 @@ private static void testPollWithoutCommit() throws Exception { } } - // ============================ - // Test 9: Multi Consumer Group Independent Consumption - // ============================ + // ====================================================================== + // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic) + // ====================================================================== /** - * Two consumer groups subscribe to the same topic. Verifies that each group independently - * receives ALL data (data is not partitioned/split between groups). + * Verifies: + * + *

    + *
  • Two consumer groups on same topic: each group gets ALL data independently + *
  • One consumer subscribes to two topics with different path filters: each topic delivers + * only matching data + *
*/ - private static void testMultiConsumerGroupIndependent() throws Exception { + private static void testMultiEntityIsolation() throws Exception { String database = nextDatabase(); - String topicName = nextTopic(); + String topicName1 = "topic_multi_" + testCounter + "_a"; + String topicName2 = "topic_multi_" + testCounter + "_b"; String consumerGroupId1 = "cg_multi_" + testCounter + "_a"; String consumerId1 = "consumer_multi_" + testCounter + "_a"; String consumerGroupId2 = "cg_multi_" + testCounter + "_b"; @@ -1051,178 +972,231 @@ private static void testMultiConsumerGroupIndependent() throws Exception { SubscriptionTreePullConsumer consumer2 = null; try { - // Create database and initial data + // Setup: database with d1 and d2 try (ISession session = openSession()) { createDatabase(session, database); session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - createTopic(topicName, database + ".**"); + // Topic 1: covers d1 only, Topic 2: covers d2 only + createTopic(topicName1, database + ".d1.**"); + createTopic(topicName2, database + ".d2.**"); Thread.sleep(1000); - // Two consumers in different groups both subscribe to the same topic + // Consumer 1 (group A): subscribes to BOTH topics consumer1 = createConsumer(consumerId1, consumerGroupId1); - consumer1.subscribe(topicName); + consumer1.subscribe(topicName1, topicName2); + // Consumer 2 (group B): subscribes to BOTH topics consumer2 = createConsumer(consumerId2, consumerGroupId2); - consumer2.subscribe(topicName); + consumer2.subscribe(topicName1, topicName2); Thread.sleep(3000); - // Write 50 rows - System.out.println(" Writing 50 rows"); + // Write 30 rows to d1, 40 rows to d2 + System.out.println(" Writing 30 rows to d1, 40 rows to d2"); try (ISession session = openSession()) { - for 
(int i = 1; i <= 50; i++) { + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); } } Thread.sleep(2000); - // Poll from group 1 - System.out.println(" Polling from consumer group 1..."); - PollResult result1 = pollUntilComplete(consumer1, 50, 70); + // Part A: Both groups should get 70 rows independently + System.out.println(" Part A: Multi-group isolation"); + System.out.println(" Polling from group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 70, 80); System.out.println(" Group 1 result: " + result1); - // Poll from group 2 - System.out.println(" Polling from consumer group 2..."); - PollResult result2 = pollUntilComplete(consumer2, 50, 70); + System.out.println(" Polling from group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 70, 80); System.out.println(" Group 2 result: " + result2); - // Both groups should have all 50 rows - assertEquals("Group 1 should receive all 50 rows", 50, result1.totalRows); - assertEquals("Group 2 should receive all 50 rows", 50, result2.totalRows); + assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows); + assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows); + + // Part B: Verify per-topic device isolation + if (!result1.rowsPerDevice.isEmpty()) { + Integer d1Rows = result1.rowsPerDevice.get(database + ".d1"); + Integer d2Rows = result1.rowsPerDevice.get(database + ".d2"); + assertEquals("Expected 30 rows from d1 (topic1)", 30, d1Rows != null ? d1Rows : 0); + assertEquals("Expected 40 rows from d2 (topic2)", 40, d2Rows != null ? 
d2Rows : 0); + System.out.println(" Multi-topic isolation verified: d1=" + d1Rows + ", d2=" + d2Rows); + } System.out.println( - " Independent consumption verified: group1=" + " Multi-group isolation verified: group1=" + result1.totalRows + ", group2=" + result2.totalRows); } finally { - // Clean up both consumers if (consumer1 != null) { try { - consumer1.unsubscribe(topicName); + consumer1.unsubscribe(topicName1, topicName2); } catch (Exception e) { - // ignore + /* ignore */ } try { consumer1.close(); } catch (Exception e) { - // ignore + /* ignore */ } } if (consumer2 != null) { try { - consumer2.unsubscribe(topicName); + consumer2.unsubscribe(topicName1, topicName2); } catch (Exception e) { - // ignore + /* ignore */ } try { consumer2.close(); } catch (Exception e) { - // ignore + /* ignore */ } } - dropTopic(topicName); + dropTopic(topicName1); + dropTopic(topicName2); deleteDatabase(database); } } - // ============================ - // Test 10: Multi Topic Subscription - // ============================ + // ====================================================================== + // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix) + // ====================================================================== /** - * One consumer subscribes to two different topics with different path filters. Verifies that each - * topic delivers only its matching data, and no cross-contamination occurs. + * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The + * pending queue overflow triggers gaps, which should be recovered from WAL. + * + *

Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one + * {@code pendingEntries.offer()}. A single {@code session.insertTablet(tablet)} with N rows in + * one time partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To + * actually overflow, we need 4096+ individual write() calls arriving faster than the + * prefetch thread can drain. We achieve this with multiple concurrent writer threads, each + * performing individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate. + * + *

Note: Gap occurrence is inherently timing-dependent (race between writers and the + * prefetch drain loop). This test maximizes the probability by using concurrent threads, but + * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling + * from WAL" messages to confirm the gap path was exercised. + * + *

Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to + * the next prefetch iteration. */ - private static void testMultiTopicSubscription() throws Exception { + private static void testBurstWriteGapRecovery() throws Exception { String database = nextDatabase(); - String topicName1 = "topic_multi_" + testCounter + "_a"; - String topicName2 = "topic_multi_" + testCounter + "_b"; + String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); String consumerId = nextConsumerId(); SubscriptionTreePullConsumer consumer = null; try { - // Create database with two device groups try (ISession session = openSession()) { createDatabase(session, database); session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Topic 1: covers d1 only - createTopic(topicName1, database + ".d1.**"); - // Topic 2: covers d2 only - createTopic(topicName2, database + ".d2.**"); + createTopic(topicName, database + ".**"); Thread.sleep(1000); consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName1, topicName2); + consumer.subscribe(topicName); Thread.sleep(3000); - // Write 30 rows to d1 and 40 rows to d2 - System.out.println(" Writing 30 rows to d1, 40 rows to d2"); - try (ISession session = openSession()) { - for (int i = 1; i <= 40; i++) { - if (i <= 30) { - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); - } - session.executeNonQueryStatement( - String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); - } + // Use multiple concurrent writer threads with individual SQL INSERTs. + // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer(). 
+ // With N threads writing concurrently, aggregate rate should exceed drain rate + // and overflow the 4096-capacity queue, creating gaps. + final int writerThreads = 4; + final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096 + final int totalRows = writerThreads * rowsPerThread; + final AtomicInteger errorCount = new AtomicInteger(0); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(writerThreads); + + System.out.println( + " Burst writing " + + totalRows + + " rows via " + + writerThreads + + " concurrent threads (" + + rowsPerThread + + " individual SQL INSERTs each)"); + System.out.println( + " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)"); + + ExecutorService executor = Executors.newFixedThreadPool(writerThreads); + for (int t = 0; t < writerThreads; t++) { + final int threadId = t; + final int startTs = threadId * rowsPerThread + 1; + executor.submit( + () -> { + try { + startLatch.await(); // all threads start at the same time + try (ISession session = openSession()) { + for (int i = 0; i < rowsPerThread; i++) { + int ts = startTs + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", + database, ts, (long) ts * 10)); + } + } + } catch (Exception e) { + System.out.println(" Writer thread " + threadId + " error: " + e.getMessage()); + errorCount.incrementAndGet(); + } finally { + doneLatch.countDown(); + } + }); } - Thread.sleep(2000); - // Poll all data — should get d1 rows (via topic1) + d2 rows (via topic2) - System.out.println(" Polling (expecting 30 from d1 + 40 from d2 = 70 total)..."); - PollResult result = pollUntilComplete(consumer, 70, 80); - System.out.println(" Result: " + result); + // Fire all threads simultaneously + startLatch.countDown(); + doneLatch.await(); + executor.shutdown(); - assertEquals("Expected exactly 70 rows total (30 d1 + 40 d2)", 70, 
result.totalRows); - if (!result.rowsPerDevice.isEmpty()) { - Integer d1Rows = result.rowsPerDevice.get(database + ".d1"); - Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); - assertEquals("Expected 30 rows from d1", 30, d1Rows != null ? d1Rows : 0); - assertEquals("Expected 40 rows from d2", 40, d2Rows != null ? d2Rows : 0); - System.out.println( - " Multi-topic isolation verified: d1=" + d1Rows + " rows, d2=" + d2Rows + " rows"); + if (errorCount.get() > 0) { + System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors"); } + + // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes + System.out.println( + " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)..."); + System.out.println( + " (Check server logs for 'gap detected' to confirm gap recovery was triggered)"); + PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true); + System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)", + totalRows, + result.totalRows); } finally { - // Clean up consumer, both topics, and database - if (consumer != null) { - try { - consumer.unsubscribe(topicName1, topicName2); - } catch (Exception e) { - // ignore - } - try { - consumer.close(); - } catch (Exception e) { - // ignore - } - } - dropTopic(topicName1); - dropTopic(topicName2); - deleteDatabase(database); + cleanup(consumer, topicName, database); } } - // ============================ - // Test 11: Flush Data Delivery - // ============================ + // ====================================================================== + // Test 8: Commit After Unsubscribe (NEW — tests H7 fix) + // ====================================================================== /** - * Subscribes first, then writes data and flushes before polling. 
Verifies that flushing (memtable - * → TSFile) does not cause data loss in the subscription pipeline, because WAL pinning keeps - * entries available until committed by the subscription consumer. + * Tests that commit still works correctly after the consumer has unsubscribed (queue has been + * torn down). The commit routing should use metadata-based topic config check instead of runtime + * queue state. + * + *

Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue(). */ - private static void testFlushDataDelivery() throws Exception { + private static void testCommitAfterUnsubscribe() throws Exception { String database = nextDatabase(); String topicName = nextTopic(); String consumerGroupId = nextConsumerGroup(); @@ -1245,196 +1219,76 @@ private static void testFlushDataDelivery() throws Exception { consumer.subscribe(topicName); Thread.sleep(3000); - // Write 50 rows, then flush before polling - System.out.println(" Writing 50 rows then flushing"); + // Write data + System.out.println(" Writing 50 rows"); try (ISession session = openSession()) { for (int i = 1; i <= 50; i++) { session.executeNonQueryStatement( String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); } - System.out.println(" Flushing..."); - session.executeNonQueryStatement("flush"); } Thread.sleep(2000); - // Poll — all 50 rows should be delivered despite flush - System.out.println(" Polling after flush..."); - PollResult result = pollUntilComplete(consumer, 50, 70); - System.out.println(" Result: " + result); - assertEquals("Expected exactly 50 rows after flush (no data loss)", 50, result.totalRows); - } finally { - cleanup(consumer, topicName, database); - } - } - - // ============================ - // Test 12: Cross-Partition Aligned Timeseries (Multiple Write Methods) - // ============================ - /** - * Tests cross-partition aligned timeseries with 6 data types, written via six different aligned - * methods. Timestamps are spaced >1 week apart to force different time partitions, exercising the - * WAL merge path for multi-partition inserts. - * - *

Write methods (all aligned): - * - *

    - *
  1. SQL single row - *
  2. SQL multi-row (cross-partition) - *
  3. session.insertAlignedRecord (single row) - *
  4. session.insertAlignedRecordsOfOneDevice (cross-partition) - *
  5. session.insertAlignedTablet (cross-partition) - *
  6. session.insertAlignedTablets (cross-partition) - *
- */ - private static void testCrossPartitionAligned() throws Exception { - String database = nextDatabase(); - String topicName = nextTopic(); - String consumerGroupId = nextConsumerGroup(); - String consumerId = nextConsumerId(); - SubscriptionTreePullConsumer consumer = null; - - // Gap slightly over 1 week (default partition interval = 604,800,000ms) - final long GAP = 604_800_001L; - final String device = database + ".d_aligned"; - - try { - // Create aligned timeseries with 6 data types - try (ISession session = openSession()) { - createDatabase(session, database); - session.executeNonQueryStatement( - String.format( - "CREATE ALIGNED TIMESERIES %s.d_aligned" - + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," - + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", - database)); - // Init row to force DataRegion creation - session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," - + " s_double, s_bool, s_text)" - + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", - database)); - session.executeNonQueryStatement("flush"); + // Poll WITHOUT commit + System.out.println(" Polling WITHOUT commit..."); + List uncommittedMessages = new ArrayList<>(); + int polledRows = 0; + for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (polledRows > 0) break; + Thread.sleep(500); + continue; + } + for (SubscriptionMessage msg : msgs) { + uncommittedMessages.add(msg); + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + polledRows++; + } + } + } } + System.out.println( + " Polled " + + polledRows + + " rows, holding " + + uncommittedMessages.size() + + " uncommitted messages"); + assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows); + + // Unsubscribe (tears down the consensus queue) + System.out.println(" Unsubscribing (queue teardown)..."); + 
consumer.unsubscribe(topicName); Thread.sleep(2000); - createTopic(topicName, database + ".**"); - Thread.sleep(1000); - - consumer = createConsumer(consumerId, consumerGroupId); - consumer.subscribe(topicName); - Thread.sleep(3000); - - // Shared measurement info for Session API calls - List measurements = - Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text"); - List types = - Arrays.asList( - TSDataType.INT32, - TSDataType.INT64, - TSDataType.FLOAT, - TSDataType.DOUBLE, - TSDataType.BOOLEAN, - TSDataType.TEXT); - - // Shared schema for Tablet API calls - List schemas = new ArrayList<>(); - schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32)); - schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64)); - schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); - schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); - schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); - schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT)); - - System.out.println(" Writing cross-partition aligned data via 6 methods"); - int totalExpected = 0; - - try (ISession session = openSession()) { - - // --- Method 1: SQL single row --- - long t1 = 1; - session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," - + " s_double, s_bool, s_text)" - + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')", - database, t1)); - totalExpected += 1; - System.out.println(" Method 1 (SQL single row): 1 row"); - - // --- Method 2: SQL multi-row (cross-partition, 2 rows >1 week apart) --- - long t2a = 1 + GAP; - long t2b = 1 + 2 * GAP; - session.executeNonQueryStatement( - String.format( - "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," - + " s_double, s_bool, s_text)" - + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a')," - + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')", - database, t2a, t2b)); - totalExpected += 2; - System.out.println(" 
Method 2 (SQL multi-row, cross-partition): 2 rows"); - - // --- Method 3: insertAlignedRecord (single row) --- - long t3 = 1 + 3 * GAP; - List values3 = Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single"); - session.insertAlignedRecord(device, t3, measurements, types, values3); - totalExpected += 1; - System.out.println(" Method 3 (insertAlignedRecord): 1 row"); - - // --- Method 4: insertAlignedRecordsOfOneDevice (cross-partition, 2 rows) --- - long t4a = 1 + 4 * GAP; - long t4b = 1 + 5 * GAP; - session.insertAlignedRecordsOfOneDevice( - device, - Arrays.asList(t4a, t4b), - Arrays.asList(measurements, measurements), - Arrays.asList(types, types), - Arrays.asList( - Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"), - Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b"))); - totalExpected += 2; - System.out.println( - " Method 4 (insertAlignedRecordsOfOneDevice, cross-partition): 2 rows"); - - // --- Method 5: insertAlignedTablet (cross-partition, 2 rows) --- - long t5a = 1 + 6 * GAP; - long t5b = 1 + 7 * GAP; - Tablet tablet5 = new Tablet(device, schemas, 2); - addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a"); - addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b"); - session.insertAlignedTablet(tablet5); - totalExpected += 2; - System.out.println(" Method 5 (insertAlignedTablet, cross-partition): 2 rows"); - - // --- Method 6: insertAlignedTablets (cross-partition, 2 rows) --- - long t6a = 1 + 8 * GAP; - long t6b = 1 + 9 * GAP; - Tablet tablet6 = new Tablet(device, schemas, 2); - addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a"); - addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b"); - Map tabletMap = new HashMap<>(); - tabletMap.put(device, tablet6); - session.insertAlignedTablets(tabletMap); - totalExpected += 2; - System.out.println(" Method 6 (insertAlignedTablets, cross-partition): 2 rows"); + // Now commit the previously polled messages — 
should NOT throw + System.out.println( + " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe..."); + boolean commitSucceeded = true; + for (SubscriptionMessage msg : uncommittedMessages) { + try { + consumer.commitSync(msg); + } catch (Exception e) { + System.out.println(" Commit threw exception: " + e.getMessage()); + commitSucceeded = false; + } } - System.out.println(" Total expected rows: " + totalExpected); - Thread.sleep(2000); - - System.out.println(" Polling..."); - PollResult result = pollUntilComplete(consumer, totalExpected, 100); - System.out.println(" Result: " + result); - - assertEquals( - "Expected exactly " + totalExpected + " cross-partition aligned rows", - totalExpected, - result.totalRows); - assertAtLeast( - "Expected at least 6 columns (one per data type)", 6, result.seenColumns.size()); + // The commit may silently succeed or fail gracefully — the key is no crash + System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded); + System.out.println(" (Key: no exception crash, routing handled gracefully)"); } finally { - cleanup(consumer, topicName, database); + if (consumer != null) { + try { + consumer.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopic(topicName); + deleteDatabase(database); } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index c494ae05d01b0..8cb168272b295 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -82,6 +82,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.stream.Collectors; public class IoTConsensus implements IConsensus { @@ -105,6 +106,12 @@ 
public class IoTConsensus implements IConsensus { */ public static volatile BiConsumer onNewPeerCreated; + /** + * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by + * the subscription system to unbind and clean up prefetching queues before the region is removed. + */ + public static volatile Consumer onPeerRemoved; + private final IClientManager clientManager; private final IClientManager syncClientManager; private final ScheduledExecutorService backgroundTaskService; @@ -321,6 +328,18 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) @Override public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException { KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE); + + // Notify subscription system before stopping the peer, so that subscription queues can + // properly unregister from the still-alive serverImpl. + final Consumer removeCallback = onPeerRemoved; + if (removeCallback != null) { + try { + removeCallback.accept(groupId); + } catch (final Exception e) { + logger.warn("onPeerRemoved callback failed for group {}", groupId, e); + } + } + AtomicBoolean exist = new AtomicBoolean(false); stateMachineMap.computeIfPresent( groupId, diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java index bb5d4aa603417..37222c47d35ff 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java @@ -968,7 +968,7 @@ void checkAndUpdateIndex() { * If there is only one replica, set it to Long.MAX_VALUE. If there are multiple replicas, get the * latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner. 
*/ - void checkAndUpdateSafeDeletedSearchIndex() { + public void checkAndUpdateSafeDeletedSearchIndex() { if (configuration.isEmpty()) { logger.error( "Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time."); diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java index 374691bf38bf1..51704a24c74a5 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java @@ -167,15 +167,16 @@ public synchronized OptionalLong getMinFlushedSyncIndex() { return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min(); } - public void checkAndFlushIndex() { + public synchronized void checkAndFlushIndex() { if (!threads.isEmpty()) { threads.forEach( thread -> { IndexController controller = thread.getController(); controller.update(controller.getCurrentIndex(), true); }); - // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica is 1 - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); } } @@ -397,8 +398,9 @@ public void updateSafelyDeletedSearchIndex() { // indicating that insert nodes whose search index are before this value can be deleted // safely. // - // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9. - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. 
+ impl.checkAndUpdateSafeDeletedSearchIndex(); // notify if (impl.unblockWrite()) { impl.signal(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java index 220ad3e449951..abf9161962bff 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java @@ -24,6 +24,7 @@ import org.apache.iotdb.db.subscription.broker.SubscriptionBroker; import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager; import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask; @@ -188,7 +189,8 @@ public List commit( final List consensusContexts = new ArrayList<>(); for (final SubscriptionCommitContext ctx : commitContexts) { final String topicName = ctx.getTopicName(); - if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + if (Objects.nonNull(consensusBroker) + && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { consensusContexts.add(ctx); } else { pipeContexts.add(ctx); @@ -370,6 +372,20 @@ public void unbindConsensusPrefetchingQueue( prefetchingQueueCount.invalidate(); } + public void unbindByRegion(final String regionId) { + int totalClosed = 0; + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + totalClosed += broker.unbindByRegion(regionId); + } + if (totalClosed > 0) { + 
prefetchingQueueCount.invalidate(); + LOGGER.info( + "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]", + totalClosed, + regionId); + } + } + public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); if (Objects.isNull(pipeBroker)) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java index 84d89ef9a8f39..1c567965d911b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; @@ -124,12 +125,12 @@ public List poll( eventsToPoll.add(event); totalSize += currentSize; - if (totalSize + currentSize > maxBytes) { + if (totalSize >= maxBytes) { break; } } - if (totalSize > maxBytes) { + if (totalSize >= maxBytes) { break; } } @@ -353,6 +354,30 @@ public void unbindConsensusPrefetchingQueue(final String topicName) { brokerId); } + public int unbindByRegion(final String regionId) { + int closedCount = 0; + for (final Map.Entry> entry : + topicNameToConsensusPrefetchingQueues.entrySet()) { + final List queues = entry.getValue(); + final Iterator iterator = queues.iterator(); + while (iterator.hasNext()) { + final ConsensusPrefetchingQueue q = iterator.next(); + if (regionId.equals(q.getConsensusGroupId())) { + q.close(); + iterator.remove(); + closedCount++; + LOGGER.info( + "Subscription: closed consensus prefetching queue for topic [{}] region [{}] " + + "in consumer group 
[{}] due to region removal", + entry.getKey(), + regionId, + brokerId); + } + } + } + return closedCount; + } + @Override public void removeQueue(final String topicName) { final List queues = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java index fbde6cee8c2fe..9d3f2b283c556 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java @@ -43,6 +43,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -190,37 +191,31 @@ private List convertInsertTabletNode(final InsertTabletNode node) { return Collections.emptyList(); } - // Build Tablet with all rows final int columnCount = matchedColumnIndices.size(); + final boolean allColumnsMatch = (columnCount == measurements.length); + + // Build schemas (always needed) final List schemas = new ArrayList<>(columnCount); for (final int colIdx : matchedColumnIndices) { schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); } - final Tablet tablet = new Tablet(deviceId.toString(), schemas, rowCount); - - for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) { - tablet.addTimestamp(rowIdx, times[rowIdx]); - - for (int colIdx = 0; colIdx < columnCount; colIdx++) { - final int originalColIdx = matchedColumnIndices.get(colIdx); - final boolean isNull = - (bitMaps != null - && bitMaps[originalColIdx] != null - && bitMaps[originalColIdx].isMarked(rowIdx)); - - if (isNull) { - if (tablet.getBitMaps() == null) { - tablet.initBitMaps(); - } - tablet.getBitMaps()[colIdx].mark(rowIdx); - } else { - 
copyColumnValue( - tablet, rowIdx, colIdx, dataTypes[originalColIdx], columns[originalColIdx], rowIdx); - } + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = allColumnsMatch ? i : matchedColumnIndices.get(i); + newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount); + if (bitMaps != null && bitMaps[originalColIdx] != null) { + newBitMaps[i] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount); } } - tablet.setRowSize(rowCount); + + final Tablet tablet = + new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount); return Collections.singletonList(tablet); } @@ -327,26 +322,27 @@ private List convertRelationalInsertTabletNode(final RelationalInsertTab schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); } - final Tablet tablet = new Tablet(tableName != null ? 
tableName : "", schemas, rowCount); - - for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) { - tablet.addTimestamp(rowIdx, times[rowIdx]); + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; - for (int colIdx = 0; colIdx < columnCount; colIdx++) { - final boolean isNull = - (bitMaps != null && bitMaps[colIdx] != null && bitMaps[colIdx].isMarked(rowIdx)); - - if (isNull) { - if (tablet.getBitMaps() == null) { - tablet.initBitMaps(); - } - tablet.getBitMaps()[colIdx].mark(rowIdx); - } else { - copyColumnValue(tablet, rowIdx, colIdx, dataTypes[colIdx], columns[colIdx], rowIdx); - } + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + newColumns[colIdx] = copyColumnArray(dataTypes[colIdx], columns[colIdx], rowCount); + if (bitMaps != null && bitMaps[colIdx] != null) { + newBitMaps[colIdx] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[colIdx], 0, newBitMaps[colIdx], 0, rowCount); } } - tablet.setRowSize(rowCount); + + final Tablet tablet = + new Tablet( + tableName != null ? tableName : "", + schemas, + newTimes, + newColumns, + newBitMaps, + rowCount); return Collections.singletonList(tablet); } @@ -387,6 +383,65 @@ private List getMatchedTreeColumnIndices( return matchedIndices; } + /** + * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type + * containing the first {@code rowCount} elements. 
+ */ + private Object copyColumnArray( + final TSDataType dataType, final Object sourceColumn, final int rowCount) { + switch (dataType) { + case BOOLEAN: + { + final boolean[] src = (boolean[]) sourceColumn; + final boolean[] dst = new boolean[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT32: + case DATE: + { + final int[] src = (int[]) sourceColumn; + final int[] dst = new int[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT64: + case TIMESTAMP: + { + final long[] src = (long[]) sourceColumn; + final long[] dst = new long[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case FLOAT: + { + final float[] src = (float[]) sourceColumn; + final float[] dst = new float[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case DOUBLE: + { + final double[] src = (double[]) sourceColumn; + final double[] dst = new double[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case TEXT: + case BLOB: + case STRING: + { + final Binary[] src = (Binary[]) sourceColumn; + final Binary[] dst = new Binary[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + default: + LOGGER.warn("Unsupported data type for bulk copy: {}", dataType); + return sourceColumn; + } + } + /** * Adds a single value to the tablet at the specified position. 
* diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java index 28743d1aae73c..8b5c2cf25a8e5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java @@ -32,6 +32,7 @@ import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; @@ -154,6 +155,11 @@ public class ConsensusPrefetchingQueue { private static final int MAX_PREFETCHING_QUEUE_SIZE = 256; + private static final long WAL_RETENTION_WARN_THRESHOLD = 100_000; + + /** Counter of WAL gap entries that could not be filled (data loss). */ + private final AtomicLong walGapSkippedEntries = new AtomicLong(0); + private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); private volatile boolean isClosed = false; @@ -215,12 +221,27 @@ public ConsensusPrefetchingQueue( /** * Returns the earliest outstanding (uncommitted) search index for WAL pinning. If there are no * outstanding events, returns the next expected search index (nothing to pin beyond what we've - * already processed). + * already processed). Also monitors WAL retention gap for slow consumer detection. 
*/ private long getEarliestOutstandingSearchIndex() { final Map.Entry first = outstandingCommitIdToStartIndex.firstEntry(); if (first != null) { - return first.getValue(); + final long earliestIndex = first.getValue(); + // WAL retention health check: warn if outstanding gap grows too large + final long currentIndex = nextExpectedSearchIndex.get(); + final long retentionGap = currentIndex - earliestIndex; + if (retentionGap > WAL_RETENTION_WARN_THRESHOLD) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: WAL retention gap is {} entries " + + "(earliest outstanding={}, current={}). " + + "A slow or stalled consumer is pinning WAL files and may cause disk exhaustion. " + + "Consider committing events or increasing consumer throughput.", + this, + retentionGap, + earliestIndex, + currentIndex); + } + return earliestIndex; } return nextExpectedSearchIndex.get(); } @@ -429,11 +450,11 @@ private void prefetchLoop() { t.getClass().getName(), t.getMessage(), t); - if (t instanceof Error) { + if (t instanceof VirtualMachineError) { LOGGER.error( - "ConsensusPrefetchingQueue {}: caught Error in prefetch loop, " - + "will attempt to continue", - this); + "ConsensusPrefetchingQueue {}: caught VirtualMachineError, stopping thread", this); + markClosed(); + break; } try { Thread.sleep(100); @@ -478,7 +499,24 @@ private void processBatchFromPending(final List batch) expected, searchIndex, searchIndex - expected); - fillGapFromWAL(expected, searchIndex, batchedTablets); + final long gapMaxIndex = fillGapFromWAL(expected, searchIndex, batchedTablets); + if (gapMaxIndex > batchEndSearchIndex) { + batchEndSearchIndex = gapMaxIndex; + } + + // If gap was not fully filled (e.g., WAL timeout), do NOT skip the gap. + // Break and defer remaining entries to the next prefetch loop iteration. + // WAL pin ensures the missing entries won't be deleted. 
+ if (nextExpectedSearchIndex.get() < searchIndex) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: gap [{}, {}) not fully filled (reached {}). " + + "Deferring remaining batch to next prefetch iteration.", + this, + expected, + searchIndex, + nextExpectedSearchIndex.get()); + break; + } } if (searchIndex < nextExpectedSearchIndex.get()) { @@ -555,11 +593,14 @@ private void processBatchFromPending(final List batch) /** * Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected * between nextExpectedSearchIndex and an incoming entry's searchIndex. + * + * @return the maximum searchIndex processed during gap filling, or -1 if no entries processed */ - private void fillGapFromWAL( + private long fillGapFromWAL( final long fromIndex, final long toIndex, final List batchedTablets) { // Re-position WAL reader to the gap start reqIterator = consensusReqReader.getReqIterator(fromIndex); + long maxProcessedIndex = -1; while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { try { @@ -575,6 +616,9 @@ private void fillGapFromWAL( batchedTablets.addAll(tablets); } nextExpectedSearchIndex.set(walIndex + 1); + if (walIndex > maxProcessedIndex) { + maxProcessedIndex = walIndex; + } } catch (final Exception e) { LOGGER.warn( "ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}", @@ -601,6 +645,9 @@ private void fillGapFromWAL( batchedTablets.addAll(tablets); } nextExpectedSearchIndex.set(walIndex + 1); + if (walIndex > maxProcessedIndex) { + maxProcessedIndex = walIndex; + } } } catch (final InterruptedException e) { Thread.currentThread().interrupt(); @@ -612,6 +659,24 @@ private void fillGapFromWAL( toIndex); } } + + // If the gap still cannot be fully filled (WAL truncated/deleted), skip ahead to avoid + // blocking consumption indefinitely. This results in data loss for the skipped range. 
+ if (nextExpectedSearchIndex.get() < toIndex) { + final long skipped = toIndex - nextExpectedSearchIndex.get(); + walGapSkippedEntries.addAndGet(skipped); + LOGGER.error( + "ConsensusPrefetchingQueue {}: WAL gap [{}, {}) cannot be filled - {} entries lost. " + + "Total skipped entries so far: {}. This indicates WAL truncation or deletion.", + this, + nextExpectedSearchIndex.get(), + toIndex, + skipped, + walGapSkippedEntries.get()); + nextExpectedSearchIndex.set(toIndex); + } + + return maxProcessedIndex; } /** @@ -623,8 +688,24 @@ private void tryCatchUpFromWAL() { syncReqIteratorPosition(); if (!reqIterator.hasNext()) { - // No data on disk either - nothing to do - return; + // The WAL iterator excludes the current-writing WAL file for concurrency safety. + // If entries exist in WAL but are all in the current file (e.g., after pending queue + // overflow), we need to trigger a WAL file roll to make them readable. + final long currentWALIndex = consensusReqReader.getCurrentSearchIndex(); + if (nextExpectedSearchIndex.get() <= currentWALIndex + && consensusReqReader instanceof WALNode) { + LOGGER.info( + "ConsensusPrefetchingQueue {}: subscription behind (at {} vs WAL {}), " + + "triggering WAL file roll to make entries readable", + this, + nextExpectedSearchIndex.get(), + currentWALIndex); + ((WALNode) consensusReqReader).rollWALFile(); + syncReqIteratorPosition(); + } + if (!reqIterator.hasNext()) { + return; + } } final List batchedTablets = new ArrayList<>(); @@ -1063,6 +1144,8 @@ public void cleanUp() { inFlightEvents.values().forEach(event -> event.cleanUp(true)); inFlightEvents.clear(); + + outstandingCommitIdToStartIndex.clear(); } finally { releaseWriteLock(); } @@ -1077,11 +1160,19 @@ public void close() { } catch (final InterruptedException e) { Thread.currentThread().interrupt(); } - // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL). 
- serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier); - cleanUp(); - // Persist progress before closing - commitManager.persistAll(); + try { + // Unregister from IoTConsensusServerImpl (stop receiving in-memory data, unpin WAL). + serverImpl.unregisterSubscriptionQueue(pendingEntries, walPinSupplier); + } catch (final Exception e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error during unregister", this, e); + } finally { + try { + cleanUp(); + } finally { + // Persist progress before closing + commitManager.persistAll(); + } + } } private SubscriptionEvent generateErrorResponse(final String errorMessage) { @@ -1168,6 +1259,7 @@ public Map coreReportMessage() { result.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size())); result.put("pendingEntriesSize", String.valueOf(pendingEntries.size())); result.put("commitIdGenerator", commitIdGenerator.toString()); + result.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get())); result.put("isClosed", String.valueOf(isClosed)); return result; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java index 4096394ad6a33..91883c94b1e11 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java @@ -203,7 +203,7 @@ public void removeState( * @param topicName the topic name */ public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) { - final String prefix = consumerGroupId + "_" + topicName + "_"; + final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR; final Iterator> it = 
commitStates.entrySet().iterator(); while (it.hasNext()) { @@ -228,9 +228,13 @@ public void persistAll() { // ======================== Helper Methods ======================== + // Use a separator that cannot appear in consumerGroupId, topicName, or regionId + // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c"). + private static final String KEY_SEPARATOR = "##"; + private String generateKey( final String consumerGroupId, final String topicName, final String regionId) { - return consumerGroupId + "_" + topicName + "_" + regionId; + return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId; } private File getProgressFile(final String key) { @@ -329,8 +333,8 @@ public long getCommittedSearchIndex() { private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000; public void recordMapping(final long commitId, final long searchIndex) { - commitIdToSearchIndex.put(commitId, searchIndex); synchronized (this) { + commitIdToSearchIndex.put(commitId, searchIndex); outstandingSearchIndices.add(searchIndex); final int size = outstandingSearchIndices.size(); if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) { @@ -358,16 +362,21 @@ public void recordMapping(final long commitId, final long searchIndex) { * @return true if successfully committed */ public boolean commit(final long commitId) { - final Long searchIndex = commitIdToSearchIndex.remove(commitId); - if (searchIndex == null) { - LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId); - return false; - } - progress.incrementCommitIndex(); - // Advance committed search index contiguously (gap-aware) + // Advance committed search index contiguously (gap-aware). 
+ // Both remove from commitIdToSearchIndex and outstandingSearchIndices must be + // inside the same synchronized block to prevent a race with recordMapping(): + // recordMapping: put(commitId, si) -> add(si) + // commit: remove(commitId) -> remove(si) + // Without atomicity, commit could remove from map between put and add, + // leaving si permanently in outstandingSearchIndices (WAL leak). synchronized (this) { + final Long searchIndex = commitIdToSearchIndex.remove(commitId); + if (searchIndex == null) { + LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId); + return false; + } outstandingSearchIndices.remove(searchIndex); if (searchIndex > maxCommittedSearchIndex) { maxCommittedSearchIndex = searchIndex; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java index b138dbceef1a2..a36b9e29fe7ed 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java @@ -61,16 +61,20 @@ private ConsensusSubscriptionSetupHandler() { } /** - * Ensures that the IoTConsensus new-peer callback is set, so that when a new DataRegion is - * created, all active consensus subscriptions are automatically bound to the new region. + * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new + * DataRegion is created, all active consensus subscriptions are automatically bound to the new + * region, and when a DataRegion is removed, all subscription queues are properly cleaned up. 
*/ public static void ensureNewRegionListenerRegistered() { - if (IoTConsensus.onNewPeerCreated != null) { - return; + if (IoTConsensus.onNewPeerCreated == null) { + IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; + LOGGER.info( + "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); + } + if (IoTConsensus.onPeerRemoved == null) { + IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved; + LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup"); } - IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; - LOGGER.info( - "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); } /** @@ -93,14 +97,13 @@ private static void onNewRegionCreated( final ConsensusSubscriptionCommitManager commitManager = ConsensusSubscriptionCommitManager.getInstance(); - final long startSearchIndex = serverImpl.getSearchIndex() + 1; LOGGER.info( "New DataRegion {} created, checking {} consumer group(s) for auto-binding, " - + "startSearchIndex={}", + + "currentSearchIndex={}", groupId, allSubscriptions.size(), - startSearchIndex); + serverImpl.getSearchIndex()); for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) { final String consumerGroupId = groupEntry.getKey(); @@ -141,12 +144,22 @@ private static void onNewRegionCreated( final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail + // for brand-new regions that have no prior subscription progress. + final long persistedIndex = + commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString()); + final long startSearchIndex = + (persistedIndex > 0) ? 
persistedIndex + 1 : serverImpl.getSearchIndex() + 1; + LOGGER.info( - "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} (database={})", + "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} " + + "(database={}, startSearchIndex={}, persistedIndex={})", topicName, consumerGroupId, groupId, - dbTableModel); + dbTableModel, + startSearchIndex, + persistedIndex); SubscriptionAgent.broker() .bindConsensusPrefetchingQueue( @@ -169,6 +182,26 @@ private static void onNewRegionCreated( } } + /** + * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and + * cleans up all subscription prefetching queues associated with the removed region across all + * consumer groups. + */ + private static void onRegionRemoved(final ConsensusGroupId groupId) { + if (!(groupId instanceof DataRegionId)) { + return; + } + final String regionIdStr = groupId.toString(); + LOGGER.info( + "DataRegion {} being removed, unbinding all consensus subscription queues", regionIdStr); + try { + SubscriptionAgent.broker().unbindByRegion(regionIdStr); + } catch (final Exception e) { + LOGGER.error( + "Failed to unbind consensus subscription queues for removed region {}", regionIdStr, e); + } + } + public static boolean isConsensusBasedTopic(final String topicName) { try { final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName); @@ -316,16 +349,23 @@ private static void setupConsensusQueueForTopic( final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); - final long startSearchIndex = serverImpl.getSearchIndex() + 1; + // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail + // for brand-new regions that have no prior subscription progress. 
+ final long persistedIndex = + commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString()); + final long startSearchIndex = + (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1; LOGGER.info( "Binding consensus prefetching queue for topic [{}] in consumer group [{}] " - + "to data region consensus group [{}] (database={}), startSearchIndex={}", + + "to data region consensus group [{}] (database={}, startSearchIndex={}, " + + "persistedIndex={})", topicName, consumerGroupId, groupId, dbTableModel, - startSearchIndex); + startSearchIndex, + persistedIndex); SubscriptionAgent.broker() .bindConsensusPrefetchingQueue( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java index 0bd526e8dbaa0..9e45f8a160127 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; /** * Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region) @@ -42,42 +43,42 @@ */ public class SubscriptionConsensusProgress { - private long searchIndex; + private final AtomicLong searchIndex; - private long commitIndex; + private final AtomicLong commitIndex; public SubscriptionConsensusProgress() { this(0L, 0L); } public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) { - this.searchIndex = searchIndex; - this.commitIndex = commitIndex; + this.searchIndex = new AtomicLong(searchIndex); + this.commitIndex = new AtomicLong(commitIndex); } public long 
getSearchIndex() { - return searchIndex; + return searchIndex.get(); } public void setSearchIndex(final long searchIndex) { - this.searchIndex = searchIndex; + this.searchIndex.set(searchIndex); } public long getCommitIndex() { - return commitIndex; + return commitIndex.get(); } public void setCommitIndex(final long commitIndex) { - this.commitIndex = commitIndex; + this.commitIndex.set(commitIndex); } public void incrementCommitIndex() { - this.commitIndex++; + this.commitIndex.incrementAndGet(); } public void serialize(final DataOutputStream stream) throws IOException { - ReadWriteIOUtils.write(searchIndex, stream); - ReadWriteIOUtils.write(commitIndex, stream); + ReadWriteIOUtils.write(searchIndex.get(), stream); + ReadWriteIOUtils.write(commitIndex.get(), stream); } public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) { @@ -95,21 +96,22 @@ public boolean equals(final Object o) { return false; } final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o; - return searchIndex == that.searchIndex && commitIndex == that.commitIndex; + return searchIndex.get() == that.searchIndex.get() + && commitIndex.get() == that.commitIndex.get(); } @Override public int hashCode() { - return Objects.hash(searchIndex, commitIndex); + return Objects.hash(searchIndex.get(), commitIndex.get()); } @Override public String toString() { return "SubscriptionConsensusProgress{" + "searchIndex=" - + searchIndex + + searchIndex.get() + ", commitIndex=" - + commitIndex + + commitIndex.get() + '}'; } }