diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java index 2dfc9d96540c..ce533d014584 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java @@ -114,6 +114,18 @@ public enum ControllerGauge implements AbstractMetrics.Gauge { // Percentage of segments we failed to get size for TABLE_STORAGE_EST_MISSING_SEGMENT_PERCENT("TableStorageEstMissingSegmentPercent", false), + // Forward index compression ratio scaled by 100 (e.g., 4.5x ratio → 450). Divide by 100 to get actual ratio. + TABLE_COMPRESSION_RATIO_PERCENT("TableCompressionRatioPercent", false), + + // Raw (uncompressed) forward index size per replica + TABLE_RAW_FORWARD_INDEX_SIZE_PER_REPLICA("TableRawForwardIndexSizePerReplica", false), + + // Compressed forward index size per replica + TABLE_COMPRESSED_FORWARD_INDEX_SIZE_PER_REPLICA("TableCompressedForwardIndexSizePerReplica", false), + + // Size per replica broken down by storage tier + TABLE_TIERED_STORAGE_SIZE("TableTieredStorageSize", false), + // Number of scheduled Cron jobs CRON_SCHEDULER_JOB_SCHEDULED("cronSchedulerJobScheduled", false), diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfo.java b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfo.java new file mode 100644 index 000000000000..022fa4ec9a5c --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfo.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.List; +import javax.annotation.Nullable; + + +/** + * Per-column forward index compression statistics. + * + *

Contains the column name, uncompressed and compressed sizes, compression ratio, codec, + * whether the column has a dictionary, and the list of indexes present on the column. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class ColumnCompressionStatsInfo { + private final String _column; + private final long _uncompressedSizeInBytes; + private final long _compressedSizeInBytes; + private final double _compressionRatio; + private final String _codec; + private final boolean _hasDictionary; + private final List _indexes; + + @JsonCreator + public ColumnCompressionStatsInfo( + @JsonProperty("column") String column, + @JsonProperty("uncompressedSizeInBytes") long uncompressedSizeInBytes, + @JsonProperty("compressedSizeInBytes") long compressedSizeInBytes, + @JsonProperty("compressionRatio") double compressionRatio, + @JsonProperty("codec") @Nullable String codec, + @JsonProperty("hasDictionary") boolean hasDictionary, + @JsonProperty("indexes") @Nullable List indexes) { + _column = column; + _uncompressedSizeInBytes = uncompressedSizeInBytes; + _compressedSizeInBytes = compressedSizeInBytes; + _compressionRatio = compressionRatio; + _codec = codec; + _hasDictionary = hasDictionary; + _indexes = indexes; + } + + public String getColumn() { + return _column; + } + + public long getUncompressedSizeInBytes() { + return _uncompressedSizeInBytes; + } + + public long getCompressedSizeInBytes() { + return _compressedSizeInBytes; + } + + public double getCompressionRatio() { + return _compressionRatio; + } + + @Nullable + public String getCodec() { + return _codec; + } + + @JsonProperty("hasDictionary") + public boolean hasDictionary() { + return _hasDictionary; + } + + @Nullable + @JsonInclude(JsonInclude.Include.NON_NULL) + public List getIndexes() { + return _indexes; + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummary.java 
b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummary.java new file mode 100644 index 000000000000..beb44bb90977 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummary.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; + + +/** + * Table-level compression statistics summary, aggregated from per-column data. + * Contains total raw and compressed forward index sizes, the overall compression ratio, + * and segment coverage information. + * + *

JSON schema is identical to {@code TableSizeReader.CompressionStats} on the size endpoint. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class CompressionStatsSummary { + private final long _rawForwardIndexSizePerReplicaInBytes; + private final long _compressedForwardIndexSizePerReplicaInBytes; + private final double _compressionRatio; + private final int _segmentsWithStats; + private final int _totalSegments; + private final boolean _isPartialCoverage; + + @JsonCreator + public CompressionStatsSummary( + @JsonProperty("rawForwardIndexSizePerReplicaInBytes") long rawForwardIndexSizePerReplicaInBytes, + @JsonProperty("compressedForwardIndexSizePerReplicaInBytes") long compressedForwardIndexSizePerReplicaInBytes, + @JsonProperty("compressionRatio") double compressionRatio, + @JsonProperty("segmentsWithStats") int segmentsWithStats, + @JsonProperty("totalSegments") int totalSegments, + @JsonProperty("isPartialCoverage") boolean isPartialCoverage) { + _rawForwardIndexSizePerReplicaInBytes = rawForwardIndexSizePerReplicaInBytes; + _compressedForwardIndexSizePerReplicaInBytes = compressedForwardIndexSizePerReplicaInBytes; + _compressionRatio = compressionRatio; + _segmentsWithStats = segmentsWithStats; + _totalSegments = totalSegments; + _isPartialCoverage = isPartialCoverage; + } + + public long getRawForwardIndexSizePerReplicaInBytes() { + return _rawForwardIndexSizePerReplicaInBytes; + } + + public long getCompressedForwardIndexSizePerReplicaInBytes() { + return _compressedForwardIndexSizePerReplicaInBytes; + } + + public double getCompressionRatio() { + return _compressionRatio; + } + + public int getSegmentsWithStats() { + return _segmentsWithStats; + } + + public int getTotalSegments() { + return _totalSegments; + } + + @JsonProperty("isPartialCoverage") + public boolean isPartialCoverage() { + return _isPartialCoverage; + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfo.java 
b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfo.java index 6a9fdf59e0c7..e54ac824a870 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfo.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfo.java @@ -21,18 +21,42 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.Map; +import javax.annotation.Nullable; @JsonIgnoreProperties(ignoreUnknown = true) public class SegmentSizeInfo { private final String _segmentName; private final long _diskSizeInBytes; + private final long _rawForwardIndexSizeBytes; + private final long _compressedForwardIndexSizeBytes; + private final String _tier; + private final Map _columnCompressionStats; + + public SegmentSizeInfo(String segmentName, long sizeBytes) { + this(segmentName, sizeBytes, -1, -1, null, null); + } + + public SegmentSizeInfo(String segmentName, long sizeBytes, long rawForwardIndexSizeBytes, + long compressedForwardIndexSizeBytes, @Nullable String tier) { + this(segmentName, sizeBytes, rawForwardIndexSizeBytes, compressedForwardIndexSizeBytes, tier, null); + } @JsonCreator public SegmentSizeInfo(@JsonProperty("segmentName") String segmentName, - @JsonProperty("diskSizeInBytes") long sizeBytes) { + @JsonProperty("diskSizeInBytes") long sizeBytes, + @JsonProperty("rawForwardIndexSizeBytes") long rawForwardIndexSizeBytes, + @JsonProperty("compressedForwardIndexSizeBytes") long compressedForwardIndexSizeBytes, + @JsonProperty("tier") @Nullable String tier, + @JsonProperty("columnCompressionStats") @Nullable Map + columnCompressionStats) { _segmentName = segmentName; _diskSizeInBytes = sizeBytes; + _rawForwardIndexSizeBytes = rawForwardIndexSizeBytes; + _compressedForwardIndexSizeBytes = compressedForwardIndexSizeBytes; + _tier = tier; + _columnCompressionStats = 
columnCompressionStats; } public String getSegmentName() { @@ -43,6 +67,24 @@ public long getDiskSizeInBytes() { return _diskSizeInBytes; } + public long getRawForwardIndexSizeBytes() { + return _rawForwardIndexSizeBytes; + } + + public long getCompressedForwardIndexSizeBytes() { + return _compressedForwardIndexSizeBytes; + } + + @Nullable + public String getTier() { + return _tier; + } + + @Nullable + public Map getColumnCompressionStats() { + return _columnCompressionStats; + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfo.java b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfo.java new file mode 100644 index 000000000000..480e411e01dd --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfo.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.restlet.resources; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.Map; + + +/** + * Storage breakdown by tier. Contains a map of tier names to their respective + * segment count and per-replica size. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class StorageBreakdownInfo { + + private final Map _tiers; + + @JsonCreator + public StorageBreakdownInfo(@JsonProperty("tiers") Map tiers) { + _tiers = tiers; + } + + public Map getTiers() { + return _tiers; + } + + /** + * Segment count and size for a single storage tier. + */ + @JsonIgnoreProperties(ignoreUnknown = true) + public static class TierInfo { + private final int _count; + private final long _sizePerReplicaInBytes; + + @JsonCreator + public TierInfo(@JsonProperty("count") int count, + @JsonProperty("sizePerReplicaInBytes") long sizePerReplicaInBytes) { + _count = count; + _sizePerReplicaInBytes = sizePerReplicaInBytes; + } + + public int getCount() { + return _count; + } + + public long getSizePerReplicaInBytes() { + return _sizePerReplicaInBytes; + } + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java index 21468d7d426a..86e42745303b 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java @@ -20,8 +20,11 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.List; import java.util.Map; +import javax.annotation.Nullable; /** @@ -46,6 +49,9 
@@ public class TableMetadataInfo { // JSON property name kept as "upsertPartitionToServerPrimaryKeyCountMap" to avoid silent data loss during rolling // upgrades where servers and controllers may temporarily run different versions of this class. private final Map> _partitionToServerPrimaryKeyCountMap; + private final List _columnCompressionStats; + private final CompressionStatsSummary _compressionStats; + private final StorageBreakdownInfo _storageBreakdown; @JsonCreator public TableMetadataInfo(@JsonProperty("tableName") String tableName, @@ -55,7 +61,11 @@ public TableMetadataInfo(@JsonProperty("tableName") String tableName, @JsonProperty("maxNumMultiValuesMap") Map maxNumMultiValuesMap, @JsonProperty("columnIndexSizeMap") Map> columnIndexSizeMap, @JsonProperty("upsertPartitionToServerPrimaryKeyCountMap") - Map> partitionToServerPrimaryKeyCountMap) { + Map> partitionToServerPrimaryKeyCountMap, + @JsonProperty("columnCompressionStats") @Nullable + List columnCompressionStats, + @JsonProperty("compressionStats") @Nullable CompressionStatsSummary compressionStats, + @JsonProperty("storageBreakdown") @Nullable StorageBreakdownInfo storageBreakdown) { _tableName = tableName; _diskSizeInBytes = sizeInBytes; _numSegments = numSegments; @@ -65,6 +75,32 @@ public TableMetadataInfo(@JsonProperty("tableName") String tableName, _maxNumMultiValuesMap = maxNumMultiValuesMap; _columnIndexSizeMap = columnIndexSizeMap; _partitionToServerPrimaryKeyCountMap = partitionToServerPrimaryKeyCountMap; + _columnCompressionStats = columnCompressionStats; + _compressionStats = compressionStats; + _storageBreakdown = storageBreakdown; + } + + /** + * Constructor for callers that provide columnCompressionStats but not compressionStats/storageBreakdown. 
+ */ + public TableMetadataInfo(String tableName, long sizeInBytes, long numSegments, long numRows, + Map columnLengthMap, Map columnCardinalityMap, + Map maxNumMultiValuesMap, Map> columnIndexSizeMap, + Map> partitionToServerPrimaryKeyCountMap, + @Nullable List columnCompressionStats) { + this(tableName, sizeInBytes, numSegments, numRows, columnLengthMap, columnCardinalityMap, maxNumMultiValuesMap, + columnIndexSizeMap, partitionToServerPrimaryKeyCountMap, columnCompressionStats, null, null); + } + + /** + * Backwards-compatible constructor for callers that don't provide any compression/storage fields. + */ + public TableMetadataInfo(String tableName, long sizeInBytes, long numSegments, long numRows, + Map columnLengthMap, Map columnCardinalityMap, + Map maxNumMultiValuesMap, Map> columnIndexSizeMap, + Map> partitionToServerPrimaryKeyCountMap) { + this(tableName, sizeInBytes, numSegments, numRows, columnLengthMap, columnCardinalityMap, maxNumMultiValuesMap, + columnIndexSizeMap, partitionToServerPrimaryKeyCountMap, null, null, null); } public String getTableName() { @@ -103,4 +139,22 @@ public Map> getColumnIndexSizeMap() { public Map> getPartitionToServerPrimaryKeyCountMap() { return _partitionToServerPrimaryKeyCountMap; } + + @Nullable + @JsonInclude(JsonInclude.Include.NON_NULL) + public List getColumnCompressionStats() { + return _columnCompressionStats; + } + + @Nullable + @JsonInclude(JsonInclude.Include.NON_NULL) + public CompressionStatsSummary getCompressionStats() { + return _compressionStats; + } + + @Nullable + @JsonInclude(JsonInclude.Include.NON_NULL) + public StorageBreakdownInfo getStorageBreakdown() { + return _storageBreakdown; + } } diff --git a/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfoTest.java b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfoTest.java new file mode 100644 index 000000000000..6e0ba1429f8a --- /dev/null +++ 
b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/ColumnCompressionStatsInfoTest.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import java.util.Arrays; +import java.util.List; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class ColumnCompressionStatsInfoTest { + + @Test + public void testGetters() { + List indexes = Arrays.asList("forward_index", "inverted_index"); + ColumnCompressionStatsInfo info = + new ColumnCompressionStatsInfo("myCol", 8000L, 2000L, 4.0, "LZ4", false, indexes); + + assertEquals(info.getColumn(), "myCol"); + assertEquals(info.getUncompressedSizeInBytes(), 8000L); + assertEquals(info.getCompressedSizeInBytes(), 2000L); + assertEquals(info.getCompressionRatio(), 4.0, 1e-9); + assertEquals(info.getCodec(), "LZ4"); + assertFalse(info.hasDictionary()); + assertEquals(info.getIndexes(), indexes); + } + + @Test + public void testHasDictionaryTrue() { + ColumnCompressionStatsInfo info = + new ColumnCompressionStatsInfo("dictCol", 5000L, 1000L, 5.0, "SNAPPY", true, + List.of("forward_index")); + + 
assertTrue(info.hasDictionary()); + assertEquals(info.getCodec(), "SNAPPY"); + } + + @Test + public void testJsonRoundTrip() + throws Exception { + List indexes = Arrays.asList("forward_index", "range_index"); + ColumnCompressionStatsInfo original = + new ColumnCompressionStatsInfo("col1", 10000L, 2500L, 4.0, "ZSTANDARD", false, indexes); + + String json = JsonUtils.objectToString(original); + ColumnCompressionStatsInfo deserialized = + JsonUtils.stringToObject(json, ColumnCompressionStatsInfo.class); + + assertEquals(deserialized.getColumn(), "col1"); + assertEquals(deserialized.getUncompressedSizeInBytes(), 10000L); + assertEquals(deserialized.getCompressedSizeInBytes(), 2500L); + assertEquals(deserialized.getCompressionRatio(), 4.0, 1e-9); + assertEquals(deserialized.getCodec(), "ZSTANDARD"); + assertFalse(deserialized.hasDictionary()); + assertNotNull(deserialized.getIndexes()); + assertEquals(deserialized.getIndexes().size(), 2); + assertTrue(deserialized.getIndexes().contains("forward_index")); + assertTrue(deserialized.getIndexes().contains("range_index")); + } + + @Test + public void testNullCodecAndNullIndexesRoundTrip() + throws Exception { + ColumnCompressionStatsInfo original = + new ColumnCompressionStatsInfo("noCodecCol", 3000L, 1500L, 2.0, null, false, null); + + String json = JsonUtils.objectToString(original); + ColumnCompressionStatsInfo deserialized = + JsonUtils.stringToObject(json, ColumnCompressionStatsInfo.class); + + assertEquals(deserialized.getColumn(), "noCodecCol"); + assertEquals(deserialized.getUncompressedSizeInBytes(), 3000L); + assertEquals(deserialized.getCompressedSizeInBytes(), 1500L); + assertEquals(deserialized.getCompressionRatio(), 2.0, 1e-9); + assertNull(deserialized.getCodec()); + assertFalse(deserialized.hasDictionary()); + assertNull(deserialized.getIndexes()); + } + + @Test + public void testJsonIgnoresUnknownFields() + throws Exception { + String json = "{\"column\":\"futureCol\",\"uncompressedSizeInBytes\":6000," + + 
"\"compressedSizeInBytes\":1200,\"compressionRatio\":5.0," + + "\"codec\":\"LZ4\",\"hasDictionary\":false," + + "\"indexes\":[\"forward_index\"],\"unknownField\":\"ignored\"}"; + + ColumnCompressionStatsInfo deserialized = + JsonUtils.stringToObject(json, ColumnCompressionStatsInfo.class); + + assertEquals(deserialized.getColumn(), "futureCol"); + assertEquals(deserialized.getUncompressedSizeInBytes(), 6000L); + assertEquals(deserialized.getCompressedSizeInBytes(), 1200L); + assertEquals(deserialized.getCompressionRatio(), 5.0, 1e-9); + assertEquals(deserialized.getCodec(), "LZ4"); + assertFalse(deserialized.hasDictionary()); + assertNotNull(deserialized.getIndexes()); + assertEquals(deserialized.getIndexes(), List.of("forward_index")); + } + + @Test + public void testHasDictionaryJsonRoundTrip() + throws Exception { + ColumnCompressionStatsInfo original = + new ColumnCompressionStatsInfo("dictRoundTrip", 7000L, 3500L, 2.0, null, true, + List.of("forward_index")); + + String json = JsonUtils.objectToString(original); + ColumnCompressionStatsInfo deserialized = + JsonUtils.stringToObject(json, ColumnCompressionStatsInfo.class); + + assertEquals(deserialized.getColumn(), "dictRoundTrip"); + assertTrue(deserialized.hasDictionary()); + assertNull(deserialized.getCodec()); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummaryTest.java b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummaryTest.java new file mode 100644 index 000000000000..69bb227bd85a --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/CompressionStatsSummaryTest.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class CompressionStatsSummaryTest { + + @Test + public void testGetters() { + CompressionStatsSummary summary = new CompressionStatsSummary(100000, 40000, 2.5, 8, 10, true); + assertEquals(summary.getRawForwardIndexSizePerReplicaInBytes(), 100000); + assertEquals(summary.getCompressedForwardIndexSizePerReplicaInBytes(), 40000); + assertEquals(summary.getCompressionRatio(), 2.5, 0.001); + assertEquals(summary.getSegmentsWithStats(), 8); + assertEquals(summary.getTotalSegments(), 10); + assertTrue(summary.isPartialCoverage()); + } + + @Test + public void testFullCoverage() { + CompressionStatsSummary summary = new CompressionStatsSummary(50000, 25000, 2.0, 5, 5, false); + assertEquals(summary.getSegmentsWithStats(), 5); + assertEquals(summary.getTotalSegments(), 5); + assertFalse(summary.isPartialCoverage()); + } + + @Test + public void testJsonRoundTrip() + throws Exception { + CompressionStatsSummary original = new CompressionStatsSummary(200000, 80000, 2.5, 3, 4, true); + String json = JsonUtils.objectToString(original); + + assertTrue(json.contains("rawForwardIndexSizePerReplicaInBytes")); + assertTrue(json.contains("compressedForwardIndexSizePerReplicaInBytes")); + 
assertTrue(json.contains("compressionRatio")); + assertTrue(json.contains("segmentsWithStats")); + assertTrue(json.contains("totalSegments")); + assertTrue(json.contains("isPartialCoverage")); + + CompressionStatsSummary deserialized = JsonUtils.stringToObject(json, CompressionStatsSummary.class); + assertEquals(deserialized.getRawForwardIndexSizePerReplicaInBytes(), 200000); + assertEquals(deserialized.getCompressedForwardIndexSizePerReplicaInBytes(), 80000); + assertEquals(deserialized.getCompressionRatio(), 2.5, 0.001); + assertEquals(deserialized.getSegmentsWithStats(), 3); + assertEquals(deserialized.getTotalSegments(), 4); + assertTrue(deserialized.isPartialCoverage()); + } + + @Test + public void testJsonIgnoresUnknownFields() + throws Exception { + String json = "{\"rawForwardIndexSizePerReplicaInBytes\":1000,\"compressedForwardIndexSizePerReplicaInBytes\":500," + + "\"compressionRatio\":2.0,\"segmentsWithStats\":1,\"totalSegments\":1,\"isPartialCoverage\":false," + + "\"unknownFutureField\":\"ignored\"}"; + CompressionStatsSummary summary = JsonUtils.stringToObject(json, CompressionStatsSummary.class); + assertEquals(summary.getRawForwardIndexSizePerReplicaInBytes(), 1000); + assertFalse(summary.isPartialCoverage()); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfoTest.java b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfoTest.java new file mode 100644 index 000000000000..7c48e3e7a753 --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/SegmentSizeInfoTest.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class SegmentSizeInfoTest { + + @Test + public void testJsonRoundTripWithCompressionStats() + throws Exception { + Map columnStats = new HashMap<>(); + columnStats.put("col1", new ColumnCompressionStatsInfo("col1", 10000, 2500, 4.0, "LZ4", false, + List.of("forward_index"))); + columnStats.put("col2", new ColumnCompressionStatsInfo("col2", 20000, 4000, 5.0, "ZSTANDARD", false, + List.of("forward_index"))); + + SegmentSizeInfo original = new SegmentSizeInfo("seg1", 50000, 30000, 6500, "tier1", columnStats); + + String json = JsonUtils.objectToString(original); + SegmentSizeInfo deserialized = JsonUtils.stringToObject(json, SegmentSizeInfo.class); + + assertEquals(deserialized.getSegmentName(), "seg1"); + assertEquals(deserialized.getDiskSizeInBytes(), 50000); + assertEquals(deserialized.getRawForwardIndexSizeBytes(), 30000); + assertEquals(deserialized.getCompressedForwardIndexSizeBytes(), 6500); + assertEquals(deserialized.getTier(), "tier1"); + assertNotNull(deserialized.getColumnCompressionStats()); + assertEquals(deserialized.getColumnCompressionStats().size(), 2); + + ColumnCompressionStatsInfo col1Stats = 
deserialized.getColumnCompressionStats().get("col1"); + assertNotNull(col1Stats); + assertEquals(col1Stats.getColumn(), "col1"); + assertEquals(col1Stats.getUncompressedSizeInBytes(), 10000); + assertEquals(col1Stats.getCompressedSizeInBytes(), 2500); + assertEquals(col1Stats.getCompressionRatio(), 4.0, 0.01); + assertEquals(col1Stats.getCodec(), "LZ4"); + assertFalse(col1Stats.hasDictionary()); + } + + @Test + public void testJsonRoundTripBackwardCompatible() + throws Exception { + // Simulate old server response without compression fields + String oldJson = "{\"segmentName\":\"seg1\",\"diskSizeInBytes\":50000}"; + SegmentSizeInfo deserialized = JsonUtils.stringToObject(oldJson, SegmentSizeInfo.class); + + assertEquals(deserialized.getSegmentName(), "seg1"); + assertEquals(deserialized.getDiskSizeInBytes(), 50000); + assertEquals(deserialized.getRawForwardIndexSizeBytes(), 0); + assertEquals(deserialized.getCompressedForwardIndexSizeBytes(), 0); + assertNull(deserialized.getTier()); + assertNull(deserialized.getColumnCompressionStats()); + } + + @Test + public void testJsonRoundTripWithoutColumnStats() + throws Exception { + SegmentSizeInfo original = new SegmentSizeInfo("seg1", 50000, 30000, 6500, "default"); + + String json = JsonUtils.objectToString(original); + SegmentSizeInfo deserialized = JsonUtils.stringToObject(json, SegmentSizeInfo.class); + + assertEquals(deserialized.getSegmentName(), "seg1"); + assertEquals(deserialized.getDiskSizeInBytes(), 50000); + assertEquals(deserialized.getRawForwardIndexSizeBytes(), 30000); + assertEquals(deserialized.getCompressedForwardIndexSizeBytes(), 6500); + assertEquals(deserialized.getTier(), "default"); + assertNull(deserialized.getColumnCompressionStats()); + } + + @Test + public void testLegacyTwoArgConstructor() { + SegmentSizeInfo info = new SegmentSizeInfo("seg1", 1000); + assertEquals(info.getSegmentName(), "seg1"); + assertEquals(info.getDiskSizeInBytes(), 1000); + 
assertEquals(info.getRawForwardIndexSizeBytes(), -1); + assertEquals(info.getCompressedForwardIndexSizeBytes(), -1); + assertNull(info.getTier()); + assertNull(info.getColumnCompressionStats()); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfoTest.java b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfoTest.java new file mode 100644 index 000000000000..92870a4cdb60 --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/StorageBreakdownInfoTest.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.restlet.resources; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class StorageBreakdownInfoTest { + + @Test + public void testTierInfoGetters() { + StorageBreakdownInfo.TierInfo tierInfo = new StorageBreakdownInfo.TierInfo(5, 1048576L); + + assertEquals(tierInfo.getCount(), 5); + assertEquals(tierInfo.getSizePerReplicaInBytes(), 1048576L); + } + + @Test + public void testGetTiersMap() { + Map tiers = new HashMap<>(); + tiers.put("hotTier", new StorageBreakdownInfo.TierInfo(3, 2000000L)); + tiers.put("coldTier", new StorageBreakdownInfo.TierInfo(7, 8000000L)); + + StorageBreakdownInfo info = new StorageBreakdownInfo(tiers); + + assertNotNull(info.getTiers()); + assertEquals(info.getTiers().size(), 2); + assertEquals(info.getTiers().get("hotTier").getCount(), 3); + assertEquals(info.getTiers().get("hotTier").getSizePerReplicaInBytes(), 2000000L); + assertEquals(info.getTiers().get("coldTier").getCount(), 7); + assertEquals(info.getTiers().get("coldTier").getSizePerReplicaInBytes(), 8000000L); + } + + @Test + public void testJsonRoundTripWithMultipleTiers() + throws Exception { + Map tiers = new HashMap<>(); + tiers.put("tier1", new StorageBreakdownInfo.TierInfo(10, 5000000L)); + tiers.put("tier2", new StorageBreakdownInfo.TierInfo(4, 1500000L)); + + StorageBreakdownInfo original = new StorageBreakdownInfo(tiers); + + String json = JsonUtils.objectToString(original); + StorageBreakdownInfo deserialized = JsonUtils.stringToObject(json, StorageBreakdownInfo.class); + + assertNotNull(deserialized.getTiers()); + assertEquals(deserialized.getTiers().size(), 2); + + StorageBreakdownInfo.TierInfo tier1 = deserialized.getTiers().get("tier1"); + assertNotNull(tier1); + assertEquals(tier1.getCount(), 10); + assertEquals(tier1.getSizePerReplicaInBytes(), 5000000L); + + 
StorageBreakdownInfo.TierInfo tier2 = deserialized.getTiers().get("tier2"); + assertNotNull(tier2); + assertEquals(tier2.getCount(), 4); + assertEquals(tier2.getSizePerReplicaInBytes(), 1500000L); + } + + @Test + public void testJsonRoundTripEmptyTiers() + throws Exception { + StorageBreakdownInfo original = new StorageBreakdownInfo(Collections.emptyMap()); + + String json = JsonUtils.objectToString(original); + StorageBreakdownInfo deserialized = JsonUtils.stringToObject(json, StorageBreakdownInfo.class); + + assertNotNull(deserialized.getTiers()); + assertTrue(deserialized.getTiers().isEmpty()); + } + + @Test + public void testJsonIgnoresUnknownFieldsOnStorageBreakdownInfo() + throws Exception { + String json = "{\"tiers\":{\"hotTier\":{\"count\":2,\"sizePerReplicaInBytes\":900000}}," + + "\"unknownTopField\":\"ignored\"}"; + + StorageBreakdownInfo deserialized = JsonUtils.stringToObject(json, StorageBreakdownInfo.class); + + assertNotNull(deserialized.getTiers()); + assertEquals(deserialized.getTiers().size(), 1); + assertEquals(deserialized.getTiers().get("hotTier").getCount(), 2); + assertEquals(deserialized.getTiers().get("hotTier").getSizePerReplicaInBytes(), 900000L); + } + + @Test + public void testJsonIgnoresUnknownFieldsOnTierInfo() + throws Exception { + String json = "{\"tiers\":{\"tier1\":{\"count\":3,\"sizePerReplicaInBytes\":4000000," + + "\"futureField\":\"ignored\"}}}"; + + StorageBreakdownInfo deserialized = JsonUtils.stringToObject(json, StorageBreakdownInfo.class); + + assertNotNull(deserialized.getTiers()); + StorageBreakdownInfo.TierInfo tierInfo = deserialized.getTiers().get("tier1"); + assertNotNull(tierInfo); + assertEquals(tierInfo.getCount(), 3); + assertEquals(tierInfo.getSizePerReplicaInBytes(), 4000000L); + } + + @Test + public void testNullTiersMap() + throws Exception { + StorageBreakdownInfo original = new StorageBreakdownInfo(null); + + String json = JsonUtils.objectToString(original); + StorageBreakdownInfo deserialized = 
JsonUtils.stringToObject(json, StorageBreakdownInfo.class); + + assertNull(deserialized.getTiers()); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/TableMetadataInfoCompressionTest.java b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/TableMetadataInfoCompressionTest.java new file mode 100644 index 000000000000..1459e43b5b4d --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/restlet/resources/TableMetadataInfoCompressionTest.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.restlet.resources; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests the TableMetadataInfo response schema for compression stats (T056/T057). + * Validates server-side response includes columnCompressionStats array when present + * and suppresses it (via NON_NULL) when absent. 
+ */ +public class TableMetadataInfoCompressionTest { + + @Test + public void testSerializationWithCompressionStats() + throws Exception { + List colStats = new ArrayList<>(); + colStats.add(new ColumnCompressionStatsInfo("col_a", 10000, 2000, 5.0, "LZ4", false, List.of("forward_index"))); + colStats.add(new ColumnCompressionStatsInfo("col_b", 20000, 5000, 4.0, "ZSTANDARD", false, + List.of("forward_index", "inverted_index"))); + + TableMetadataInfo info = new TableMetadataInfo("testTable", 50000, 3, 1000, + Map.of("col_a", 4.0), Map.of("col_a", 50.0), Map.of(), Map.of(), Map.of(), colStats); + + String json = JsonUtils.objectToString(info); + JsonNode node = JsonUtils.stringToJsonNode(json); + + // columnCompressionStats should be present as an array + assertTrue(node.has("columnCompressionStats")); + JsonNode colStatsNode = node.get("columnCompressionStats"); + assertTrue(colStatsNode.isArray(), "columnCompressionStats should be a JSON array"); + assertEquals(colStatsNode.size(), 2); + + // Validate col_a values (first element) + JsonNode colA = colStatsNode.get(0); + assertEquals(colA.get("column").asText(), "col_a"); + assertEquals(colA.get("uncompressedSizeInBytes").asLong(), 10000); + assertEquals(colA.get("compressedSizeInBytes").asLong(), 2000); + assertEquals(colA.get("compressionRatio").asDouble(), 5.0, 0.01); + assertEquals(colA.get("codec").asText(), "LZ4"); + assertFalse(colA.get("hasDictionary").asBoolean()); + assertTrue(colA.has("indexes")); + + // Validate col_b values (second element) + JsonNode colB = colStatsNode.get(1); + assertEquals(colB.get("column").asText(), "col_b"); + assertEquals(colB.get("uncompressedSizeInBytes").asLong(), 20000); + assertEquals(colB.get("compressedSizeInBytes").asLong(), 5000); + assertEquals(colB.get("compressionRatio").asDouble(), 4.0, 0.01); + assertEquals(colB.get("codec").asText(), "ZSTANDARD"); + assertFalse(colB.get("hasDictionary").asBoolean()); + assertEquals(colB.get("indexes").size(), 2); + } + + @Test + 
public void testSerializationWithoutCompressionStats() + throws Exception { + // Use backwards-compatible constructor (no compression stats) + TableMetadataInfo info = new TableMetadataInfo("testTable", 50000, 3, 1000, + Map.of("col_a", 4.0), Map.of("col_a", 50.0), Map.of(), Map.of(), Map.of()); + + String json = JsonUtils.objectToString(info); + JsonNode node = JsonUtils.stringToJsonNode(json); + + // columnCompressionStats should be absent (suppressed by NON_NULL) + assertFalse(node.has("columnCompressionStats"), + "columnCompressionStats should be suppressed from JSON when null"); + } + + @Test + public void testDeserializationRoundTrip() + throws Exception { + List colStats = new ArrayList<>(); + colStats.add(new ColumnCompressionStatsInfo("metric_col", 50000, 8000, 6.25, "SNAPPY", false, + List.of("forward_index"))); + + TableMetadataInfo original = new TableMetadataInfo("roundTripTable", 100000, 5, 5000, + Map.of("metric_col", 8.0), Map.of("metric_col", 100.0), Map.of(), Map.of(), Map.of(), colStats); + + String json = JsonUtils.objectToString(original); + TableMetadataInfo deserialized = JsonUtils.stringToObject(json, TableMetadataInfo.class); + + assertEquals(deserialized.getTableName(), "roundTripTable"); + assertEquals(deserialized.getDiskSizeInBytes(), 100000); + assertNotNull(deserialized.getColumnCompressionStats()); + assertEquals(deserialized.getColumnCompressionStats().size(), 1); + + ColumnCompressionStatsInfo stats = deserialized.getColumnCompressionStats().get(0); + assertNotNull(stats); + assertEquals(stats.getColumn(), "metric_col"); + assertEquals(stats.getUncompressedSizeInBytes(), 50000); + assertEquals(stats.getCompressedSizeInBytes(), 8000); + assertEquals(stats.getCompressionRatio(), 6.25, 0.01); + assertEquals(stats.getCodec(), "SNAPPY"); + assertFalse(stats.hasDictionary()); + assertNotNull(stats.getIndexes()); + } + + @Test + public void testBackwardCompatDeserialization() + throws Exception { + // Simulate JSON from an old server that 
doesn't include columnCompressionStats + String oldJson = "{\"tableName\":\"oldTable\",\"diskSizeInBytes\":30000," + + "\"numSegments\":2,\"numRows\":500," + + "\"columnLengthMap\":{\"col\":4.0}," + + "\"columnCardinalityMap\":{\"col\":10.0}," + + "\"maxNumMultiValuesMap\":{}," + + "\"columnIndexSizeMap\":{}," + + "\"upsertPartitionToServerPrimaryKeyCountMap\":{}}"; + + TableMetadataInfo info = JsonUtils.stringToObject(oldJson, TableMetadataInfo.class); + assertNotNull(info); + assertEquals(info.getTableName(), "oldTable"); + assertEquals(info.getDiskSizeInBytes(), 30000); + // columnCompressionStats should be null (not present in old JSON) + assertNull(info.getColumnCompressionStats()); + } +} diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableRestletResource.java index 91985f5997ea..38f92894d76f 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableRestletResource.java @@ -1270,9 +1270,14 @@ public String getTableAggregateMetadata( TableConfig tableConfig = _pinotHelixResourceManager.getTableConfig(tableNameWithType); int numReplica = tableConfig == null ? 
1 : tableConfig.getReplication(); + // Check feature flag — suppress columnCompressionStats when disabled + boolean compressionStatsEnabled = tableConfig != null && tableConfig.getIndexingConfig() != null + && tableConfig.getIndexingConfig().isCompressionStatsEnabled(); + String segmentsMetadata; try { - JsonNode segmentsMetadataJson = getAggregateMetadataFromServer(tableNameWithType, columns, numReplica); + JsonNode segmentsMetadataJson = + getAggregateMetadataFromServer(tableNameWithType, columns, numReplica, compressionStatsEnabled); segmentsMetadata = JsonUtils.objectToPrettyString(segmentsMetadataJson); } catch (InvalidConfigException e) { throw new ControllerApplicationException(LOGGER, e.getMessage(), Response.Status.BAD_REQUEST); @@ -1322,9 +1327,14 @@ public String getTableAggregateMetadataDeprecated( } } + // Check feature flag — suppress columnCompressionStats when disabled + boolean compressionStatsEnabled = tableConfig != null && tableConfig.getIndexingConfig() != null + && tableConfig.getIndexingConfig().isCompressionStatsEnabled(); + try { JsonNode segmentsMetadataJson = - getAggregateMetadataFromServer(existingTableNameWithType, columnsList, numReplica); + getAggregateMetadataFromServer(existingTableNameWithType, columnsList, numReplica, + compressionStatsEnabled); return JsonUtils.objectToPrettyString(segmentsMetadataJson); } catch (InvalidConfigException e) { throw new ControllerApplicationException(LOGGER, e.getMessage(), Response.Status.BAD_REQUEST); @@ -1454,12 +1464,13 @@ private JsonNode getAggregateIndexMetadataFromServer(String tableNameWithType) * @param numReplica num or replica for the table * @return aggregated metadata of the table segments */ - private JsonNode getAggregateMetadataFromServer(String tableNameWithType, List columns, int numReplica) + private JsonNode getAggregateMetadataFromServer(String tableNameWithType, List columns, int numReplica, + boolean compressionStatsEnabled) throws InvalidConfigException, IOException { 
TableMetadataReader tableMetadataReader = new TableMetadataReader(_executor, _connectionManager, _pinotHelixResourceManager); return tableMetadataReader.getAggregateTableMetadata(tableNameWithType, columns, numReplica, - _controllerConf.getServerAdminRequestTimeoutSeconds() * 1000); + _controllerConf.getServerAdminRequestTimeoutSeconds() * 1000, compressionStatsEnabled); } @GET diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java index ab20f6fb7453..f352a3269889 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java @@ -500,6 +500,8 @@ protected void nonLeaderCleanup(List tableNamesWithType) { private void removeMetricsForTable(String tableNameWithType) { LOGGER.info("Removing metrics from {} given it is not a table known by Helix", tableNameWithType); + // Remove tier-suffixed gauges that use composite keys (tableName.tierKey) + _tableSizeReader.clearTierMetrics(tableNameWithType); for (ControllerGauge metric : ControllerGauge.values()) { if (!metric.isGlobal()) { _controllerMetrics.removeTableGauge(tableNameWithType, metric); diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java b/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java index a1669a2882b8..ca89cb6d2684 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java @@ -42,6 +42,9 @@ import org.apache.commons.lang3.tuple.Pair; import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; import org.apache.hc.client5.http.io.HttpClientConnectionManager; 
+import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; +import org.apache.pinot.common.restlet.resources.CompressionStatsSummary; +import org.apache.pinot.common.restlet.resources.StorageBreakdownInfo; import org.apache.pinot.common.restlet.resources.TableMetadataInfo; import org.apache.pinot.common.restlet.resources.TableSegments; import org.apache.pinot.common.restlet.resources.ValidDocIdsBitmapResponse; @@ -93,7 +96,8 @@ public ServerSegmentMetadataReader(Executor executor, HttpClientConnectionManage * table. */ public TableMetadataInfo getAggregatedTableMetadataFromServer(String tableNameWithType, - BiMap serverEndPoints, List columns, int numReplica, int timeoutMs) { + BiMap serverEndPoints, List columns, int numReplica, int timeoutMs, + boolean compressionStatsEnabled) { int numServers = serverEndPoints.size(); LOGGER.info("Reading aggregated segment metadata from {} servers for table: {} with timeout: {}ms", numServers, tableNameWithType, timeoutMs); @@ -120,6 +124,17 @@ public TableMetadataInfo getAggregatedTableMetadataFromServer(String tableNameWi final Map maxNumMultiValuesMap = new HashMap<>(); final Map> columnIndexSizeMap = new HashMap<>(); final Map> partitionToServerPrimaryKeyCountMap = new HashMap<>(); + // Per-column compression stats accumulators: [0]=uncompressed, [1]=compressed + final Map columnCompressionAccum = new HashMap<>(); + final Map columnCodecMap = new HashMap<>(); + final Map columnHasDictMap = new HashMap<>(); + final Map> columnIndexNamesMap = new HashMap<>(); + long aggRawSize = 0; + long aggCompressedSize = 0; + int aggSegmentsWithStats = 0; + int aggTotalSegments = 0; + boolean hasCompressionSummary = false; + final Map tierAccum = new HashMap<>(); // [count, size] for (Map.Entry streamResponse : serviceResponse._httpResponses.entrySet()) { try { TableMetadataInfo tableMetadataInfo = @@ -144,6 +159,50 @@ public TableMetadataInfo getAggregatedTableMetadataFromServer(String tableNameWi } return l; })); + // 
Aggregate per-column compression stats from server responses + List serverColStats = tableMetadataInfo.getColumnCompressionStats(); + if (serverColStats != null) { + for (ColumnCompressionStatsInfo info : serverColStats) { + // Skip columns with no meaningful compression data (old raw segments without persisted codec) + if (info.getCodec() == null && !info.hasDictionary()) { + continue; + } + String col = info.getColumn(); + long[] accum = columnCompressionAccum.computeIfAbsent(col, k -> new long[2]); + // Only accumulate uncompressed size when it is a real value (not the -1 sentinel from dict columns) + if (info.getUncompressedSizeInBytes() >= 0) { + accum[0] += info.getUncompressedSizeInBytes(); + } + accum[1] += info.getCompressedSizeInBytes(); + if (info.getCodec() != null) { + columnCodecMap.merge(col, info.getCodec(), + (existing, incoming) -> existing.equals(incoming) ? existing : "MIXED"); + } + columnHasDictMap.put(col, info.hasDictionary()); + if (info.getIndexes() != null) { + columnIndexNamesMap.computeIfAbsent(col, k -> new HashSet<>()).addAll(info.getIndexes()); + } + } + } + // Aggregate compressionStats summary (sum raw/compressed across servers) + CompressionStatsSummary serverSummary = tableMetadataInfo.getCompressionStats(); + if (serverSummary != null) { + aggRawSize += serverSummary.getRawForwardIndexSizePerReplicaInBytes(); + aggCompressedSize += serverSummary.getCompressedForwardIndexSizePerReplicaInBytes(); + aggSegmentsWithStats += serverSummary.getSegmentsWithStats(); + aggTotalSegments += serverSummary.getTotalSegments(); + hasCompressionSummary = true; + } + // Aggregate storageBreakdown (sum counts and sizes per tier) + StorageBreakdownInfo serverBreakdown = tableMetadataInfo.getStorageBreakdown(); + if (serverBreakdown != null && serverBreakdown.getTiers() != null) { + for (Map.Entry tierEntry + : serverBreakdown.getTiers().entrySet()) { + long[] vals = tierAccum.computeIfAbsent(tierEntry.getKey(), k -> new long[2]); + vals[0] += 
tierEntry.getValue().getCount(); + vals[1] += tierEntry.getValue().getSizePerReplicaInBytes(); + } + } } catch (IOException e) { failedParses++; LOGGER.error("Unable to parse server {} response due to an error: ", streamResponse.getKey(), e); @@ -165,9 +224,65 @@ public TableMetadataInfo getAggregatedTableMetadataFromServer(String tableNameWi totalNumSegments /= numReplica; totalNumRows /= numReplica; + // Build per-column compression stats list (divide by numReplica since each replica reports the same stats) + List columnCompressionStats = null; + if (!columnCompressionAccum.isEmpty()) { + columnCompressionStats = new ArrayList<>(); + for (Map.Entry entry : columnCompressionAccum.entrySet()) { + String col = entry.getKey(); + long[] accum = entry.getValue(); + boolean hasDictionary = Boolean.TRUE.equals(columnHasDictMap.get(col)); + // Dict columns have no uncompressed size; preserve -1 sentinel instead of dividing 0 + long uncompressed = (hasDictionary && accum[0] == 0) ? -1 : accum[0] / numReplica; + long compressed = accum[1] / numReplica; + double ratio = (uncompressed > 0 && compressed > 0) ? (double) uncompressed / compressed : 0; + Set idxNames = columnIndexNamesMap.get(col); + List indexes = idxNames != null ? new ArrayList<>(idxNames) : null; + columnCompressionStats.add(new ColumnCompressionStatsInfo( + col, uncompressed, compressed, ratio, columnCodecMap.get(col), hasDictionary, indexes)); + } + columnCompressionStats.sort((a, b) -> a.getColumn().compareTo(b.getColumn())); + } + + // Build aggregated compression summary (divide by numReplica to avoid double counting) + CompressionStatsSummary compressionStatsSummary = null; + if (hasCompressionSummary) { + long rawPerReplica = aggRawSize / numReplica; + long compressedPerReplica = aggCompressedSize / numReplica; + double ratio = (rawPerReplica > 0 && compressedPerReplica > 0) + ? 
(double) rawPerReplica / compressedPerReplica : 0; + int segmentsWithStats = aggSegmentsWithStats / numReplica; + int totalSegments = aggTotalSegments / numReplica; + if (segmentsWithStats > 0) { + boolean isPartialCoverage = segmentsWithStats < totalSegments; + compressionStatsSummary = new CompressionStatsSummary(rawPerReplica, compressedPerReplica, ratio, + segmentsWithStats, totalSegments, isPartialCoverage); + } + } + + // Build aggregated storage breakdown (divide by numReplica to avoid double counting) + StorageBreakdownInfo storageBreakdownInfo = null; + if (!tierAccum.isEmpty()) { + Map tiers = new HashMap<>(); + for (Map.Entry entry : tierAccum.entrySet()) { + int count = (int) (entry.getValue()[0] / numReplica); + long size = entry.getValue()[1] / numReplica; + tiers.put(entry.getKey(), new StorageBreakdownInfo.TierInfo(count, size)); + } + storageBreakdownInfo = new StorageBreakdownInfo(tiers); + } + + // When compression stats flag is OFF, suppress compressionStats and columnCompressionStats + // but always keep storageBreakdown (tier breakdown is independent of the compression stats flag) + if (!compressionStatsEnabled) { + columnCompressionStats = null; + compressionStatsSummary = null; + } + TableMetadataInfo aggregateTableMetadataInfo = new TableMetadataInfo(tableNameWithType, totalDiskSizeInBytes, totalNumSegments, totalNumRows, columnLengthMap, - columnCardinalityMap, maxNumMultiValuesMap, columnIndexSizeMap, partitionToServerPrimaryKeyCountMap); + columnCardinalityMap, maxNumMultiValuesMap, columnIndexSizeMap, partitionToServerPrimaryKeyCountMap, + columnCompressionStats, compressionStatsSummary, storageBreakdownInfo); if (failedParses != 0) { LOGGER.warn("Failed to parse {} / {} aggregated segment metadata responses from servers.", failedParses, serverUrls.size()); diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableMetadataReader.java 
b/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableMetadataReader.java index 628c917ff061..17665d0c94d4 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableMetadataReader.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableMetadataReader.java @@ -261,7 +261,7 @@ public JsonNode getSegmentMetadata(String tableNameWithType, String segmentName, * @return a map of segmentName to its metadata */ public JsonNode getAggregateTableMetadata(String tableNameWithType, List columns, int numReplica, - int timeoutMs) + int timeoutMs, boolean compressionStatsEnabled) throws InvalidConfigException { final Map> serverToSegments = _pinotHelixResourceManager.getServerToSegmentsMap(tableNameWithType); @@ -272,7 +272,7 @@ public JsonNode getAggregateTableMetadata(String tableNameWithType, List TableMetadataInfo aggregateTableMetadataInfo = serverSegmentMetadataReader.getAggregatedTableMetadataFromServer(tableNameWithType, endpoints, columns, - numReplica, timeoutMs); + numReplica, timeoutMs, compressionStatsEnabled); return JsonUtils.objectToJsonNode(aggregateTableMetadataInfo); } diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableSizeReader.java b/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableSizeReader.java index de9330289daa..1a8ef416e2a6 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableSizeReader.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/util/TableSizeReader.java @@ -19,14 +19,19 @@ package org.apache.pinot.controller.util; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.collect.BiMap; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; +import 
java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executor; import javax.annotation.Nonnegative; import javax.annotation.Nullable; @@ -35,6 +40,7 @@ import org.apache.pinot.common.metadata.ZKMetadataProvider; import org.apache.pinot.common.metrics.ControllerGauge; import org.apache.pinot.common.metrics.ControllerMetrics; +import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; import org.apache.pinot.common.restlet.resources.SegmentSizeInfo; import org.apache.pinot.controller.LeadControllerManager; import org.apache.pinot.controller.api.resources.ServerTableSizeReader; @@ -57,6 +63,8 @@ public class TableSizeReader { private final PinotHelixResourceManager _helixResourceManager; private final ControllerMetrics _controllerMetrics; private final LeadControllerManager _leadControllerManager; + // Tracks emitted tier keys per table so stale tier gauges can be removed + private final Map> _emittedTierKeys = new ConcurrentHashMap<>(); public TableSizeReader(Executor executor, HttpClientConnectionManager connectionManager, ControllerMetrics controllerMetrics, PinotHelixResourceManager helixResourceManager, @@ -125,6 +133,14 @@ public TableSizeDetails getTableSizeDetails(String tableName, @Nonnegative int t if (largestSegmentSizeOnServer != DEFAULT_SIZE_WHEN_MISSING_OR_ERROR) { emitMetrics(realtimeTableName, ControllerGauge.LARGEST_SEGMENT_SIZE_ON_SERVER, largestSegmentSizeOnServer); } + emitTierMetrics(realtimeTableName, tableSizeDetails._realtimeSegments._storageBreakdown); + if (isCompressionStatsEnabled(realtimeTableConfig)) { + emitCompressionMetrics(realtimeTableName, tableSizeDetails._realtimeSegments); + } else { + clearCompressionMetrics(realtimeTableName); + tableSizeDetails._realtimeSegments._compressionStats = null; + tableSizeDetails._realtimeSegments._columnCompressionStats = null; + } } if (hasOfflineTableConfig) { 
String offlineTableName = TableNameBuilder.OFFLINE.tableNameWithType(tableName); @@ -151,6 +167,14 @@ public TableSizeDetails getTableSizeDetails(String tableName, @Nonnegative int t if (largestSegmentSizeOnServer != DEFAULT_SIZE_WHEN_MISSING_OR_ERROR) { emitMetrics(offlineTableName, ControllerGauge.LARGEST_SEGMENT_SIZE_ON_SERVER, largestSegmentSizeOnServer); } + emitTierMetrics(offlineTableName, tableSizeDetails._offlineSegments._storageBreakdown); + if (isCompressionStatsEnabled(offlineTableConfig)) { + emitCompressionMetrics(offlineTableName, tableSizeDetails._offlineSegments); + } else { + clearCompressionMetrics(offlineTableName); + tableSizeDetails._offlineSegments._compressionStats = null; + tableSizeDetails._offlineSegments._columnCompressionStats = null; + } } // Set the top level sizes to DEFAULT_SIZE_WHEN_MISSING_OR_ERROR when all segments are error @@ -164,12 +188,87 @@ public TableSizeDetails getTableSizeDetails(String tableName, @Nonnegative int t return tableSizeDetails; } + private void emitCompressionMetrics(String tableNameWithType, TableSubTypeSizeDetails subTypeDetails) { + CompressionStats stats = subTypeDetails._compressionStats; + if (stats != null && stats._segmentsWithStats > 0 && stats._compressedForwardIndexSizePerReplicaInBytes > 0) { + emitMetrics(tableNameWithType, ControllerGauge.TABLE_RAW_FORWARD_INDEX_SIZE_PER_REPLICA, + stats._rawForwardIndexSizePerReplicaInBytes); + emitMetrics(tableNameWithType, ControllerGauge.TABLE_COMPRESSED_FORWARD_INDEX_SIZE_PER_REPLICA, + stats._compressedForwardIndexSizePerReplicaInBytes); + // Emit ratio * 100 to preserve two decimal digits of precision as a long gauge + long ratioPercent = Math.round(stats._compressionRatio * 100); + emitMetrics(tableNameWithType, ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT, ratioPercent); + } else { + // No segments have stats — clear any previously emitted stale metrics + clearCompressionMetrics(tableNameWithType); + } + } + + private void emitTierMetrics(String 
tableNameWithType, @Nullable StorageBreakdown breakdown) { + Set currentTierKeys = new HashSet<>(); + if (breakdown != null && _leadControllerManager.isLeaderForTable(tableNameWithType)) { + for (Map.Entry tierEntry : breakdown._tiers.entrySet()) { + String tierKey = tierEntry.getKey(); + currentTierKeys.add(tierKey); + _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, tierKey, + ControllerGauge.TABLE_TIERED_STORAGE_SIZE, tierEntry.getValue()._sizePerReplicaInBytes); + } + } + // Remove gauges for tier keys that were emitted previously but are no longer present. + // Only track tables that actually have tiers to avoid unnecessary map entries. + Set previousTierKeys; + if (currentTierKeys.isEmpty()) { + previousTierKeys = _emittedTierKeys.remove(tableNameWithType); + } else { + previousTierKeys = _emittedTierKeys.put(tableNameWithType, currentTierKeys); + } + if (previousTierKeys != null) { + for (String oldKey : previousTierKeys) { + if (!currentTierKeys.contains(oldKey)) { + if (_leadControllerManager.isLeaderForTable(tableNameWithType)) { + _controllerMetrics.removeTableGauge(tableNameWithType, oldKey, + ControllerGauge.TABLE_TIERED_STORAGE_SIZE); + } + } + } + } + } + + private void clearCompressionMetrics(String tableNameWithType) { + if (_leadControllerManager.isLeaderForTable(tableNameWithType)) { + _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_RAW_FORWARD_INDEX_SIZE_PER_REPLICA); + _controllerMetrics.removeTableGauge(tableNameWithType, + ControllerGauge.TABLE_COMPRESSED_FORWARD_INDEX_SIZE_PER_REPLICA); + _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT); + } + } + + /** + * Removes all tier-specific gauges previously emitted for the given table. + * Called from SegmentStatusChecker.removeMetricsForTable during both leader and non-leader cleanup, + * so no leader check is applied here (the caller decides when cleanup is appropriate). 
+ */ + public void clearTierMetrics(String tableNameWithType) { + Set previousTierKeys = _emittedTierKeys.remove(tableNameWithType); + if (previousTierKeys != null) { + for (String tierKey : previousTierKeys) { + _controllerMetrics.removeTableGauge(tableNameWithType, tierKey, + ControllerGauge.TABLE_TIERED_STORAGE_SIZE); + } + } + } + private void emitMetrics(String tableNameWithType, ControllerGauge controllerGauge, long value) { if (_leadControllerManager.isLeaderForTable(tableNameWithType)) { _controllerMetrics.setValueOfTableGauge(tableNameWithType, controllerGauge, value); } } + private static boolean isCompressionStatsEnabled(@Nullable TableConfig tableConfig) { + return tableConfig != null && tableConfig.getIndexingConfig() != null + && tableConfig.getIndexingConfig().isCompressionStatsEnabled(); + } + // // Reported size below indicates the sizes actually reported by servers on successful responses. // Estimated sizes indicates the size estimated size with approximated calculations for errored servers @@ -223,6 +322,21 @@ static public class TableSubTypeSizeDetails { @JsonProperty("reportedSizePerReplicaInBytes") public long _reportedSizePerReplicaInBytes = 0; + @Nullable + @JsonProperty("compressionStats") + @JsonInclude(JsonInclude.Include.NON_NULL) + public CompressionStats _compressionStats; + + @Nullable + @JsonProperty("columnCompressionStats") + @JsonInclude(JsonInclude.Include.NON_NULL) + public List _columnCompressionStats; + + @Nullable + @JsonProperty("storageBreakdown") + @JsonInclude(JsonInclude.Include.NON_NULL) + public StorageBreakdown _storageBreakdown; + @JsonProperty("segments") public Map _segments = new HashMap<>(); } @@ -243,6 +357,52 @@ static public class SegmentSizeDetails { public Map _serverInfo = new HashMap<>(); } + // Mutable accumulator used during per-server aggregation. Intentionally separate from the immutable + // CompressionStatsSummary DTO in pinot-common, which is only constructed once aggregation is complete. 
+ @JsonIgnoreProperties(ignoreUnknown = true) + public static class CompressionStats { + @JsonProperty("rawForwardIndexSizePerReplicaInBytes") + public long _rawForwardIndexSizePerReplicaInBytes = 0; + + @JsonProperty("compressedForwardIndexSizePerReplicaInBytes") + public long _compressedForwardIndexSizePerReplicaInBytes = 0; + + @JsonProperty("compressionRatio") + public double _compressionRatio = 0; + + @JsonProperty("segmentsWithStats") + public int _segmentsWithStats = 0; + + @JsonProperty("totalSegments") + public int _totalSegments = 0; + + @JsonProperty("isPartialCoverage") + public boolean _isPartialCoverage = false; + } + + @JsonIgnoreProperties(ignoreUnknown = true) + public static class TierSizeInfo { + @JsonProperty("count") + public int _count = 0; + + @JsonProperty("sizePerReplicaInBytes") + public long _sizePerReplicaInBytes = 0; + + public TierSizeInfo() { + } + + public TierSizeInfo(int count, long sizePerReplicaInBytes) { + _count = count; + _sizePerReplicaInBytes = sizePerReplicaInBytes; + } + } + + @JsonIgnoreProperties(ignoreUnknown = true) + public static class StorageBreakdown { + @JsonProperty("tiers") + public Map _tiers = new HashMap<>(); + } + public TableSubTypeSizeDetails getTableSubtypeSize(String tableNameWithType, int timeoutMs, boolean includeReplacedSegments) throws InvalidConfigException { @@ -293,22 +453,65 @@ public TableSubTypeSizeDetails getTableSubtypeSize(String tableNameWithType, int // segments are not reflected in that count. Estimated size is what we estimate in case of // errors, as described above. // estimatedSize >= reportedSize. 
If no server reported error, estimatedSize == reportedSize + CompressionStats compressionStats = new CompressionStats(); + StorageBreakdown storageBreakdown = new StorageBreakdown(); + // Per-column aggregation: accumulate across segments (max across replicas per segment, sum across segments) + Map columnAccum = new HashMap<>(); // [rawSize, compressedSize] + Map columnCodecAgg = new HashMap<>(); + Map columnDictMap = new HashMap<>(); + Map> columnIndexesMap = new HashMap<>(); List missingSegments = new ArrayList<>(); for (Map.Entry entry : segmentToSizeDetailsMap.entrySet()) { String segment = entry.getKey(); SegmentSizeDetails sizeDetails = entry.getValue(); - // Iterate over all segment size info, update reported size, track max segment size and number of errored servers + // Iterate over all segment size info: update reported size, track max segment size, + // count errored servers, and track max raw/compressed forward index sizes across replicas. sizeDetails._maxReportedSizePerReplicaInBytes = DEFAULT_SIZE_WHEN_MISSING_OR_ERROR; int errors = 0; + long maxRawFwdIndexSize = 0; + long maxCompressedFwdIndexSize = 0; + String segmentTier = null; + // Track per-column max stats across replicas for this segment + Map perColumnMax = new HashMap<>(); // [rawSize, compressedSize] + Map perColumnCodec = new HashMap<>(); for (SegmentSizeInfo sizeInfo : sizeDetails._serverInfo.values()) { if (sizeInfo.getDiskSizeInBytes() != DEFAULT_SIZE_WHEN_MISSING_OR_ERROR) { sizeDetails._reportedSizeInBytes += sizeInfo.getDiskSizeInBytes(); sizeDetails._maxReportedSizePerReplicaInBytes = Math.max(sizeDetails._maxReportedSizePerReplicaInBytes, sizeInfo.getDiskSizeInBytes()); + if (sizeInfo.getRawForwardIndexSizeBytes() > 0) { + maxRawFwdIndexSize = Math.max(maxRawFwdIndexSize, sizeInfo.getRawForwardIndexSizeBytes()); + } + if (sizeInfo.getCompressedForwardIndexSizeBytes() > 0) { + maxCompressedFwdIndexSize = + Math.max(maxCompressedFwdIndexSize, 
sizeInfo.getCompressedForwardIndexSizeBytes()); + } + if (sizeInfo.getTier() != null) { + segmentTier = sizeInfo.getTier(); + } + // Track per-column stats (max across replicas) + Map colStats = sizeInfo.getColumnCompressionStats(); + if (colStats != null) { + for (Map.Entry colEntry : colStats.entrySet()) { + String colName = colEntry.getKey(); + ColumnCompressionStatsInfo colInfo = colEntry.getValue(); + long[] maxVals = perColumnMax.computeIfAbsent(colName, k -> new long[2]); + if (colInfo.getUncompressedSizeInBytes() > 0) { + maxVals[0] = Math.max(maxVals[0], colInfo.getUncompressedSizeInBytes()); + } + if (colInfo.getCompressedSizeInBytes() > 0) { + maxVals[1] = Math.max(maxVals[1], colInfo.getCompressedSizeInBytes()); + } + if (colInfo.getCodec() != null) { + perColumnCodec.put(colName, colInfo.getCodec()); + } + } + } } else { errors++; } } + // Update estimated size, track segments that are missing from all servers if (errors != sizeDetails._serverInfo.size()) { // Use max segment size from other servers to estimate the segment size not reported @@ -317,6 +520,47 @@ public TableSubTypeSizeDetails getTableSubtypeSize(String tableNameWithType, int subTypeSizeDetails._reportedSizeInBytes += sizeDetails._reportedSizeInBytes; subTypeSizeDetails._estimatedSizeInBytes += sizeDetails._estimatedSizeInBytes; subTypeSizeDetails._reportedSizePerReplicaInBytes += sizeDetails._maxReportedSizePerReplicaInBytes; + + // Aggregate forward index compression stats (per-replica max) + if (maxRawFwdIndexSize > 0 && maxCompressedFwdIndexSize > 0) { + compressionStats._rawForwardIndexSizePerReplicaInBytes += maxRawFwdIndexSize; + compressionStats._compressedForwardIndexSizePerReplicaInBytes += maxCompressedFwdIndexSize; + compressionStats._segmentsWithStats++; + } + + // Accumulate per-column compression stats across segments + for (Map.Entry colEntry : perColumnMax.entrySet()) { + String colName = colEntry.getKey(); + long[] maxVals = colEntry.getValue(); + long[] accum = 
columnAccum.computeIfAbsent(colName, k -> new long[2]); + accum[0] += maxVals[0]; + accum[1] += maxVals[1]; + String segmentCodec = perColumnCodec.get(colName); + if (segmentCodec != null) { + columnCodecAgg.merge(colName, segmentCodec, + (existing, incoming) -> existing.equals(incoming) ? existing : "MIXED"); + } + } + // Track per-column dictionary/indexes from per-segment server info + for (SegmentSizeInfo sizeInfo : sizeDetails._serverInfo.values()) { + Map colStats = sizeInfo.getColumnCompressionStats(); + if (colStats != null) { + for (Map.Entry colEntry : colStats.entrySet()) { + ColumnCompressionStatsInfo colInfo = colEntry.getValue(); + columnDictMap.putIfAbsent(colEntry.getKey(), colInfo.hasDictionary()); + if (colInfo.getIndexes() != null) { + columnIndexesMap.computeIfAbsent(colEntry.getKey(), k -> new LinkedHashSet<>()) + .addAll(colInfo.getIndexes()); + } + } + } + } + + // Aggregate tier-based storage breakdown + String tierKey = segmentTier != null ? segmentTier : "default"; + TierSizeInfo tierInfo = storageBreakdown._tiers.computeIfAbsent(tierKey, k -> new TierSizeInfo()); + tierInfo._count++; + tierInfo._sizePerReplicaInBytes += sizeDetails._maxReportedSizePerReplicaInBytes; } else { // Segment is missing from all servers missingSegments.add(segment); @@ -327,6 +571,44 @@ public TableSubTypeSizeDetails getTableSubtypeSize(String tableNameWithType, int } } + // Compute compression ratio and coverage stats + compressionStats._totalSegments = segmentToSizeDetailsMap.size(); + int nonMissingSegments = compressionStats._totalSegments - subTypeSizeDetails._missingSegments; + compressionStats._isPartialCoverage = compressionStats._segmentsWithStats < nonMissingSegments; + if (compressionStats._compressedForwardIndexSizePerReplicaInBytes > 0) { + compressionStats._compressionRatio = + (double) compressionStats._rawForwardIndexSizePerReplicaInBytes + / compressionStats._compressedForwardIndexSizePerReplicaInBytes; + } + // Build per-column compression 
stats list from accumulated data + List columnStatsList = null; + if (!columnAccum.isEmpty()) { + columnStatsList = new ArrayList<>(); + for (Map.Entry colEntry : columnAccum.entrySet()) { + String colName = colEntry.getKey(); + long[] accum = colEntry.getValue(); + boolean hasDictionary = Boolean.TRUE.equals(columnDictMap.get(colName)); + long uncompressed = (hasDictionary && accum[0] == 0) ? -1 : accum[0]; + long compressed = accum[1]; + double ratio = (uncompressed > 0 && compressed > 0) ? (double) uncompressed / compressed : 0; + Set indexSet = columnIndexesMap.get(colName); + List indexes = (indexSet != null && !indexSet.isEmpty()) ? new ArrayList<>(indexSet) : null; + columnStatsList.add(new ColumnCompressionStatsInfo(colName, uncompressed, compressed, ratio, + columnCodecAgg.get(colName), hasDictionary, indexes)); + } + columnStatsList.sort((a, b) -> a.getColumn().compareTo(b.getColumn())); + } + subTypeSizeDetails._columnCompressionStats = columnStatsList; + + // Suppress table-level compression stats when no segments have raw forward index data, + // but keep per-column stats (dict columns may still have valid forward index size data) + if (compressionStats._segmentsWithStats > 0) { + subTypeSizeDetails._compressionStats = compressionStats; + } else { + subTypeSizeDetails._compressionStats = null; + } + subTypeSizeDetails._storageBreakdown = storageBreakdown._tiers.isEmpty() ? 
null : storageBreakdown; + // Update metrics for missing segments if (subTypeSizeDetails._missingSegments > 0) { int numSegments = segmentToSizeDetailsMap.size(); diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/api/ServerTableSizeReaderRawBytesTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/api/ServerTableSizeReaderRawBytesTest.java new file mode 100644 index 000000000000..6b4dc305736a --- /dev/null +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/api/ServerTableSizeReaderRawBytesTest.java @@ -0,0 +1,194 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.controller.api; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; +import org.apache.pinot.common.restlet.resources.SegmentSizeInfo; +import org.apache.pinot.common.restlet.resources.TableSizeInfo; +import org.apache.pinot.controller.api.resources.ServerTableSizeReader; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests that ServerTableSizeReader correctly deserializes SegmentSizeInfo with compression stats fields + * (rawForwardIndexSizeBytes, compressedForwardIndexSizeBytes, tier, columnCompressionStats). 
+ */ +public class ServerTableSizeReaderRawBytesTest { + private static final String URI_PATH = "/table/"; + private static final int TIMEOUT_MSEC = 5000; + private static final int PORT_WITH_STATS = 11100; + private static final int PORT_WITHOUT_STATS = 11101; + private static final int PORT_ERROR = 11102; + + private final ExecutorService _executor = Executors.newFixedThreadPool(2); + private final PoolingHttpClientConnectionManager _connectionManager = new PoolingHttpClientConnectionManager(); + private HttpServer _serverWithStats; + private HttpServer _serverWithoutStats; + private HttpServer _serverError; + + @BeforeClass + public void setUp() + throws IOException { + // Server with compression stats + Map colStats = new HashMap<>(); + colStats.put("col_a", new ColumnCompressionStatsInfo("col_a", 10000, 2000, 5.0, "LZ4", false, + List.of("forward_index"))); + colStats.put("col_b", new ColumnCompressionStatsInfo("col_b", 20000, 5000, 4.0, "ZSTANDARD", false, + List.of("forward_index"))); + + List statsSegments = Arrays.asList( + new SegmentSizeInfo("s1", 50000, 30000, 7000, "default", colStats), + new SegmentSizeInfo("s2", 40000, 15000, 3000, "tier1", null)); + TableSizeInfo statsTable = new TableSizeInfo("testTable", 90000, statsSegments); + + _serverWithStats = startServer(PORT_WITH_STATS, createHandler(200, statsTable)); + + // Server without compression stats (backward compat) + List noStatsSegments = Arrays.asList(new SegmentSizeInfo("s3", 60000)); + TableSizeInfo noStatsTable = new TableSizeInfo("testTable", 60000, noStatsSegments); + _serverWithoutStats = startServer(PORT_WITHOUT_STATS, createHandler(200, noStatsTable)); + + // Server returning 500 + _serverError = startServer(PORT_ERROR, createHandler(500, null)); + } + + @AfterClass + public void tearDown() { + if (_serverWithStats != null) { + _serverWithStats.stop(0); + } + if (_serverWithoutStats != null) { + _serverWithoutStats.stop(0); + } + if (_serverError != null) { + _serverError.stop(0); + } 
+ } + + @Test + public void testDeserializesNewFields() { + ServerTableSizeReader reader = new ServerTableSizeReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("server0", "http://localhost:" + PORT_WITH_STATS); + + Map> result = + reader.getSegmentSizeInfoFromServers(endpoints, "testTable", TIMEOUT_MSEC); + assertEquals(result.size(), 1); + + List segments = result.get("server0"); + assertNotNull(segments); + assertEquals(segments.size(), 2); + + // s1 has compression stats + SegmentSizeInfo s1 = segments.get(0); + assertEquals(s1.getSegmentName(), "s1"); + assertEquals(s1.getDiskSizeInBytes(), 50000); + assertEquals(s1.getRawForwardIndexSizeBytes(), 30000); + assertEquals(s1.getCompressedForwardIndexSizeBytes(), 7000); + assertEquals(s1.getTier(), "default"); + + Map colStats = s1.getColumnCompressionStats(); + assertNotNull(colStats); + assertEquals(colStats.size(), 2); + assertEquals(colStats.get("col_a").getColumn(), "col_a"); + assertEquals(colStats.get("col_a").getUncompressedSizeInBytes(), 10000); + assertEquals(colStats.get("col_a").getCompressedSizeInBytes(), 2000); + assertEquals(colStats.get("col_a").getCompressionRatio(), 5.0, 0.01); + assertEquals(colStats.get("col_a").getCodec(), "LZ4"); + assertFalse(colStats.get("col_a").hasDictionary()); + + // s2 has tier but no column stats + SegmentSizeInfo s2 = segments.get(1); + assertEquals(s2.getTier(), "tier1"); + assertEquals(s2.getRawForwardIndexSizeBytes(), 15000); + } + + @Test + public void testBackwardCompatWithoutNewFields() { + ServerTableSizeReader reader = new ServerTableSizeReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("server1", "http://localhost:" + PORT_WITHOUT_STATS); + + Map> result = + reader.getSegmentSizeInfoFromServers(endpoints, "testTable", TIMEOUT_MSEC); + assertEquals(result.size(), 1); + + List segments = result.get("server1"); + assertNotNull(segments); + assertEquals(segments.size(), 1); 
+ + SegmentSizeInfo s3 = segments.get(0); + assertEquals(s3.getSegmentName(), "s3"); + assertEquals(s3.getDiskSizeInBytes(), 60000); + // Default values for missing fields (-1 indicates not available) + assertEquals(s3.getRawForwardIndexSizeBytes(), -1); + assertEquals(s3.getCompressedForwardIndexSizeBytes(), -1); + } + + @Test + public void testErrorServerExcluded() { + ServerTableSizeReader reader = new ServerTableSizeReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("server0", "http://localhost:" + PORT_WITH_STATS); + endpoints.put("server_err", "http://localhost:" + PORT_ERROR); + + Map> result = + reader.getSegmentSizeInfoFromServers(endpoints, "testTable", TIMEOUT_MSEC); + // Error server should be excluded + assertTrue(result.containsKey("server0")); + assertFalse(result.containsKey("server_err")); + } + + private HttpHandler createHandler(int status, TableSizeInfo tableSize) { + return httpExchange -> { + String json = tableSize != null ? 
JsonUtils.objectToString(tableSize) : "error"; + httpExchange.sendResponseHeaders(status, json.length()); + OutputStream responseBody = httpExchange.getResponseBody(); + responseBody.write(json.getBytes()); + responseBody.close(); + }; + } + + private HttpServer startServer(int port, HttpHandler handler) + throws IOException { + HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); + server.createContext(URI_PATH, handler); + new Thread(server::start).start(); + return server; + } +} diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableMetadataReaderCompressionTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableMetadataReaderCompressionTest.java new file mode 100644 index 000000000000..322792cfd36d --- /dev/null +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableMetadataReaderCompressionTest.java @@ -0,0 +1,302 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.controller.api; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; +import org.apache.pinot.common.restlet.resources.CompressionStatsSummary; +import org.apache.pinot.common.restlet.resources.StorageBreakdownInfo; +import org.apache.pinot.common.restlet.resources.TableMetadataInfo; +import org.apache.pinot.controller.util.ServerSegmentMetadataReader; +import org.apache.pinot.spi.utils.JsonUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests that per-column compression stats are correctly aggregated across servers in the metadata endpoint + * (ServerSegmentMetadataReader.getAggregatedTableMetadataFromServer). 
+ */ +public class TableMetadataReaderCompressionTest { + private static final int PORT_SERVER0 = 11200; + private static final int PORT_SERVER1 = 11201; + private static final int TIMEOUT_MSEC = 10000; + private static final int NUM_REPLICAS = 2; + + private final ExecutorService _executor = Executors.newFixedThreadPool(2); + private final PoolingHttpClientConnectionManager _connectionManager = new PoolingHttpClientConnectionManager(); + private HttpServer _httpServer0; + private HttpServer _httpServer1; + + @BeforeClass + public void setUp() + throws IOException { + // Server 0: has compression stats for col_a and col_b + List colStats0 = new ArrayList<>(); + colStats0.add(new ColumnCompressionStatsInfo("col_a", 10000, 2000, 5.0, "LZ4", false, + List.of("forward_index"))); + colStats0.add(new ColumnCompressionStatsInfo("col_b", 20000, 5000, 4.0, "ZSTANDARD", false, + List.of("forward_index", "inverted_index"))); + + TableMetadataInfo server0Info = new TableMetadataInfo("testTable_OFFLINE", 50000, 3, 1000, + Map.of("col_a", 4.0, "col_b", 100.0), + Map.of("col_a", 50.0, "col_b", 200.0), + Map.of(), Map.of(), Map.of(), colStats0); + + _httpServer0 = startServer(PORT_SERVER0, createHandler(server0Info)); + + // Server 1 (replica): same compression stats + List colStats1 = new ArrayList<>(); + colStats1.add(new ColumnCompressionStatsInfo("col_a", 10000, 2000, 5.0, "LZ4", false, + List.of("forward_index"))); + colStats1.add(new ColumnCompressionStatsInfo("col_b", 20000, 5000, 4.0, "ZSTANDARD", false, + List.of("forward_index", "inverted_index"))); + + TableMetadataInfo server1Info = new TableMetadataInfo("testTable_OFFLINE", 50000, 3, 1000, + Map.of("col_a", 4.0, "col_b", 100.0), + Map.of("col_a", 50.0, "col_b", 200.0), + Map.of(), Map.of(), Map.of(), colStats1); + + _httpServer1 = startServer(PORT_SERVER1, createHandler(server1Info)); + } + + @AfterClass + public void tearDown() { + if (_httpServer0 != null) { + _httpServer0.stop(0); + } + if (_httpServer1 != null) { 
+ _httpServer1.stop(0); + } + } + + @Test + public void testColumnCompressionStatsAggregation() { + ServerSegmentMetadataReader reader = new ServerSegmentMetadataReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("server0", "http://localhost:" + PORT_SERVER0); + endpoints.put("server1", "http://localhost:" + PORT_SERVER1); + + TableMetadataInfo result = reader.getAggregatedTableMetadataFromServer( + "testTable_OFFLINE", endpoints, null, NUM_REPLICAS, TIMEOUT_MSEC, true); + + assertNotNull(result); + // Disk size divided by replicas: (50000+50000) / 2 = 50000 + assertEquals(result.getDiskSizeInBytes(), 50000); + + // Per-column compression stats should be aggregated and divided by replicas + List colStats = result.getColumnCompressionStats(); + assertNotNull(colStats); + assertEquals(colStats.size(), 2); + + // Results are sorted by column name + ColumnCompressionStatsInfo colA = colStats.get(0); + assertNotNull(colA); + assertEquals(colA.getColumn(), "col_a"); + // (10000+10000)/2 = 10000 uncompressed, (2000+2000)/2 = 2000 compressed + assertEquals(colA.getUncompressedSizeInBytes(), 10000); + assertEquals(colA.getCompressedSizeInBytes(), 2000); + assertEquals(colA.getCompressionRatio(), 5.0, 0.01); + assertEquals(colA.getCodec(), "LZ4"); + assertFalse(colA.hasDictionary()); + + ColumnCompressionStatsInfo colB = colStats.get(1); + assertNotNull(colB); + assertEquals(colB.getColumn(), "col_b"); + // (20000+20000)/2 = 20000 uncompressed, (5000+5000)/2 = 5000 compressed + assertEquals(colB.getUncompressedSizeInBytes(), 20000); + assertEquals(colB.getCompressedSizeInBytes(), 5000); + assertEquals(colB.getCompressionRatio(), 4.0, 0.01); + assertEquals(colB.getCodec(), "ZSTANDARD"); + assertFalse(colB.hasDictionary()); + } + + @Test + public void testNoCompressionStatsFromServers() { + // Server with no compression stats (old server) + ServerSegmentMetadataReader reader = new ServerSegmentMetadataReader(_executor, 
_connectionManager); + + // Create a temporary server without compression stats + HttpServer noStatsServer = null; + try { + TableMetadataInfo noStatsInfo = new TableMetadataInfo("testTable_OFFLINE", 30000, 2, 500, + Map.of("col_a", 4.0), Map.of("col_a", 50.0), Map.of(), Map.of(), Map.of()); + noStatsServer = startServer(11210, createHandler(noStatsInfo)); + + BiMap endpoints = HashBiMap.create(); + endpoints.put("old_server", "http://localhost:11210"); + + TableMetadataInfo result = reader.getAggregatedTableMetadataFromServer( + "testTable_OFFLINE", endpoints, null, 1, TIMEOUT_MSEC, true); + + assertNotNull(result); + // No compression stats should result in null list + assertNull(result.getColumnCompressionStats()); + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + if (noStatsServer != null) { + noStatsServer.stop(0); + } + } + } + + @Test + public void testCompressionStatsSuppressedWhenFlagOff() { + ServerSegmentMetadataReader reader = new ServerSegmentMetadataReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("server0", "http://localhost:" + PORT_SERVER0); + endpoints.put("server1", "http://localhost:" + PORT_SERVER1); + + // Flag OFF: compression stats and columnCompressionStats should be null, + // but storageBreakdown should still be preserved + TableMetadataInfo result = reader.getAggregatedTableMetadataFromServer( + "testTable_OFFLINE", endpoints, null, NUM_REPLICAS, TIMEOUT_MSEC, false); + + assertNotNull(result); + assertNull(result.getColumnCompressionStats(), "columnCompressionStats should be null when flag is OFF"); + assertNull(result.getCompressionStats(), "compressionStats should be null when flag is OFF"); + // storageBreakdown is always-on; it is null here only because test servers don't send it + } + + @Test + public void testCompressionSummaryAndStorageBreakdownAggregation() + throws IOException { + // Build a server response that includes CompressionStatsSummary and 
StorageBreakdownInfo + Map tiers = new HashMap<>(); + tiers.put("default", new StorageBreakdownInfo.TierInfo(3, 150000)); + tiers.put("cold", new StorageBreakdownInfo.TierInfo(1, 60000)); + StorageBreakdownInfo breakdown = new StorageBreakdownInfo(tiers); + + List colStats = new ArrayList<>(); + colStats.add(new ColumnCompressionStatsInfo("col_a", 20000, 4000, 5.0, "LZ4", false, null)); + + CompressionStatsSummary summary = new CompressionStatsSummary(20000, 4000, 5.0, 3, 3, false); + + TableMetadataInfo info = new TableMetadataInfo("testTable_OFFLINE", 200000, 4, 2000, + Map.of("col_a", 4.0), Map.of("col_a", 50.0), + Map.of(), Map.of(), Map.of(), colStats, summary, breakdown); + + HttpServer summaryServer = startServer(11215, createHandler(info)); + try { + ServerSegmentMetadataReader reader = new ServerSegmentMetadataReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("srv", "http://localhost:11215"); + + TableMetadataInfo result = reader.getAggregatedTableMetadataFromServer( + "testTable_OFFLINE", endpoints, null, 1, TIMEOUT_MSEC, true); + + assertNotNull(result); + + // CompressionStatsSummary should be aggregated and returned + CompressionStatsSummary resultSummary = result.getCompressionStats(); + assertNotNull(resultSummary, "compressionStats should be aggregated from server response"); + assertEquals(resultSummary.getRawForwardIndexSizePerReplicaInBytes(), 20000); + assertEquals(resultSummary.getCompressedForwardIndexSizePerReplicaInBytes(), 4000); + assertEquals(resultSummary.getCompressionRatio(), 5.0, 0.01); + assertEquals(resultSummary.getSegmentsWithStats(), 3); + assertEquals(resultSummary.getTotalSegments(), 3); + assertFalse(resultSummary.isPartialCoverage()); + + // StorageBreakdownInfo should be aggregated and divided by numReplica (1 here) + StorageBreakdownInfo resultBreakdown = result.getStorageBreakdown(); + assertNotNull(resultBreakdown, "storageBreakdown should be aggregated from server response"); 
+ assertNotNull(resultBreakdown.getTiers()); + assertEquals(resultBreakdown.getTiers().size(), 2); + StorageBreakdownInfo.TierInfo defaultTier = resultBreakdown.getTiers().get("default"); + assertNotNull(defaultTier); + assertEquals(defaultTier.getCount(), 3); + assertEquals(defaultTier.getSizePerReplicaInBytes(), 150000); + } finally { + summaryServer.stop(0); + } + } + + @Test + public void testDictColumnSentinelAndSkipPath() + throws IOException { + // Dict column: uncompressed=-1 sentinel, codec=null, hasDictionary=true → preserved + // Old raw column: uncompressed=0, codec=null, hasDictionary=false → skipped + List colStats = new ArrayList<>(); + colStats.add(new ColumnCompressionStatsInfo("dict_col", -1, 8000, 0.0, null, true, + List.of("forward_index"))); + colStats.add(new ColumnCompressionStatsInfo("old_raw_col", 0, 5000, 0.0, null, false, null)); + + TableMetadataInfo info = new TableMetadataInfo("testTable_OFFLINE", 100000, 2, 1000, + Map.of(), Map.of(), Map.of(), Map.of(), Map.of(), colStats); + + HttpServer server = startServer(11216, createHandler(info)); + try { + ServerSegmentMetadataReader reader = new ServerSegmentMetadataReader(_executor, _connectionManager); + BiMap endpoints = HashBiMap.create(); + endpoints.put("srv", "http://localhost:11216"); + + TableMetadataInfo result = reader.getAggregatedTableMetadataFromServer( + "testTable_OFFLINE", endpoints, null, 1, TIMEOUT_MSEC, true); + + assertNotNull(result); + List stats = result.getColumnCompressionStats(); + assertNotNull(stats); + // old_raw_col (codec=null, hasDictionary=false) must be skipped + assertEquals(stats.size(), 1); + ColumnCompressionStatsInfo dictColInfo = stats.get(0); + assertEquals(dictColInfo.getColumn(), "dict_col"); + // dict column: sentinel -1 preserved (not divided as 0) + assertEquals(dictColInfo.getUncompressedSizeInBytes(), -1); + assertEquals(dictColInfo.getCompressedSizeInBytes(), 8000); + assertTrue(dictColInfo.hasDictionary()); + } finally { + server.stop(0); + 
} + } + + private HttpHandler createHandler(TableMetadataInfo info) { + return httpExchange -> { + String json = JsonUtils.objectToString(info); + httpExchange.sendResponseHeaders(200, json.length()); + OutputStream responseBody = httpExchange.getResponseBody(); + responseBody.write(json.getBytes()); + responseBody.close(); + }; + } + + private HttpServer startServer(int port, HttpHandler handler) + throws IOException { + HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); + server.createContext("/tables/", handler); + new Thread(server::start).start(); + return server; + } +} diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableSizeReaderCompressionStatsTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableSizeReaderCompressionStatsTest.java new file mode 100644 index 000000000000..481cbda7e642 --- /dev/null +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/api/TableSizeReaderCompressionStatsTest.java @@ -0,0 +1,355 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.controller.api; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.sun.net.httpserver.HttpHandler; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.io.HttpClientConnectionManager; +import org.apache.helix.AccessOption; +import org.apache.helix.store.zk.ZkHelixPropertyStore; +import org.apache.pinot.common.exception.InvalidConfigException; +import org.apache.pinot.common.metrics.ControllerGauge; +import org.apache.pinot.common.metrics.ControllerMetrics; +import org.apache.pinot.common.metrics.MetricValueUtils; +import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; +import org.apache.pinot.common.restlet.resources.SegmentSizeInfo; +import org.apache.pinot.common.restlet.resources.TableSizeInfo; +import org.apache.pinot.common.utils.config.TableConfigSerDeUtils; +import org.apache.pinot.controller.LeadControllerManager; +import org.apache.pinot.controller.helix.core.PinotHelixResourceManager; +import org.apache.pinot.controller.util.TableSizeReader; +import org.apache.pinot.controller.utils.FakeHttpServer; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.metrics.PinotMetricUtils; +import org.apache.pinot.spi.utils.JsonUtils; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.mockito.ArgumentMatchers; +import org.mockito.stubbing.Answer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static 
org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.Mockito.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.*; + + +/** + * Tests compression stats aggregation in TableSizeReader. + */ +public class TableSizeReaderCompressionStatsTest { + private static final String URI_PATH = "/table/"; + private static final int TIMEOUT_MSEC = 10000; + private static final int NUM_REPLICAS = 2; + + private final Executor _executor = Executors.newFixedThreadPool(1); + private final HttpClientConnectionManager _connectionManager = new PoolingHttpClientConnectionManager(); + private final ControllerMetrics _controllerMetrics = + new ControllerMetrics(PinotMetricUtils.getPinotMetricsRegistry()); + private final Map _serverMap = new HashMap<>(); + private PinotHelixResourceManager _helix; + private LeadControllerManager _leadControllerManager; + + @BeforeClass + public void setUp() + throws IOException { + _helix = mock(PinotHelixResourceManager.class); + _leadControllerManager = mock(LeadControllerManager.class); + + TableConfig tableConfig = + new TableConfigBuilder(TableType.OFFLINE).setTableName("compressionTable").setNumReplicas(NUM_REPLICAS).build(); + tableConfig.getIndexingConfig().setCompressionStatsEnabled(true); + + TableConfig flagOffTableConfig = + new TableConfigBuilder(TableType.OFFLINE).setTableName("flagOffTable").setNumReplicas(NUM_REPLICAS).build(); + // compressionStatsEnabled defaults to false — do NOT enable it + + ZkHelixPropertyStore mockPropertyStore = mock(ZkHelixPropertyStore.class); + + when(mockPropertyStore.get(ArgumentMatchers.anyString(), ArgumentMatchers.eq(null), + ArgumentMatchers.eq(AccessOption.PERSISTENT))).thenAnswer((Answer) invocationOnMock -> { + String path = (String) invocationOnMock.getArguments()[0]; + if (path.contains("offline_OFFLINE")) { + return TableConfigSerDeUtils.toZNRecord(tableConfig); + 
} + if (path.contains("flagOffTable_OFFLINE")) { + return TableConfigSerDeUtils.toZNRecord(flagOffTableConfig); + } + return null; + }); + + when(_helix.getPropertyStore()).thenReturn(mockPropertyStore); + when(_helix.getNumReplicas(any(TableConfig.class))).thenReturn(NUM_REPLICAS); + when(_leadControllerManager.isLeaderForTable(anyString())).thenReturn(true); + + // server0: segment s1 and s2 with compression stats + Map s1ColStats = new HashMap<>(); + s1ColStats.put("col_a", new ColumnCompressionStatsInfo("col_a", 10000, 2000, 5.0, "LZ4", false, null)); + s1ColStats.put("col_b", new ColumnCompressionStatsInfo("col_b", 20000, 5000, 4.0, "ZSTANDARD", false, null)); + + Map s2ColStats = new HashMap<>(); + s2ColStats.put("col_a", new ColumnCompressionStatsInfo("col_a", 15000, 3000, 5.0, "LZ4", false, null)); + + List server0Sizes = Arrays.asList( + new SegmentSizeInfo("s1", 50000, 30000, 7000, "default", s1ColStats), + new SegmentSizeInfo("s2", 40000, 15000, 3000, "default", s2ColStats)); + FakeCompressionServer s0 = new FakeCompressionServer(Arrays.asList("s1", "s2"), server0Sizes); + s0.start(URI_PATH, createHandler(200, server0Sizes)); + _serverMap.put("server0", s0); + + // server1: segment s1 and s2 (replica) with same stats + List server1Sizes = Arrays.asList( + new SegmentSizeInfo("s1", 50000, 30000, 7000, "default", s1ColStats), + new SegmentSizeInfo("s2", 40000, 15000, 3000, "default", s2ColStats)); + FakeCompressionServer s1 = new FakeCompressionServer(Arrays.asList("s1", "s2"), server1Sizes); + s1.start(URI_PATH, createHandler(200, server1Sizes)); + _serverMap.put("server1", s1); + + // server2: segment s3 without compression stats (old server) + List server2Sizes = Arrays.asList(new SegmentSizeInfo("s3", 60000)); + FakeCompressionServer s2 = new FakeCompressionServer(Arrays.asList("s3"), server2Sizes); + s2.start(URI_PATH, createHandler(200, server2Sizes)); + _serverMap.put("server2", s2); + } + + @AfterClass + public void tearDown() { + for 
(FakeCompressionServer server : _serverMap.values()) { + server.stop(); + } + } + + /** + * Builds a handler that replies with a {@link TableSizeInfo} for "compressionTable" whose reported size is the + * sum of the given segments' disk sizes. The Content-Length header is taken from the UTF-8 encoded payload + * rather than from the character count, so it always matches the number of bytes actually written. + */ + private HttpHandler createHandler(int status, List<SegmentSizeInfo> segmentSizes) { + return httpExchange -> { + long tableSizeInBytes = 0; + for (SegmentSizeInfo segmentSize : segmentSizes) { + tableSizeInBytes += segmentSize.getDiskSizeInBytes(); + } + TableSizeInfo tableInfo = new TableSizeInfo("compressionTable", tableSizeInBytes, segmentSizes); + byte[] payload = JsonUtils.objectToString(tableInfo).getBytes(java.nio.charset.StandardCharsets.UTF_8); + httpExchange.sendResponseHeaders(status, payload.length); + // try-with-resources closes the response body even when the write fails + try (OutputStream responseBody = httpExchange.getResponseBody()) { + responseBody.write(payload); + } + }; + } + + /** Fake server that remembers the segment names and the per-segment size info it was created with. */ + private static class FakeCompressionServer extends FakeHttpServer { + final List<String> _segments; + final List<SegmentSizeInfo> _sizes; + + FakeCompressionServer(List<String> segments, List<SegmentSizeInfo> sizes) { + _segments = segments; + _sizes = sizes; + } + } + + /** + * Stubs the Helix mock so that exactly the given servers are returned as segment hosts and admin endpoints, + * then reads the size details of the given table through a new {@link TableSizeReader}. + */ + private TableSizeReader.TableSizeDetails testRunner(String[] servers, String table) + throws InvalidConfigException { + when(_helix.getServerToSegmentsMap(anyString(), any(), anyBoolean())).thenAnswer( + (Answer<Map<String, List<String>>>) invocation -> { + Map<String, List<String>> map = new HashMap<>(); + for (String server : servers) { + map.put(server, _serverMap.get(server)._segments); + } + return map; + }); + + when(_helix.getDataInstanceAdminEndpoints(ArgumentMatchers.anySet())).thenAnswer( + (Answer<BiMap<String, String>>) invocation -> { + BiMap<String, String> endpoints = HashBiMap.create(servers.length); + for (String server : servers) { + endpoints.put(server, _serverMap.get(server)._endpoint); + } + return endpoints; + }); + + TableSizeReader reader = + new TableSizeReader(_executor, _connectionManager, _controllerMetrics, _helix, _leadControllerManager); + return reader.getTableSizeDetails(table, TIMEOUT_MSEC, true); + } + + @Test + public void testCompressionStatsAggregation() + throws InvalidConfigException { + String[] servers = {"server0", "server1"}; + TableSizeReader.TableSizeDetails details = testRunner(servers, "offline"); + +
TableSizeReader.TableSubTypeSizeDetails offlineDetails = details._offlineSegments; + assertNotNull(offlineDetails); + + // s1: rawFwdIdx=30000, compressedFwdIdx=7000 (max across replicas) + // s2: rawFwdIdx=15000, compressedFwdIdx=3000 (max across replicas) + // Total per replica: raw=45000, compressed=10000 + TableSizeReader.CompressionStats cs = offlineDetails._compressionStats; + assertNotNull(cs); + assertEquals(cs._rawForwardIndexSizePerReplicaInBytes, 45000); + assertEquals(cs._compressedForwardIndexSizePerReplicaInBytes, 10000); + + // Compression ratio = 45000 / 10000 = 4.5 + assertEquals(cs._compressionRatio, 4.5, 0.01); + + // Both segments have stats + assertEquals(cs._segmentsWithStats, 2); + assertEquals(cs._totalSegments, 2); + assertFalse(cs._isPartialCoverage); + + // Verify compression metrics emitted + String tableNameWithType = TableNameBuilder.OFFLINE.tableNameWithType("offline"); + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_RAW_FORWARD_INDEX_SIZE_PER_REPLICA), 45000); + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_COMPRESSED_FORWARD_INDEX_SIZE_PER_REPLICA), 10000); + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT), 450); + + // Verify per-column compression stats aggregation (now top-level on TableSubTypeSizeDetails) + // s1: col_a(raw=10000, compressed=2000), col_b(raw=20000, compressed=5000) + // s2: col_a(raw=15000, compressed=3000) + // Aggregated: col_a: raw=10000+15000=25000, compressed=2000+3000=5000 + // col_b: raw=20000, compressed=5000 + List colStats = offlineDetails._columnCompressionStats; + assertNotNull(colStats, "Per-column compression stats should be present"); + assertFalse(colStats.isEmpty(), "Per-column compression stats should not be empty"); + // List is sorted by column name: col_a, col_b + 
ColumnCompressionStatsInfo colA = colStats.get(0); + assertEquals(colA.getColumn(), "col_a"); + assertEquals(colA.getUncompressedSizeInBytes(), 25000); + assertEquals(colA.getCompressedSizeInBytes(), 5000); + assertEquals(colA.getCompressionRatio(), 5.0, 0.01); + assertEquals(colA.getCodec(), "LZ4"); + + ColumnCompressionStatsInfo colB = colStats.get(1); + assertEquals(colB.getColumn(), "col_b"); + assertEquals(colB.getUncompressedSizeInBytes(), 20000); + assertEquals(colB.getCompressedSizeInBytes(), 5000); + assertEquals(colB.getCompressionRatio(), 4.0, 0.01); + assertEquals(colB.getCodec(), "ZSTANDARD"); + + // Verify storageBreakdown is present + assertNotNull(offlineDetails._storageBreakdown); + assertFalse(offlineDetails._storageBreakdown._tiers.isEmpty()); + TableSizeReader.TierSizeInfo defaultTier = offlineDetails._storageBreakdown._tiers.get("default"); + assertNotNull(defaultTier, "default tier should be present"); + assertEquals(defaultTier._count, 2, "Should have 2 segments in default tier"); + assertTrue(defaultTier._sizePerReplicaInBytes > 0, "Tier size should be > 0"); + } + + @Test + public void testPartialCompressionCoverage() + throws InvalidConfigException { + // Mix of servers with and without compression stats + String[] servers = {"server0", "server1", "server2"}; + TableSizeReader.TableSizeDetails details = testRunner(servers, "offline"); + + TableSizeReader.TableSubTypeSizeDetails offlineDetails = details._offlineSegments; + assertNotNull(offlineDetails); + + // s1 and s2 have compression stats, s3 does not + TableSizeReader.CompressionStats cs = offlineDetails._compressionStats; + assertNotNull(cs); + assertEquals(cs._segmentsWithStats, 2); + assertEquals(cs._totalSegments, 3); + assertTrue(cs._isPartialCoverage); + + // Compression ratio still computed from segments that have stats + assertEquals(cs._rawForwardIndexSizePerReplicaInBytes, 45000); + assertEquals(cs._compressedForwardIndexSizePerReplicaInBytes, 10000); + 
assertEquals(cs._compressionRatio, 4.5, 0.01); + } + + /** + * Queries only "server2" and expects the offline details to carry no compression stats object. + */ + @Test + public void testNoCompressionStats() + throws InvalidConfigException { + // Only old server without compression stats — compressionStats should be null + // since no segments have stats and no per-column stats exist + String[] servers = {"server2"}; + TableSizeReader.TableSizeDetails details = testRunner(servers, "offline"); + + TableSizeReader.TableSubTypeSizeDetails offlineDetails = details._offlineSegments; + assertNotNull(offlineDetails); + + assertNull(offlineDetails._compressionStats, + "compressionStats should be null when no segments have compression data"); + } + + /** + * Runs the reader first against "server0"/"server1" and then against "server2" only, checking that the + * TABLE_COMPRESSION_RATIO_PERCENT gauge reads 450 after the first run and 0 after the second. + */ + @Test + public void testStaleMetricsClearedWhenNoStats() + throws InvalidConfigException { + // First run with servers that have stats to emit metrics + String[] serversWithStats = {"server0", "server1"}; + testRunner(serversWithStats, "offline"); + + String tableNameWithType = TableNameBuilder.OFFLINE.tableNameWithType("offline"); + // Verify metrics were emitted + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT), 450); + + // Now run with only old server (no stats) — stale metrics should be cleared + String[] serversNoStats = {"server2"}; + testRunner(serversNoStats, "offline"); + + // Metrics should be cleared (0 means removed) + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT), 0); + } + + /** + * Reads the size details of "flagOffTable" from "server0"/"server1" and checks what is reported for it. + */ + @Test + public void testCompressionStatsNullWhenFlagOff() + throws InvalidConfigException { + // Use servers with compression stats but with compressionStatsEnabled=false on the table config + String[] servers = {"server0", "server1"}; + TableSizeReader.TableSizeDetails details = testRunner(servers, "flagOffTable"); + + TableSizeReader.TableSubTypeSizeDetails offlineDetails = details._offlineSegments; + assertNotNull(offlineDetails); + + // compressionStats should be
null when the flag is OFF (suppressed from JSON via @JsonInclude NON_NULL) + assertNull(offlineDetails._compressionStats, + "compressionStats should be null when compressionStatsEnabled is false"); + + // columnCompressionStats should also be null when flag is OFF + assertNull(offlineDetails._columnCompressionStats, + "columnCompressionStats should be null when compressionStatsEnabled is false"); + + // storageBreakdown is always reported regardless of compression flag + assertNotNull(offlineDetails._storageBreakdown, + "storageBreakdown should still be present when compressionStatsEnabled is false"); + + // Verify no compression metrics were emitted for this table + String tableNameWithType = TableNameBuilder.OFFLINE.tableNameWithType("flagOffTable"); + assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableNameWithType, + ControllerGauge.TABLE_COMPRESSION_RATIO_PERCENT), 0); + } +} diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsOfflineIngestionIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsOfflineIngestionIntegrationTest.java new file mode 100644 index 000000000000..b88800851c94 --- /dev/null +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsOfflineIngestionIntegrationTest.java @@ -0,0 +1,329 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.integration.tests; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.IndexingConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.JsonUtils; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.apache.pinot.util.TestUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Integration test that validates compression stats tracking end-to-end for offline batch ingestion. + * + *

Creates an offline table with {@code compressionStatsEnabled=true}, ingests data from Avro files + * with several raw (no-dictionary) columns using LZ4 compression, and then verifies that the + * controller's {@code GET /tables/{table}/size} API response includes valid compression statistics: + * raw forward index sizes, compressed forward index sizes, compression ratio, and segment coverage. + */ +public class CompressionStatsOfflineIngestionIntegrationTest extends BaseClusterIntegrationTest { + private static final int NUM_BROKERS = 1; + private static final int NUM_SERVERS = 1; + + // Raw columns that will have compression stats tracked. + // These are metric/dimension columns from the default On_Time schema that support raw encoding. + private static final List RAW_COLUMNS = + List.of("ActualElapsedTime", "ArrDelay", "DepDelay", "CRSDepTime"); + + @Override + protected String getTableName() { + return "compressionStatsOfflineTest"; + } + + @Override + protected long getCountStarResult() { + return DEFAULT_COUNT_STAR_RESULT; + } + + @Override + protected List getNoDictionaryColumns() { + return new ArrayList<>(RAW_COLUMNS); + } + + @Override + protected List getFieldConfigs() { + List fieldConfigs = new ArrayList<>(); + for (String column : RAW_COLUMNS) { + fieldConfigs.add( + new FieldConfig(column, FieldConfig.EncodingType.RAW, List.of(), + FieldConfig.CompressionCodec.LZ4, null)); + } + return fieldConfigs; + } + + @Override + protected TableConfig createOfflineTableConfig() { + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(getTableName()) + .setTimeColumnName(getTimeColumnName()) + .setNoDictionaryColumns(getNoDictionaryColumns()) + .setFieldConfigList(getFieldConfigs()) + .setNumReplicas(getNumReplicas()) + .build(); + + // Enable compression stats tracking + IndexingConfig indexingConfig = tableConfig.getIndexingConfig(); + indexingConfig.setCompressionStatsEnabled(true); + + return tableConfig; + } + + @BeforeClass + 
public void setUp() + throws Exception { + TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir); + + // Start the Pinot cluster + startZk(); + startController(); + startBroker(); + startServer(); + + // Create and upload the schema and table config + Schema schema = createSchema(); + addSchema(schema); + TableConfig tableConfig = createOfflineTableConfig(); + addTableConfig(tableConfig); + + // Unpack Avro data, build segments, and upload + List avroFiles = unpackAvroData(_tempDir); + ClusterIntegrationTestUtils.buildSegmentsFromAvro(avroFiles, tableConfig, schema, 0, _segmentDir, _tarDir); + uploadSegments(getTableName(), _tarDir); + + // Wait for all documents to be loaded + waitForAllDocsLoaded(600_000L); + } + + @AfterClass + public void tearDown() + throws Exception { + String offlineTableName = + org.apache.pinot.spi.utils.builder.TableNameBuilder.OFFLINE.tableNameWithType(getTableName()); + dropOfflineTable(getTableName()); + waitForTableDataManagerRemoved(offlineTableName); + waitForEVToDisappear(offlineTableName); + stopServer(); + stopBroker(); + stopController(); + stopZk(); + FileUtils.deleteQuietly(_tempDir); + } + + @Test + public void testCompressionStatsInTableSizeApi() + throws Exception { + // Call the controller table size API + String response = sendGetRequest( + controllerUrl("/tables/" + getTableName() + "/size")); + JsonNode tableSizeJson = JsonUtils.stringToJsonNode(response); + + // Verify top-level structure + assertNotNull(tableSizeJson.get("tableName"), "Response should have tableName"); + assertTrue(tableSizeJson.get("reportedSizeInBytes").asLong() > 0, + "reportedSizeInBytes should be > 0"); + + // Get offline segment details + JsonNode offlineSegments = tableSizeJson.get("offlineSegments"); + assertNotNull(offlineSegments, "offlineSegments should be present"); + + // Verify compression stats are nested under compressionStats object + JsonNode compressionStatsNode = offlineSegments.get("compressionStats"); + 
assertNotNull(compressionStatsNode, "compressionStats should be present"); + + long rawFwdIndexSize = compressionStatsNode.get("rawForwardIndexSizePerReplicaInBytes").asLong(); + long compressedFwdIndexSize = compressionStatsNode.get("compressedForwardIndexSizePerReplicaInBytes").asLong(); + double compressionRatio = compressionStatsNode.get("compressionRatio").asDouble(); + int segmentsWithStats = compressionStatsNode.get("segmentsWithStats").asInt(); + int totalSegments = compressionStatsNode.get("totalSegments").asInt(); + boolean isPartialCoverage = compressionStatsNode.get("isPartialCoverage").asBoolean(); + + // Raw forward index size should be > 0 (we have 4 raw columns across 12 segments) + assertTrue(rawFwdIndexSize > 0, + "rawForwardIndexSizePerReplicaInBytes should be > 0, got: " + rawFwdIndexSize); + + // Compressed forward index size should be > 0 + assertTrue(compressedFwdIndexSize > 0, + "compressedForwardIndexSizePerReplicaInBytes should be > 0, got: " + compressedFwdIndexSize); + + // Compression ratio should be > 0 (raw / compressed) + assertTrue(compressionRatio > 0, + "compressionRatio should be > 0, got: " + compressionRatio); + + // Raw size should be >= compressed size (compression should not expand data for numeric columns) + assertTrue(rawFwdIndexSize >= compressedFwdIndexSize, + "rawForwardIndexSize (" + rawFwdIndexSize + ") should be >= compressedForwardIndexSize (" + + compressedFwdIndexSize + ")"); + + // Compression ratio = raw / compressed, should be >= 1.0 + assertTrue(compressionRatio >= 1.0, + "compressionRatio should be >= 1.0, got: " + compressionRatio); + + // All 12 segments should have compression stats since compressionStatsEnabled=true + assertEquals(totalSegments, 12, "totalSegments should be 12"); + assertEquals(segmentsWithStats, 12, + "segmentsWithStats should equal totalSegments (all segments built with stats enabled)"); + + // No partial coverage since all segments have stats + assertFalse(isPartialCoverage, + 
"isPartialCoverage should be false when all segments have stats"); + } + + @Test + public void testPerSegmentCompressionStats() + throws Exception { + // Call table size API with verbose=true (default) to get per-segment details + String response = sendGetRequest( + controllerUrl("/tables/" + getTableName() + "/size?verbose=true")); + JsonNode tableSizeJson = JsonUtils.stringToJsonNode(response); + + JsonNode offlineSegments = tableSizeJson.get("offlineSegments"); + JsonNode segments = offlineSegments.get("segments"); + assertNotNull(segments, "segments map should be present in verbose response"); + + // Each segment should have server info with compression stats + int segmentsChecked = 0; + var fieldNames = segments.fieldNames(); + while (fieldNames.hasNext()) { + String segmentName = fieldNames.next(); + JsonNode segmentDetails = segments.get(segmentName); + JsonNode serverInfo = segmentDetails.get("serverInfo"); + assertNotNull(serverInfo, "serverInfo should be present for segment: " + segmentName); + + // Check each server's response for this segment + var serverNames = serverInfo.fieldNames(); + while (serverNames.hasNext()) { + String serverName = serverNames.next(); + JsonNode sizeInfo = serverInfo.get(serverName); + long diskSize = sizeInfo.get("diskSizeInBytes").asLong(); + if (diskSize > 0) { + // Segment should have raw and compressed forward index sizes + long rawSize = sizeInfo.get("rawForwardIndexSizeBytes").asLong(); + long compressedSize = sizeInfo.get("compressedForwardIndexSizeBytes").asLong(); + assertTrue(rawSize > 0, + "rawForwardIndexSizeBytes should be > 0 for segment " + segmentName); + assertTrue(compressedSize > 0, + "compressedForwardIndexSizeBytes should be > 0 for segment " + segmentName); + + // Verify per-column compression stats if present + JsonNode columnStats = sizeInfo.get("columnCompressionStats"); + if (columnStats != null && !columnStats.isNull()) { + // At least some of our RAW_COLUMNS should appear + int columnsWithStats = 
0; + for (String col : RAW_COLUMNS) { + if (columnStats.has(col)) { + JsonNode colInfo = columnStats.get(col); + assertTrue(colInfo.get("uncompressedSizeInBytes").asLong() > 0, + "Per-column uncompressed size should be > 0 for " + col); + assertTrue(colInfo.get("compressedSizeInBytes").asLong() > 0, + "Per-column compressed size should be > 0 for " + col); + assertTrue(colInfo.get("compressionRatio").asDouble() > 0, + "Per-column compression ratio should be > 0 for " + col); + assertEquals(colInfo.get("codec").asText(), "LZ4", + "Compression codec should be LZ4 for " + col); + assertFalse(colInfo.get("hasDictionary").asBoolean(), + "Raw column should not have dictionary for " + col); + columnsWithStats++; + } + } + assertTrue(columnsWithStats > 0, + "At least one raw column should have compression stats in segment " + segmentName); + } + } + } + segmentsChecked++; + } + assertTrue(segmentsChecked > 0, "Should have checked at least one segment"); + } + + @Test + public void testCompressionStatsDisabledTable() + throws Exception { + // Create a second table WITHOUT compressionStatsEnabled + String noStatsTableName = "compressionStatsDisabledTest"; + Schema schema = createSchema(); + schema.setSchemaName(noStatsTableName); + addSchema(schema); + + TableConfig noStatsConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(noStatsTableName) + .setTimeColumnName(getTimeColumnName()) + .setNoDictionaryColumns(getNoDictionaryColumns()) + .setFieldConfigList(getFieldConfigs()) + .setNumReplicas(getNumReplicas()) + .build(); + // compressionStatsEnabled defaults to false — do NOT set it + + addTableConfig(noStatsConfig); + + // Build and upload segments for the no-stats table + File noStatsSegmentDir = new File(_tempDir, "noStatsSegmentDir"); + File noStatsTarDir = new File(_tempDir, "noStatsTarDir"); + TestUtils.ensureDirectoriesExistAndEmpty(noStatsSegmentDir, noStatsTarDir); + + List avroFiles = unpackAvroData(_tempDir); + 
ClusterIntegrationTestUtils.buildSegmentsFromAvro(avroFiles, noStatsConfig, schema, 0, + noStatsSegmentDir, noStatsTarDir); + uploadSegments(noStatsTableName, noStatsTarDir); + + // Wait for docs to load + TestUtils.waitForCondition(aVoid -> { + try { + return getCurrentCountStarResult(noStatsTableName) == DEFAULT_COUNT_STAR_RESULT; + } catch (Exception e) { + return false; + } + }, 600_000L, "Failed to load documents for no-stats table"); + + try { + // Query table size + String response = sendGetRequest( + controllerUrl("/tables/" + noStatsTableName + "/size")); + JsonNode tableSizeJson = JsonUtils.stringToJsonNode(response); + + JsonNode offlineSegments = tableSizeJson.get("offlineSegments"); + assertNotNull(offlineSegments); + + // compressionStats should be absent (null/suppressed) since compressionStatsEnabled was false + JsonNode compressionStatsNode = offlineSegments.get("compressionStats"); + assertNull(compressionStatsNode, + "compressionStats should be absent when compressionStatsEnabled is false"); + + // storageBreakdown should still be present (always reported regardless of flag) + JsonNode storageBreakdown = offlineSegments.get("storageBreakdown"); + assertNotNull(storageBreakdown, "storageBreakdown should be present even when flag is off"); + } finally { + // Clean up the second table even if assertions fail + dropOfflineTable(noStatsTableName); + } + } +} diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsRealtimeIngestionIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsRealtimeIngestionIntegrationTest.java new file mode 100644 index 000000000000..db5863ae2885 --- /dev/null +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/CompressionStatsRealtimeIngestionIntegrationTest.java @@ -0,0 +1,244 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.integration.tests; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.IndexingConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.JsonUtils; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.apache.pinot.util.TestUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Integration test that validates compression stats tracking end-to-end for realtime (Kafka) ingestion. + * + *

Creates a realtime table with {@code compressionStatsEnabled=true}, pushes data from Avro files + * into Kafka with several raw (no-dictionary) columns using LZ4 compression, waits for all documents + * to be consumed, and then verifies that the controller's {@code GET /tables/{table}/size} API response + * includes valid compression statistics for the completed (COMPLETED) segments. + */ +public class CompressionStatsRealtimeIngestionIntegrationTest extends BaseClusterIntegrationTestSet { + + // Raw columns that will have compression stats tracked. + // These are metric/dimension columns from the default On_Time schema that support raw encoding. + private static final List RAW_COLUMNS = + List.of("ActualElapsedTime", "ArrDelay", "DepDelay", "CRSDepTime"); + + @Override + protected String getTableName() { + return "compressionStatsRealtimeTest"; + } + + @Override + protected long getCountStarResult() { + return DEFAULT_COUNT_STAR_RESULT; + } + + @Override + protected List getNoDictionaryColumns() { + return new ArrayList<>(RAW_COLUMNS); + } + + @Override + protected List getFieldConfigs() { + List fieldConfigs = new ArrayList<>(); + for (String column : RAW_COLUMNS) { + fieldConfigs.add( + new FieldConfig(column, FieldConfig.EncodingType.RAW, List.of(), + FieldConfig.CompressionCodec.LZ4, null)); + } + return fieldConfigs; + } + + @Override + protected TableConfig createRealtimeTableConfig(File sampleAvroFile) { + TableConfig tableConfig = super.createRealtimeTableConfig(sampleAvroFile); + + // Enable compression stats tracking + IndexingConfig indexingConfig = tableConfig.getIndexingConfig(); + indexingConfig.setCompressionStatsEnabled(true); + + return tableConfig; + } + + @BeforeClass + public void setUp() + throws Exception { + TestUtils.ensureDirectoriesExistAndEmpty(_tempDir); + + // Start the Pinot cluster + startZk(); + startKafka(); + startController(); + startBroker(); + startServer(); + + // Unpack the Avro files + List avroFiles = 
unpackAvroData(_tempDir); + + // Create and upload the schema and table config + Schema schema = createSchema(); + addSchema(schema); + TableConfig tableConfig = createRealtimeTableConfig(avroFiles.get(0)); + addTableConfig(tableConfig); + waitForAllRealtimePartitionsConsuming( + TableNameBuilder.REALTIME.tableNameWithType(getTableName()), 120_000L); + + // Push data into Kafka + pushAvroIntoKafka(avroFiles); + + // Wait for all documents to be loaded + waitForAllDocsLoaded(600_000L); + } + + @AfterClass + public void tearDown() + throws Exception { + dropRealtimeTable(getTableName()); + waitForTableDataManagerRemoved(TableNameBuilder.REALTIME.tableNameWithType(getTableName())); + waitForEVToDisappear(TableNameBuilder.REALTIME.tableNameWithType(getTableName())); + stopServer(); + stopBroker(); + stopController(); + stopKafka(); + stopZk(); + FileUtils.deleteDirectory(_tempDir); + } + + @Test + public void testCompressionStatsInTableSizeApiForRealtimeTable() + throws Exception { + // Call the controller table size API + String response = sendGetRequest( + controllerUrl("/tables/" + getTableName() + "/size")); + JsonNode tableSizeJson = JsonUtils.stringToJsonNode(response); + + // Verify top-level structure + assertNotNull(tableSizeJson.get("tableName"), "Response should have tableName"); + assertTrue(tableSizeJson.get("reportedSizeInBytes").asLong() >= 0, + "reportedSizeInBytes should be >= 0"); + + // Get realtime segment details + JsonNode realtimeSegments = tableSizeJson.get("realtimeSegments"); + assertNotNull(realtimeSegments, "realtimeSegments should be present"); + + // Verify compression stats are nested under compressionStats object + JsonNode compressionStatsNode = realtimeSegments.get("compressionStats"); + assertNotNull(compressionStatsNode, "compressionStats should be present"); + assertTrue(compressionStatsNode.has("rawForwardIndexSizePerReplicaInBytes"), + "compressionStats should have rawForwardIndexSizePerReplicaInBytes"); + 
assertTrue(compressionStatsNode.has("compressedForwardIndexSizePerReplicaInBytes"), + "compressionStats should have compressedForwardIndexSizePerReplicaInBytes"); + assertTrue(compressionStatsNode.has("compressionRatio"), + "compressionStats should have compressionRatio"); + assertTrue(compressionStatsNode.has("segmentsWithStats"), + "compressionStats should have segmentsWithStats"); + assertTrue(compressionStatsNode.has("totalSegments"), + "compressionStats should have totalSegments"); + + long rawFwdIndexSize = compressionStatsNode.get("rawForwardIndexSizePerReplicaInBytes").asLong(); + long compressedFwdIndexSize = compressionStatsNode.get("compressedForwardIndexSizePerReplicaInBytes").asLong(); + double compressionRatio = compressionStatsNode.get("compressionRatio").asDouble(); + int segmentsWithStats = compressionStatsNode.get("segmentsWithStats").asInt(); + int totalSegments = compressionStatsNode.get("totalSegments").asInt(); + + // Total segments should be > 0 (at least consuming segments exist) + assertTrue(totalSegments > 0, + "totalSegments should be > 0, got: " + totalSegments); + + // Segments with stats: completed segments built with compressionStatsEnabled=true should have stats. + // Consuming segments may or may not have stats depending on whether they've been committed. + // We just verify the fields are present and non-negative. 
+ assertTrue(segmentsWithStats >= 0, + "segmentsWithStats should be >= 0, got: " + segmentsWithStats); + + // If any completed segments exist with stats, verify the compression data makes sense + if (segmentsWithStats > 0) { + assertTrue(rawFwdIndexSize > 0, + "rawForwardIndexSizePerReplicaInBytes should be > 0 when segments have stats, got: " + + rawFwdIndexSize); + assertTrue(compressedFwdIndexSize > 0, + "compressedForwardIndexSizePerReplicaInBytes should be > 0 when segments have stats, got: " + + compressedFwdIndexSize); + assertTrue(compressionRatio > 0, + "compressionRatio should be > 0 when segments have stats, got: " + compressionRatio); + assertTrue(rawFwdIndexSize >= compressedFwdIndexSize, + "rawForwardIndexSize (" + rawFwdIndexSize + ") should be >= compressedForwardIndexSize (" + + compressedFwdIndexSize + ")"); + assertTrue(compressionRatio >= 1.0, + "compressionRatio should be >= 1.0, got: " + compressionRatio); + } + } + + @Test + public void testPerSegmentCompressionStatsForRealtimeTable() + throws Exception { + // Call table size API with verbose=true to get per-segment details + String response = sendGetRequest( + controllerUrl("/tables/" + getTableName() + "/size?verbose=true")); + JsonNode tableSizeJson = JsonUtils.stringToJsonNode(response); + + JsonNode realtimeSegments = tableSizeJson.get("realtimeSegments"); + assertNotNull(realtimeSegments, "realtimeSegments should be present"); + + JsonNode segments = realtimeSegments.get("segments"); + assertNotNull(segments, "segments map should be present in verbose response"); + + // At least one segment should exist + assertTrue(segments.size() > 0, "Should have at least one segment"); + + // Iterate segments and validate structure + int segmentsChecked = 0; + var fieldNames = segments.fieldNames(); + while (fieldNames.hasNext()) { + String segmentName = fieldNames.next(); + JsonNode segmentDetails = segments.get(segmentName); + JsonNode serverInfo = segmentDetails.get("serverInfo"); + 
assertNotNull(serverInfo, "serverInfo should be present for segment: " + segmentName); + + var serverNames = serverInfo.fieldNames(); + while (serverNames.hasNext()) { + String serverName = serverNames.next(); + JsonNode sizeInfo = serverInfo.get(serverName); + long diskSize = sizeInfo.get("diskSizeInBytes").asLong(); + if (diskSize > 0) { + // Verify compression stats fields exist in each server's response + assertTrue(sizeInfo.has("rawForwardIndexSizeBytes"), + "Server info should have rawForwardIndexSizeBytes for segment " + segmentName); + assertTrue(sizeInfo.has("compressedForwardIndexSizeBytes"), + "Server info should have compressedForwardIndexSizeBytes for segment " + segmentName); + } + } + segmentsChecked++; + } + assertTrue(segmentsChecked > 0, "Should have checked at least one segment"); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java index 8b3d22aef406..b724d6c9e860 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java @@ -69,6 +69,8 @@ public abstract class BaseChunkForwardIndexWriter implements Closeable { protected int _chunkSize; protected long _dataOffset; + protected long _uncompressedSize; + protected boolean _trackUncompressedSize = true; private final int _headerEntryChunkOffsetSize; @@ -196,4 +198,15 @@ protected void writeChunk() { _dataOffset += sizeToWrite; _chunkBuffer.clear(); } + + /** + * Returns the total uncompressed size of data written so far. 
+ */ + public long getUncompressedSize() { + return _uncompressedSize; + } + + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _trackUncompressedSize = trackUncompressedSize; + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java index 8b517a84f9c1..c98468a51a71 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java @@ -54,24 +54,36 @@ public FixedByteChunkForwardIndexWriter(File file, ChunkCompressionType compress } public void putInt(int value) { + if (_trackUncompressedSize) { + _uncompressedSize += Integer.BYTES; + } _chunkBuffer.putInt(value); _chunkDataOffset += Integer.BYTES; flushChunkIfNeeded(); } public void putLong(long value) { + if (_trackUncompressedSize) { + _uncompressedSize += Long.BYTES; + } _chunkBuffer.putLong(value); _chunkDataOffset += Long.BYTES; flushChunkIfNeeded(); } public void putFloat(float value) { + if (_trackUncompressedSize) { + _uncompressedSize += Float.BYTES; + } _chunkBuffer.putFloat(value); _chunkDataOffset += Float.BYTES; flushChunkIfNeeded(); } public void putDouble(double value) { + if (_trackUncompressedSize) { + _uncompressedSize += Double.BYTES; + } _chunkBuffer.putDouble(value); _chunkDataOffset += Double.BYTES; flushChunkIfNeeded(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriter.java index f04481dc04eb..2bad71832815 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriter.java @@ -88,6 +88,9 @@ public void putString(String value) { @Override public void putBytes(byte[] value) { + if (_trackUncompressedSize) { + _uncompressedSize += value.length; + } _chunkBuffer.putInt(_chunkHeaderOffset, _chunkDataOffSet); _chunkHeaderOffset += CHUNK_HEADER_ENTRY_ROW_OFFSET_SIZE; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index c6b30c038482..9cc4555c5df2 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -92,6 +92,8 @@ public class VarByteChunkForwardIndexWriterV4 implements VarByteChunkWriter { private int _nextDocId = 0; private int _metadataSize = 0; private long _chunkOffset = 0; + private long _uncompressedSize = 0; + private boolean _trackUncompressedSize = true; public VarByteChunkForwardIndexWriterV4(File file, ChunkCompressionType compressionType, int chunkSize) throws IOException { @@ -138,6 +140,9 @@ public void putString(String string) { public void putBytes(byte[] bytes) { Preconditions.checkState(_chunkOffset < (1L << 32), "exceeded 4GB of compressed chunks for: " + _dataBuffer.getName()); + if (_trackUncompressedSize) { + _uncompressedSize += bytes.length; + } int sizeRequired = Integer.BYTES + bytes.length; if (_chunkBuffer.position() > _chunkBuffer.capacity() - sizeRequired) { flushChunk(); @@ -332,4 +337,15 @@ public void close() FileUtils.deleteQuietly(_dataBuffer); _chunkCompressor.close(); 
} + + /** + * Returns the total uncompressed size of data written so far. + */ + public long getUncompressedSize() { + return _uncompressedSize; + } + + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _trackUncompressedSize = trackUncompressedSize; + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkWriter.java index 80739ca63d53..38acc2a20978 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkWriter.java @@ -42,4 +42,17 @@ public interface VarByteChunkWriter extends Closeable { void putStringMV(String[] values); void putBytesMV(byte[][] values); + + /** + * Returns the total uncompressed size of data written so far. + */ + default long getUncompressedSize() { + return 0; + } + + /** + * Controls whether the writer tracks uncompressed data size. 
+ */ + default void setTrackUncompressedSize(boolean trackUncompressedSize) { + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java index 12d9cf88bdcb..4b77e523b140 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/BaseSegmentCreator.java @@ -48,11 +48,13 @@ import org.apache.pinot.segment.local.segment.index.converter.SegmentFormatConverterFactory; import org.apache.pinot.segment.local.segment.index.dictionary.DictionaryIndexPlugin; import org.apache.pinot.segment.local.segment.index.dictionary.DictionaryIndexType; +import org.apache.pinot.segment.local.segment.index.forward.ForwardIndexType; import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig; import org.apache.pinot.segment.local.segment.index.loader.invertedindex.MultiColumnTextIndexHandler; import org.apache.pinot.segment.local.startree.v2.builder.MultipleTreesBuilder; import org.apache.pinot.segment.local.utils.CrcUtils; import org.apache.pinot.segment.spi.V1Constants; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.converter.SegmentFormatConverter; import org.apache.pinot.segment.spi.creator.ColumnStatistics; import org.apache.pinot.segment.spi.creator.IndexCreationContext; @@ -199,6 +201,7 @@ private IndexCreationContext.Common getIndexCreationContext(FieldSpec fieldSpec, .withMutableSegmentCompacted(_config.isMutableSegmentCompacted()) .withMutableToImmutableDocIdMap(_config.getMutableToImmutableDocIdMap()) .withContinueOnError(_config.isContinueOnError()) + .withCompressionStatsEnabled(_config.isCompressionStatsEnabled()) .build(); } @@ -555,6 +558,38 @@ protected void 
writeMetadata() hasDictionary, dictionaryElementSize, fwdConfig.getEncodingType(), false); } + // Persist compression stats if enabled + if (_config.isCompressionStatsEnabled()) { + Map indexConfigs = _config.getIndexConfigsByColName(); + for (Map.Entry entry : _colIndexes.entrySet()) { + String column = entry.getKey(); + ColumnIndexCreators colCreators = entry.getValue(); + ForwardIndexCreator fwdCreator = colCreators.getForwardIndexCreator(); + if (fwdCreator != null && !fwdCreator.isDictionaryEncoded()) { + long uncompressedSize = fwdCreator.getUncompressedSize(); + if (uncompressedSize > 0) { + properties.setProperty( + V1Constants.MetadataKeys.Column.getKeyFor(column, + V1Constants.MetadataKeys.Column.FORWARD_INDEX_UNCOMPRESSED_SIZE), + String.valueOf(uncompressedSize)); + } + FieldIndexConfigs fieldIndexConfigs = indexConfigs.get(column); + if (fieldIndexConfigs != null) { + ForwardIndexConfig fwdConfig = fieldIndexConfigs.getConfig(StandardIndexes.forward()); + FieldSpec fieldSpec = _schema.getFieldSpecFor(column); + ChunkCompressionType compressionType = ForwardIndexType.resolveCompressionType( + fwdConfig, fieldSpec != null ? 
fieldSpec.getFieldType() : null); + if (compressionType != null) { + properties.setProperty( + V1Constants.MetadataKeys.Column.getKeyFor(column, + V1Constants.MetadataKeys.Column.FORWARD_INDEX_COMPRESSION_CODEC), + compressionType.name()); + } + } + } + } + } + SegmentZKPropsConfig segmentZKPropsConfig = _config.getSegmentZKPropsConfig(); if (segmentZKPropsConfig != null) { properties.setProperty(Realtime.START_OFFSET, segmentZKPropsConfig.getStartOffset()); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/ColumnIndexCreators.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/ColumnIndexCreators.java index 0ef96b7b17b4..75aa0e4233d7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/ColumnIndexCreators.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/ColumnIndexCreators.java @@ -27,6 +27,7 @@ import org.apache.pinot.segment.local.segment.creator.impl.nullvalue.NullValueVectorCreator; import org.apache.pinot.segment.spi.index.FieldIndexConfigs; import org.apache.pinot.segment.spi.index.IndexCreator; +import org.apache.pinot.segment.spi.index.creator.ForwardIndexCreator; import org.apache.pinot.spi.data.FieldSpec; @@ -92,6 +93,19 @@ public FieldIndexConfigs getIndexConfigs() { return _indexConfigs; } + /** + * Returns the ForwardIndexCreator for this column, or null if not found. 
+ */ + @Nullable + public ForwardIndexCreator getForwardIndexCreator() { + for (IndexCreator creator : _indexCreators) { + if (creator instanceof ForwardIndexCreator) { + return (ForwardIndexCreator) creator; + } + } + return null; + } + public void seal() throws IOException { if (_isSealed) { return; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2.java index b8ff2a103e85..6b82691ab15d 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2.java @@ -470,6 +470,48 @@ public void close() _dataFile.close(); } + /** + * Returns the total uncompressed size across all CLP sub-streams (logtype IDs, dictionary variable + * IDs, encoded variables, and raw fallback messages). This represents the pre-compression + * sub-stream byte total, not the original UTF-8 message length, because CLP encodes strings into + * typed sub-columns before compression. The ratio of this value to the compressed forward index + * size reflects how effectively the final compression stage operates on CLP's intermediate + * representation. 
+ */ + @Override + public long getUncompressedSize() { + long total = 0; + if (_logtypeIdFwdIndex != null) { + total += _logtypeIdFwdIndex.getUncompressedSize(); + } + if (_dictVarIdFwdIndex != null) { + total += _dictVarIdFwdIndex.getUncompressedSize(); + } + if (_encodedVarFwdIndex != null) { + total += _encodedVarFwdIndex.getUncompressedSize(); + } + if (_rawMsgFwdIndex != null) { + total += _rawMsgFwdIndex.getUncompressedSize(); + } + return total; + } + + @Override + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + if (_logtypeIdFwdIndex != null) { + _logtypeIdFwdIndex.setTrackUncompressedSize(trackUncompressedSize); + } + if (_dictVarIdFwdIndex != null) { + _dictVarIdFwdIndex.setTrackUncompressedSize(trackUncompressedSize); + } + if (_encodedVarFwdIndex != null) { + _encodedVarFwdIndex.setTrackUncompressedSize(trackUncompressedSize); + } + if (_rawMsgFwdIndex != null) { + _rawMsgFwdIndex.setTrackUncompressedSize(trackUncompressedSize); + } + } + @Override public boolean isDictionaryEncoded() { return false; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java index feddb9b8ece6..cf0e4b8acc6f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueFixedByteRawIndexCreator.java @@ -147,4 +147,14 @@ public void close() throws IOException { _indexWriter.close(); } + + @Override + public long getUncompressedSize() { + return _indexWriter.getUncompressedSize(); + } + + @Override + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _indexWriter.setTrackUncompressedSize(trackUncompressedSize); + } } diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueVarByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueVarByteRawIndexCreator.java index 01e345d6068e..31a323270ac9 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueVarByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/MultiValueVarByteRawIndexCreator.java @@ -134,6 +134,16 @@ public void close() _indexWriter.close(); } + @Override + public long getUncompressedSize() { + return _indexWriter.getUncompressedSize(); + } + + @Override + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _indexWriter.setTrackUncompressedSize(trackUncompressedSize); + } + /** * The actual content in an MV array is prepended with 2 prefixes: * 1. elementLengthStoragePrefixInBytes - bytes required to store the length of each element in the largest array diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java index 453519c8a691..1993f25d1f8c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java @@ -114,4 +114,14 @@ public void close() throws IOException { _indexWriter.close(); } + + @Override + public long getUncompressedSize() { + return _indexWriter.getUncompressedSize(); + } + + @Override + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _indexWriter.setTrackUncompressedSize(trackUncompressedSize); 
+ } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java index 69028d4a2447..056328570cbf 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueVarByteRawIndexCreator.java @@ -137,4 +137,14 @@ public void close() throws IOException { _indexWriter.close(); } + + @Override + public long getUncompressedSize() { + return _indexWriter.getUncompressedSize(); + } + + @Override + public void setTrackUncompressedSize(boolean trackUncompressedSize) { + _indexWriter.setTrackUncompressedSize(trackUncompressedSize); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java index eaf669be34ef..0aeb4d06e15b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java @@ -73,37 +73,38 @@ public static ForwardIndexCreator createIndexCreator(IndexCreationContext contex } else { // Raw forward index DataType storedType = fieldSpec.getDataType().getStoredType(); + ForwardIndexCreator creator; if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLP) { // CLP (V1) uses hard-coded chunk compressor which is set to `PassThrough` - return new CLPForwardIndexCreatorV1(indexDir, columnName, numTotalDocs, context.getColumnStatistics()); - } - if 
(indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2) { + creator = new CLPForwardIndexCreatorV1(indexDir, columnName, numTotalDocs, context.getColumnStatistics()); + } else if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2) { // Use the default chunk compression codec for CLP, currently configured to use ZStandard - return new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics()); - } - if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2_ZSTD) { - return new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics(), ChunkCompressionType.ZSTANDARD); - } - if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2_LZ4) { - return new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics(), ChunkCompressionType.LZ4); - } - ChunkCompressionType chunkCompressionType = indexConfig.getChunkCompressionType(); - if (chunkCompressionType == null) { - chunkCompressionType = ForwardIndexType.getDefaultCompressionType(fieldSpec.getFieldType()); - } - boolean deriveNumDocsPerChunk = indexConfig.isDeriveNumDocsPerChunk(); - int writerVersion = indexConfig.getRawIndexWriterVersion(); - int targetMaxChunkSize = indexConfig.getTargetMaxChunkSizeBytes(); - int targetDocsPerChunk = indexConfig.getTargetDocsPerChunk(); - if (fieldSpec.isSingleValueField()) { - return getRawIndexCreatorForSVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs, - context.getLengthOfLongestElement(), deriveNumDocsPerChunk, writerVersion, targetMaxChunkSize, - targetDocsPerChunk); + creator = new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics()); + } else if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2_ZSTD) { + creator = new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics(), ChunkCompressionType.ZSTANDARD); + } else if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2_LZ4) { + creator = 
new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics(), ChunkCompressionType.LZ4); } else { - return getRawIndexCreatorForMVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs, - context.getMaxNumberOfMultiValues(), deriveNumDocsPerChunk, writerVersion, - context.getMaxRowLengthInBytes(), targetMaxChunkSize, targetDocsPerChunk); + ChunkCompressionType chunkCompressionType = indexConfig.getChunkCompressionType(); + if (chunkCompressionType == null) { + chunkCompressionType = ForwardIndexType.getDefaultCompressionType(fieldSpec.getFieldType()); + } + boolean deriveNumDocsPerChunk = indexConfig.isDeriveNumDocsPerChunk(); + int writerVersion = indexConfig.getRawIndexWriterVersion(); + int targetMaxChunkSize = indexConfig.getTargetMaxChunkSizeBytes(); + int targetDocsPerChunk = indexConfig.getTargetDocsPerChunk(); + if (fieldSpec.isSingleValueField()) { + creator = getRawIndexCreatorForSVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs, + context.getLengthOfLongestElement(), deriveNumDocsPerChunk, writerVersion, targetMaxChunkSize, + targetDocsPerChunk); + } else { + creator = getRawIndexCreatorForMVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs, + context.getMaxNumberOfMultiValues(), deriveNumDocsPerChunk, writerVersion, + context.getMaxRowLengthInBytes(), targetMaxChunkSize, targetDocsPerChunk); + } } + creator.setTrackUncompressedSize(context.isCompressionStatsEnabled()); + return creator; } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index 8136a82297f4..9bde0fd6ace3 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -296,6 +296,36 @@ public static ChunkCompressionType getDefaultCompressionType(FieldSpec.FieldType } } + /** + * Resolves the actual chunk compression type for a forward index config, handling CLP codec variants + * that use internal compression different from what ForwardIndexConfig.getChunkCompressionType() reports. + * Falls back to getChunkCompressionType(), then to the field-type default. + * + * @param fwdConfig the forward index configuration + * @param fieldType the field type, may be {@code null} if the field spec is unavailable (e.g. schema evolution); + * when null the field-type default fallback is skipped and the method may return {@code null} + * @return the resolved compression type, or {@code null} if it cannot be determined + */ + public static ChunkCompressionType resolveCompressionType(ForwardIndexConfig fwdConfig, + @Nullable FieldSpec.FieldType fieldType) { + FieldConfig.CompressionCodec codec = fwdConfig.getCompressionCodec(); + if (codec != null) { + switch (codec) { + case CLP: + return ChunkCompressionType.PASS_THROUGH; + case CLPV2: + case CLPV2_ZSTD: + return ChunkCompressionType.ZSTANDARD; + case CLPV2_LZ4: + return ChunkCompressionType.LZ4; + default: + break; + } + } + ChunkCompressionType type = fwdConfig.getChunkCompressionType(); + return type != null ? type : (fieldType != null ? 
getDefaultCompressionType(fieldType) : null); + } + @Override public ForwardIndexCreator createIndexCreator(IndexCreationContext context, ForwardIndexConfig indexConfig) throws Exception { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java index e0f2e200fd0b..a8b1eafb4da8 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java @@ -43,6 +43,7 @@ import org.apache.pinot.segment.local.segment.creator.impl.stats.NoDictColumnStatisticsCollector; import org.apache.pinot.segment.local.segment.creator.impl.stats.StringColumnPreIndexStatsCollector; import org.apache.pinot.segment.local.segment.index.dictionary.DictionaryIndexType; +import org.apache.pinot.segment.local.segment.index.forward.ForwardIndexType; import org.apache.pinot.segment.local.segment.readers.PinotSegmentColumnReader; import org.apache.pinot.segment.local.utils.ClusterConfigForTable; import org.apache.pinot.segment.spi.ColumnMetadata; @@ -597,12 +598,26 @@ private void rewriteForwardIndexForCompressionChange(String column, SegmentDirec segmentWriter.removeIndex(column, StandardIndexes.forward()); LoaderUtils.writeIndexToV3Format(segmentWriter, column, fwdIndexFile, StandardIndexes.forward()); + // Persist the new compression codec in metadata.properties (only when compression stats are enabled) + if (_tableConfig.getIndexingConfig().isCompressionStatsEnabled()) { + ForwardIndexConfig newConfig = _fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward()); + ChunkCompressionType compressionType = + ForwardIndexType.resolveCompressionType(newConfig, columnMetadata.getFieldSpec().getFieldType()); + if (compressionType != null) { + Map 
metadataProperties = new HashMap<>(); + metadataProperties.put(getKeyFor(column, FORWARD_INDEX_COMPRESSION_CODEC), compressionType.name()); + SegmentMetadataUtils.updateMetadataProperties(_segmentDirectory, metadataProperties); + } + } + // Delete the marker file. FileUtils.deleteQuietly(inProgress); LOGGER.info("Created forward index for segment: {}, column: {}", segmentName, column); } + + private void forwardIndexRewriteHelper(String column, ColumnMetadata existingColumnMetadata, ForwardIndexReader reader, ForwardIndexCreator creator, int numDocs, @Nullable SegmentDictionaryCreator dictionaryCreator, @Nullable Dictionary dictionaryReader) { @@ -983,6 +998,9 @@ private void createDictBasedForwardIndex(String column, SegmentDirectory.Writer metadataProperties.put(getKeyFor(column, CARDINALITY), String.valueOf(cardinality)); metadataProperties.put(getKeyFor(column, BITS_PER_ELEMENT), String.valueOf(PinotDataBitSet.getNumBitsPerValue(cardinality - 1))); + // Clear stale compression stats that were set when the column was raw-encoded + metadataProperties.put(getKeyFor(column, FORWARD_INDEX_COMPRESSION_CODEC), null); + metadataProperties.put(getKeyFor(column, FORWARD_INDEX_UNCOMPRESSED_SIZE), null); SegmentMetadataUtils.updateMetadataProperties(_segmentDirectory, metadataProperties); // We remove indexes that have to be rewritten when a dictEnabled is toggled. 
Note that the respective index @@ -1097,7 +1115,7 @@ private void disableDictionaryAndCreateRawForwardIndex(String column, SegmentDir } LOGGER.info("Creating raw forward index for segment={} and column={}", segmentName, column); - rewriteDictToRawForwardIndex(existingColMetadata, segmentWriter, indexDir); + long uncompressedSize = rewriteDictToRawForwardIndex(existingColMetadata, segmentWriter, indexDir); // Remove dictionary and forward index segmentWriter.removeIndex(column, StandardIndexes.forward()); @@ -1113,6 +1131,18 @@ private void disableDictionaryAndCreateRawForwardIndex(String column, SegmentDir // TODO: See https://github.com/apache/pinot/pull/16921 for details // TODO: Remove the property after 1.6.0 release // metadataProperties.put(getKeyFor(column, BITS_PER_ELEMENT), null); + if (_tableConfig.getIndexingConfig().isCompressionStatsEnabled()) { + ForwardIndexConfig fwdConfig = _fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward()); + ChunkCompressionType compressionType = + ForwardIndexType.resolveCompressionType(fwdConfig, existingColMetadata.getFieldSpec().getFieldType()); + if (compressionType != null) { + metadataProperties.put(getKeyFor(column, FORWARD_INDEX_COMPRESSION_CODEC), compressionType.name()); + } + if (uncompressedSize > 0) { + metadataProperties.put(getKeyFor(column, FORWARD_INDEX_UNCOMPRESSED_SIZE), + String.valueOf(uncompressedSize)); + } + } SegmentMetadataUtils.updateMetadataProperties(_segmentDirectory, metadataProperties); // Remove range index, inverted index and FST index. @@ -1165,7 +1195,11 @@ private void convertDictForwardToRawKeepingDictionary(String column, SegmentDire LOGGER.info("Converted forward index to raw (dictionary kept) for segment: {}, column: {}", segmentName, column); } - private void rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, SegmentDirectory.Writer segmentWriter, + /** + * Rewrites a dictionary-encoded forward index as a raw forward index. 
+ * @return the uncompressed size of the new raw forward index, or 0 if not tracked + */ + private long rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, SegmentDirectory.Writer segmentWriter, File indexDir) throws Exception { String column = columnMetadata.getColumnName(); @@ -1173,11 +1207,14 @@ private void rewriteDictToRawForwardIndex(ColumnMetadata columnMetadata, Segment try (ForwardIndexReader forwardIndex = StandardIndexes.forward().getReaderFactory() .createIndexReader(segmentWriter, indexConfigs, columnMetadata); Dictionary dictionary = DictionaryIndexType.read(segmentWriter, columnMetadata)) { - IndexCreationContext context = new IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata).build(); + IndexCreationContext context = new IndexCreationContext.Builder(indexDir, _tableConfig, columnMetadata) + .withCompressionStatsEnabled(_tableConfig.getIndexingConfig().isCompressionStatsEnabled()) + .build(); ForwardIndexConfig config = indexConfigs.getConfig(StandardIndexes.forward()); try (ForwardIndexCreator creator = StandardIndexes.forward().createIndexCreator(context, config)) { forwardIndexRewriteHelper(column, columnMetadata, forwardIndex, creator, columnMetadata.getTotalDocs(), null, dictionary); + return creator.getUncompressedSize(); } } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/SegmentGeneratorConfigPropagationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/SegmentGeneratorConfigPropagationTest.java new file mode 100644 index 000000000000..5b5f6b62748c --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/SegmentGeneratorConfigPropagationTest.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.creator; + +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + + +/** + * Tests for T004: verify the {@code compressionStatsEnabled} flag propagates from + * {@link TableConfig} through to {@link SegmentGeneratorConfig}. + */ +public class SegmentGeneratorConfigPropagationTest { + + /** + * When {@code compressionStatsEnabled} is explicitly set to {@code true} on the + * {@link TableConfig}'s indexing config, the resulting {@link SegmentGeneratorConfig} + * should reflect that value. 
+ */ + @Test + public void testCompressionStatsEnabledPropagation() { + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("testTable") + .build(); + tableConfig.getIndexingConfig().setCompressionStatsEnabled(true); + + Schema schema = new Schema.SchemaBuilder() + .setSchemaName("testTable") + .addSingleValueDimension("col1", DataType.INT) + .build(); + + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + assertTrue(config.isCompressionStatsEnabled(), + "compressionStatsEnabled should be true when explicitly enabled on TableConfig"); + } + + /** + * When {@code compressionStatsEnabled} is never set on the {@link TableConfig}, the + * resulting {@link SegmentGeneratorConfig} should default to {@code false}. + */ + @Test + public void testCompressionStatsDisabledByDefault() { + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("testTable") + .build(); + + Schema schema = new Schema.SchemaBuilder() + .setSchemaName("testTable") + .addSingleValueDimension("col1", DataType.INT) + .build(); + + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + assertFalse(config.isCompressionStatsEnabled(), + "compressionStatsEnabled should be false by default when not set on TableConfig"); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2StatsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2StatsTest.java new file mode 100644 index 000000000000..e96097d91464 --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/CLPForwardIndexCreatorV2StatsTest.java @@ -0,0 +1,152 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.creator.impl.fwd; + +import java.io.File; +import java.io.IOException; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriter; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV5; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests for CLP V2 sub-writer uncompressed size tracking and flag-disabled behavior (T010). + * + *
<p>
Since constructing a full {@code CLPForwardIndexCreatorV2} requires a complex + * {@code CLPMutableForwardIndexV2} setup, these tests validate the {@code setTrackUncompressedSize} + * and {@code getUncompressedSize} methods directly on the underlying writers used by CLP: + * {@link FixedByteChunkForwardIndexWriter} and {@link VarByteChunkForwardIndexWriterV5}. + */ +public class CLPForwardIndexCreatorV2StatsTest { + private static final int TOTAL_DOCS = 1000; + private static final int NUM_DOCS_PER_CHUNK = 100; + private static final int SIZE_OF_INT_ENTRY = Integer.BYTES; + private static final int WRITER_VERSION = 4; + private static final int VAR_BYTE_CHUNK_SIZE = 65536; + private static final ChunkCompressionType COMPRESSION_TYPE = ChunkCompressionType.ZSTANDARD; + private static final int NUM_ENTRIES_TO_WRITE = 500; + + private File _tempDir; + + @BeforeMethod + public void setUp() { + _tempDir = new File(FileUtils.getTempDirectory(), CLPForwardIndexCreatorV2StatsTest.class.getSimpleName()); + FileUtils.deleteQuietly(_tempDir); + _tempDir.mkdirs(); + } + + @AfterMethod + public void tearDown() { + FileUtils.deleteQuietly(_tempDir); + } + + /** + * Verifies that when tracking is disabled on a {@link FixedByteChunkForwardIndexWriter}, + * {@code getUncompressedSize()} returns 0 after writing data. 
+ */ + @Test + public void testFixedByteWriterTrackingDisabled() + throws IOException { + File outputFile = new File(_tempDir, "fixed_byte_tracking_disabled.raw"); + try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter( + outputFile, COMPRESSION_TYPE, TOTAL_DOCS, NUM_DOCS_PER_CHUNK, SIZE_OF_INT_ENTRY, WRITER_VERSION)) { + writer.setTrackUncompressedSize(false); + for (int i = 0; i < NUM_ENTRIES_TO_WRITE; i++) { + writer.putInt(i); + } + assertEquals(writer.getUncompressedSize(), 0L, + "Uncompressed size should be 0 when tracking is disabled"); + } + } + + /** + * Verifies that with default tracking enabled on a {@link FixedByteChunkForwardIndexWriter}, + * {@code getUncompressedSize()} returns a value greater than 0 after writing data. + */ + @Test + public void testFixedByteWriterTrackingEnabled() + throws IOException { + File outputFile = new File(_tempDir, "fixed_byte_tracking_enabled.raw"); + try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter( + outputFile, COMPRESSION_TYPE, TOTAL_DOCS, NUM_DOCS_PER_CHUNK, SIZE_OF_INT_ENTRY, WRITER_VERSION)) { + for (int i = 0; i < NUM_ENTRIES_TO_WRITE; i++) { + writer.putInt(i); + } + assertTrue(writer.getUncompressedSize() > 0, + "Uncompressed size should be greater than 0 when tracking is enabled (default)"); + } + } + + /** + * Verifies that when tracking is disabled on a {@link VarByteChunkForwardIndexWriterV5}, + * {@code getUncompressedSize()} returns 0 after writing and closing. + * + *
<p>
The VarByte V4/V5 writer only records uncompressed size when a chunk is flushed + * (either when the chunk buffer fills up or during {@code close()}), so we must close + * the writer before asserting. + */ + @Test + public void testVarByteV5WriterTrackingDisabled() + throws IOException { + File outputFile = new File(_tempDir, "var_byte_v5_tracking_disabled.raw"); + VarByteChunkForwardIndexWriterV5 writer = new VarByteChunkForwardIndexWriterV5( + outputFile, COMPRESSION_TYPE, VAR_BYTE_CHUNK_SIZE); + try { + writer.setTrackUncompressedSize(false); + for (int i = 0; i < NUM_ENTRIES_TO_WRITE; i++) { + writer.putString("test-string-value-" + i); + } + } finally { + writer.close(); + } + assertEquals(writer.getUncompressedSize(), 0L, + "Uncompressed size should be 0 when tracking is disabled"); + } + + /** + * Verifies that with default tracking enabled on a {@link VarByteChunkForwardIndexWriterV5}, + * {@code getUncompressedSize()} returns a value greater than 0 after writing and closing. + * + *
<p>
The VarByte V4/V5 writer only records uncompressed size when a chunk is flushed + * (either when the chunk buffer fills up or during {@code close()}), so we must close + * the writer before asserting. + */ + @Test + public void testVarByteV5WriterTrackingEnabled() + throws IOException { + File outputFile = new File(_tempDir, "var_byte_v5_tracking_enabled.raw"); + VarByteChunkForwardIndexWriterV5 writer = new VarByteChunkForwardIndexWriterV5( + outputFile, COMPRESSION_TYPE, VAR_BYTE_CHUNK_SIZE); + try { + for (int i = 0; i < NUM_ENTRIES_TO_WRITE; i++) { + writer.putString("test-string-value-" + i); + } + } finally { + writer.close(); + } + assertTrue(writer.getUncompressedSize() > 0, + "Uncompressed size should be greater than 0 when tracking is enabled (default)"); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CLPForwardIndexCreatorV2Test.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CLPForwardIndexCreatorV2Test.java index 52b8292b4971..2e9b116c134d 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CLPForwardIndexCreatorV2Test.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CLPForwardIndexCreatorV2Test.java @@ -173,4 +173,46 @@ private long createClpImmutableForwardIndex(CLPMutableForwardIndexV2 clpMutableF File indexFile = new File(TEMP_DIR, COLUMN_NAME + V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); return indexFile.length(); } + + @Test + public void testGetUncompressedSizeAfterWriting() + throws IOException { + try (CLPMutableForwardIndexV2 mutable = new CLPMutableForwardIndexV2(COLUMN_NAME, _memoryManager)) { + for (int i = 0; i < _logMessages.size(); i++) { + mutable.setString(i, _logMessages.get(i)); + } + TestUtils.ensureDirectoriesExistAndEmpty(TEMP_DIR); + CLPForwardIndexCreatorV2 creator = + new CLPForwardIndexCreatorV2(TEMP_DIR, 
mutable, ChunkCompressionType.ZSTANDARD); + creator.setTrackUncompressedSize(true); + for (int i = 0; i < _logMessages.size(); i++) { + creator.putString(mutable.getString(i)); + } + creator.seal(); + creator.close(); + Assert.assertTrue(creator.getUncompressedSize() > 0, + "getUncompressedSize() should be > 0 after writing with tracking enabled"); + } + } + + @Test + public void testGetUncompressedSizeDisabledReturnsZero() + throws IOException { + try (CLPMutableForwardIndexV2 mutable = new CLPMutableForwardIndexV2(COLUMN_NAME, _memoryManager)) { + for (int i = 0; i < _logMessages.size(); i++) { + mutable.setString(i, _logMessages.get(i)); + } + TestUtils.ensureDirectoriesExistAndEmpty(TEMP_DIR); + CLPForwardIndexCreatorV2 creator = + new CLPForwardIndexCreatorV2(TEMP_DIR, mutable, ChunkCompressionType.ZSTANDARD); + creator.setTrackUncompressedSize(false); + for (int i = 0; i < _logMessages.size(); i++) { + creator.putString(mutable.getString(i)); + } + creator.seal(); + creator.close(); + Assert.assertEquals(creator.getUncompressedSize(), 0L, + "getUncompressedSize() should be 0 when tracking is disabled"); + } + } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsCornerCaseTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsCornerCaseTest.java new file mode 100644 index 000000000000..f30b38f054cb --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsCornerCaseTest.java @@ -0,0 +1,276 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; +import org.apache.pinot.segment.spi.ColumnMetadata; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.IndexingConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.JsonUtils; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Corner case tests for the compression stats feature: + *
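<ul> + * <li>All columns dictionary-encoded: no compression stats are produced and segment creation does not fail</li> + * <li>Flag off, then on: stats are absent with the flag disabled and present after rebuilding with it enabled</li> + * <li>Old segment built without stats: metadata reads return the unavailable/null defaults</li> + * <li>JSON round trip of {@code IndexingConfig} and {@code TableConfig} preserves {@code compressionStatsEnabled}</li> + * <li>Old {@code IndexingConfig} JSON without the field deserializes with the flag defaulting to {@code false}</li> + * </ul>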
+ * + *
<p>
Uses the same segment building pattern as {@link CompressionStatsSegmentCreationTest}. + */ +public class CompressionStatsCornerCaseTest { + private static final File TEMP_DIR = + new File(FileUtils.getTempDirectory(), CompressionStatsCornerCaseTest.class.getSimpleName()); + private static final String RAW_TABLE_NAME = "compressionCornerCase"; + private static final String SEGMENT_NAME = "cornerCaseSegment"; + private static final int NUM_ROWS = 5000; + private static final Random RANDOM = new Random(42); + + private static final String INT_RAW_COL = "intRawCol"; + private static final String STRING_RAW_COL = "stringRawCol"; + private static final String DICT_COL = "dictCol"; + + @BeforeMethod + public void setUp() { + FileUtils.deleteQuietly(TEMP_DIR); + } + + @AfterMethod + public void tearDown() { + FileUtils.deleteQuietly(TEMP_DIR); + } + + /** + * Builds a segment with the given config. Uses the same proven pattern as + * {@link CompressionStatsSegmentCreationTest}. + */ + private File buildSegment(boolean compressionStatsEnabled, String compressionCodec) + throws Exception { + Schema schema = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(INT_RAW_COL, DataType.INT) + .addSingleValueDimension(STRING_RAW_COL, DataType.STRING) + .addSingleValueDimension(DICT_COL, DataType.STRING) + .build(); + + List fieldConfigs = new ArrayList<>(); + if (compressionCodec != null) { + FieldConfig.CompressionCodec codec = FieldConfig.CompressionCodec.valueOf(compressionCodec); + fieldConfigs.add(new FieldConfig(INT_RAW_COL, FieldConfig.EncodingType.RAW, List.of(), codec, null)); + fieldConfigs.add(new FieldConfig(STRING_RAW_COL, FieldConfig.EncodingType.RAW, List.of(), codec, null)); + } + + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setNoDictionaryColumns(List.of(INT_RAW_COL, STRING_RAW_COL)) + .setFieldConfigList(fieldConfigs) + .build(); + + if (compressionStatsEnabled) { + 
tableConfig.getIndexingConfig().setCompressionStatsEnabled(true); + } + + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + config.setOutDir(TEMP_DIR.getAbsolutePath()); + config.setSegmentName(SEGMENT_NAME); + + List rows = generateTestData(); + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + driver.init(config, new GenericRowRecordReader(rows)); + driver.build(); + + return new File(TEMP_DIR, SEGMENT_NAME); + } + + private List generateTestData() { + List rows = new ArrayList<>(NUM_ROWS); + for (int i = 0; i < NUM_ROWS; i++) { + GenericRow row = new GenericRow(); + row.putValue(INT_RAW_COL, RANDOM.nextInt(100000)); + row.putValue(STRING_RAW_COL, RandomStringUtils.secure().nextAlphanumeric(20 + RANDOM.nextInt(80))); + row.putValue(DICT_COL, "value_" + (i % 100)); + rows.add(row); + } + return rows; + } + + @Test + public void testAllDictionaryColumnsNoCrash() + throws Exception { + // When ALL columns are dictionary-encoded, compression stats should gracefully produce no stats. + // This tests the division-by-zero safety (compressed = 0 → ratio = 0). 
+ Schema schema = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(DICT_COL, DataType.STRING) + .addSingleValueDimension("dictCol2", DataType.INT) + .build(); + + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(RAW_TABLE_NAME) + .build(); + tableConfig.getIndexingConfig().setCompressionStatsEnabled(true); + + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + config.setOutDir(TEMP_DIR.getAbsolutePath()); + config.setSegmentName(SEGMENT_NAME); + + List rows = new ArrayList<>(NUM_ROWS); + for (int i = 0; i < NUM_ROWS; i++) { + GenericRow row = new GenericRow(); + row.putValue(DICT_COL, "value_" + (i % 50)); + row.putValue("dictCol2", i % 100); + rows.add(row); + } + + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + driver.init(config, new GenericRowRecordReader(rows)); + driver.build(); + + File segmentDir = new File(TEMP_DIR, SEGMENT_NAME); + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + for (String colName : schema.getColumnNames()) { + ColumnMetadata colMeta = metadata.getColumnMetadataFor(colName); + assertTrue(colMeta.hasDictionary(), colName + " should have dictionary"); + assertEquals(colMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + colName + " should not have uncompressed forward index size"); + assertNull(colMeta.getCompressionCodec(), + colName + " should not have compression codec"); + } + } + + @Test + public void testFlagOffThenOnProducesStats() + throws Exception { + // Flag OFF: no stats persisted + File segmentDirOff = buildSegment(false, "LZ4"); + SegmentMetadataImpl metadataOff = new SegmentMetadataImpl(segmentDirOff); + + ColumnMetadata rawMetaOff = metadataOff.getColumnMetadataFor(INT_RAW_COL); + assertFalse(rawMetaOff.hasDictionary()); + assertEquals(rawMetaOff.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Flag OFF should not track 
uncompressed size"); + assertNull(rawMetaOff.getCompressionCodec(), + "Flag OFF should not track compression codec"); + + // Clean up and rebuild with flag ON + FileUtils.deleteQuietly(TEMP_DIR); + + // Flag ON: stats should be persisted + File segmentDirOn = buildSegment(true, "LZ4"); + SegmentMetadataImpl metadataOn = new SegmentMetadataImpl(segmentDirOn); + + ColumnMetadata rawMetaOn = metadataOn.getColumnMetadataFor(INT_RAW_COL); + assertFalse(rawMetaOn.hasDictionary()); + assertTrue(rawMetaOn.getUncompressedForwardIndexSizeBytes() > 0, + "Flag ON should track uncompressed size"); + assertEquals(rawMetaOn.getCompressionCodec(), "LZ4", + "Flag ON should track compression codec"); + } + + @Test + public void testOldSegmentWithoutStatsIsBackwardCompatible() + throws Exception { + // Simulate an "old" segment: raw columns but no compressionStatsEnabled, no field configs + File segmentDir = buildSegment(false, null); + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + ColumnMetadata rawMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertNotNull(rawMeta); + assertFalse(rawMeta.hasDictionary()); + assertEquals(rawMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Old segment should return INDEX_NOT_FOUND for uncompressed size"); + assertNull(rawMeta.getCompressionCodec(), + "Old segment should return null for compression codec"); + + ColumnMetadata dictMeta = metadata.getColumnMetadataFor(DICT_COL); + assertNotNull(dictMeta); + assertTrue(dictMeta.hasDictionary()); + assertEquals(dictMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE); + assertNull(dictMeta.getCompressionCodec()); + } + + @Test + public void testIndexingConfigJsonRoundTrip() + throws Exception { + IndexingConfig original = new IndexingConfig(); + original.setCompressionStatsEnabled(true); + + String json = JsonUtils.objectToString(original); + assertTrue(json.contains("compressionStatsEnabled"), + "JSON should contain 
compressionStatsEnabled field"); + + IndexingConfig deserialized = JsonUtils.stringToObject(json, IndexingConfig.class); + assertTrue(deserialized.isCompressionStatsEnabled(), + "Deserialized config should have compressionStatsEnabled=true"); + + original.setCompressionStatsEnabled(false); + json = JsonUtils.objectToString(original); + deserialized = JsonUtils.stringToObject(json, IndexingConfig.class); + assertFalse(deserialized.isCompressionStatsEnabled(), + "Deserialized config should have compressionStatsEnabled=false"); + } + + @Test + public void testTableConfigJsonRoundTripWithCompressionStats() + throws Exception { + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("testTable") + .setNoDictionaryColumns(List.of("col1", "col2")) + .build(); + tableConfig.getIndexingConfig().setCompressionStatsEnabled(true); + + String json = JsonUtils.objectToString(tableConfig); + TableConfig deserialized = JsonUtils.stringToObject(json, TableConfig.class); + + assertTrue(deserialized.getIndexingConfig().isCompressionStatsEnabled(), + "Table config round-trip should preserve compressionStatsEnabled"); + assertEquals(deserialized.getIndexingConfig().getNoDictionaryColumns(), List.of("col1", "col2"), + "Table config round-trip should preserve noDictionaryColumns"); + } + + @Test + public void testOldIndexingConfigJsonWithoutFieldDeserializes() + throws Exception { + String oldJson = "{\"noDictionaryColumns\":[\"col1\"]}"; + IndexingConfig deserialized = JsonUtils.stringToObject(oldJson, IndexingConfig.class); + assertFalse(deserialized.isCompressionStatsEnabled(), + "Missing compressionStatsEnabled in IndexingConfig should default to false"); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsSegmentCreationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsSegmentCreationTest.java new file mode 100644 index 
000000000000..e5ce95f70451 --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/CompressionStatsSegmentCreationTest.java @@ -0,0 +1,281 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; +import org.apache.pinot.segment.spi.ColumnMetadata; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.segment.spi.index.StandardIndexes; +import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.IndexingConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests that compression stats are correctly tracked and persisted during segment creation + * when compressionStatsEnabled is set in the table config. 
+ */ +public class CompressionStatsSegmentCreationTest { + private static final File TEMP_DIR = + new File(FileUtils.getTempDirectory(), CompressionStatsSegmentCreationTest.class.getSimpleName()); + private static final String RAW_TABLE_NAME = "compressionStatsTable"; + private static final String SEGMENT_NAME = "compressionStatsSegment"; + private static final int NUM_ROWS = 5000; + private static final Random RANDOM = new Random(42); + + private static final String INT_RAW_COL = "intRawCol"; + private static final String STRING_RAW_COL = "stringRawCol"; + private static final String DICT_COL = "dictCol"; + + @BeforeMethod + public void setUp() { + FileUtils.deleteQuietly(TEMP_DIR); + } + + @AfterMethod + public void tearDown() { + FileUtils.deleteQuietly(TEMP_DIR); + } + + private List generateTestData() { + List rows = new ArrayList<>(NUM_ROWS); + for (int i = 0; i < NUM_ROWS; i++) { + GenericRow row = new GenericRow(); + row.putValue(INT_RAW_COL, RANDOM.nextInt(100000)); + row.putValue(STRING_RAW_COL, RandomStringUtils.secure().nextAlphanumeric(20 + RANDOM.nextInt(80))); + row.putValue(DICT_COL, "value_" + (i % 100)); + rows.add(row); + } + return rows; + } + + private File buildSegment(boolean compressionStatsEnabled, String compressionCodec) + throws Exception { + Schema schema = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(INT_RAW_COL, DataType.INT) + .addSingleValueDimension(STRING_RAW_COL, DataType.STRING) + .addSingleValueDimension(DICT_COL, DataType.STRING) + .build(); + + List fieldConfigs = new ArrayList<>(); + if (compressionCodec != null) { + FieldConfig.CompressionCodec codec = FieldConfig.CompressionCodec.valueOf(compressionCodec); + fieldConfigs.add(new FieldConfig(INT_RAW_COL, FieldConfig.EncodingType.RAW, List.of(), codec, null)); + fieldConfigs.add(new FieldConfig(STRING_RAW_COL, FieldConfig.EncodingType.RAW, List.of(), codec, null)); + } + + TableConfig tableConfig = new 
TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setNoDictionaryColumns(List.of(INT_RAW_COL, STRING_RAW_COL)) + .setFieldConfigList(fieldConfigs) + .build(); + + if (compressionStatsEnabled) { + IndexingConfig indexingConfig = tableConfig.getIndexingConfig(); + indexingConfig.setCompressionStatsEnabled(true); + } + + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + config.setOutDir(TEMP_DIR.getAbsolutePath()); + config.setSegmentName(SEGMENT_NAME); + + List rows = generateTestData(); + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + driver.init(config, new GenericRowRecordReader(rows)); + driver.build(); + + return new File(TEMP_DIR, SEGMENT_NAME); + } + + @Test + public void testCompressionStatsEnabled() + throws Exception { + File segmentDir = buildSegment(true, "LZ4"); + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + // Raw int column should have uncompressed size tracked + ColumnMetadata intMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertNotNull(intMeta); + assertFalse(intMeta.hasDictionary()); + long intUncompressedSize = intMeta.getUncompressedForwardIndexSizeBytes(); + assertTrue(intUncompressedSize > 0, + "Uncompressed size for raw int column should be > 0, got: " + intUncompressedSize); + + // The uncompressed size reflects the total chunk buffer bytes written before compression. + // For fixed-width types this is the actual data chunked into chunk-buffer-sized blocks. + // It should be > 0 and in a reasonable range relative to the raw data size. 
+ long rawDataSize = (long) NUM_ROWS * Integer.BYTES; + assertTrue(intUncompressedSize > 0 && intUncompressedSize <= rawDataSize * 2, + "Uncompressed int size " + intUncompressedSize + " should be > 0 and within 2x of raw data size " + + rawDataSize); + + // Compression codec should be persisted + assertEquals(intMeta.getCompressionCodec(), "LZ4"); + + // Raw string column should also have stats + ColumnMetadata stringMeta = metadata.getColumnMetadataFor(STRING_RAW_COL); + assertNotNull(stringMeta); + assertFalse(stringMeta.hasDictionary()); + long stringUncompressedSize = stringMeta.getUncompressedForwardIndexSizeBytes(); + assertTrue(stringUncompressedSize > 0, + "Uncompressed size for raw string column should be > 0, got: " + stringUncompressedSize); + assertEquals(stringMeta.getCompressionCodec(), "LZ4"); + + // The compressed forward index size should be less than uncompressed for random string data + long stringCompressedSize = stringMeta.getIndexSizeFor(StandardIndexes.forward()); + assertTrue(stringCompressedSize > 0, "Compressed forward index size should be > 0"); + // Note: for LZ4, random data may not compress well, but the sizes should be trackable + + // Verify compression ratio is meaningful + if (stringCompressedSize > 0 && stringUncompressedSize > 0) { + double ratio = (double) stringUncompressedSize / stringCompressedSize; + assertTrue(ratio > 0, "Compression ratio should be > 0, got: " + ratio); + } + + // Dictionary-encoded column should NOT have uncompressed forward index stats + ColumnMetadata dictMeta = metadata.getColumnMetadataFor(DICT_COL); + assertNotNull(dictMeta); + assertTrue(dictMeta.hasDictionary()); + assertEquals(dictMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Dictionary-encoded column should not have uncompressed forward index size"); + assertNull(dictMeta.getCompressionCodec(), + "Dictionary-encoded column should not have compression codec"); + } + + @Test + public void 
testCompressionStatsDisabled() + throws Exception { + File segmentDir = buildSegment(false, "LZ4"); + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + // When compressionStatsEnabled is false, no uncompressed size should be persisted + ColumnMetadata intMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertNotNull(intMeta); + assertEquals(intMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Uncompressed size should not be tracked when compressionStatsEnabled is false"); + assertNull(intMeta.getCompressionCodec(), + "Compression codec should not be tracked when compressionStatsEnabled is false"); + } + + @Test + public void testCompressionStatsWithZstandard() + throws Exception { + File segmentDir = buildSegment(true, "ZSTANDARD"); + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + ColumnMetadata intMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertTrue(intMeta.getUncompressedForwardIndexSizeBytes() > 0); + assertEquals(intMeta.getCompressionCodec(), "ZSTANDARD"); + + ColumnMetadata stringMeta = metadata.getColumnMetadataFor(STRING_RAW_COL); + assertTrue(stringMeta.getUncompressedForwardIndexSizeBytes() > 0); + assertEquals(stringMeta.getCompressionCodec(), "ZSTANDARD"); + } + + @Test + public void testCompressionStatsWithSnappy() + throws Exception { + File segmentDir = buildSegment(true, "SNAPPY"); + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + ColumnMetadata intMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertTrue(intMeta.getUncompressedForwardIndexSizeBytes() > 0); + assertEquals(intMeta.getCompressionCodec(), "SNAPPY"); + } + + @Test + public void testDefaultCodecPersistedWhenNoExplicitConfig() + throws Exception { + // Build segment with compressionStatsEnabled=true but no explicit compression codec. + // The default codec (LZ4 for DIMENSION columns) should be resolved and persisted. 
+ File segmentDir = buildSegment(true, null); + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(segmentDir); + + // Raw int column (DIMENSION type) should get LZ4 as default codec + ColumnMetadata intMeta = metadata.getColumnMetadataFor(INT_RAW_COL); + assertNotNull(intMeta); + assertFalse(intMeta.hasDictionary()); + assertEquals(intMeta.getCompressionCodec(), "LZ4", + "Default codec LZ4 should be persisted for DIMENSION column when no explicit codec configured"); + assertTrue(intMeta.getUncompressedForwardIndexSizeBytes() > 0, + "Uncompressed size should be > 0"); + + // Raw string column (DIMENSION type) should also get LZ4 + ColumnMetadata stringMeta = metadata.getColumnMetadataFor(STRING_RAW_COL); + assertNotNull(stringMeta); + assertFalse(stringMeta.hasDictionary()); + assertEquals(stringMeta.getCompressionCodec(), "LZ4", + "Default codec LZ4 should be persisted for DIMENSION string column"); + } + + @Test + public void testUncompressedSizeConsistencyAcrossCodecs() + throws Exception { + // Create segments with different codecs and verify uncompressed sizes are consistent + // (the raw data is the same, so uncompressed sizes should be identical) + File lz4Segment = buildSegment(true, "LZ4"); + SegmentMetadataImpl lz4Metadata = new SegmentMetadataImpl(lz4Segment); + long lz4IntUncompressed = lz4Metadata.getColumnMetadataFor(INT_RAW_COL).getUncompressedForwardIndexSizeBytes(); + long lz4StringUncompressed = + lz4Metadata.getColumnMetadataFor(STRING_RAW_COL).getUncompressedForwardIndexSizeBytes(); + + // Clean up and rebuild with different codec + FileUtils.deleteQuietly(TEMP_DIR); + + File zstdSegment = buildSegment(true, "ZSTANDARD"); + SegmentMetadataImpl zstdMetadata = new SegmentMetadataImpl(zstdSegment); + long zstdIntUncompressed = zstdMetadata.getColumnMetadataFor(INT_RAW_COL).getUncompressedForwardIndexSizeBytes(); + long zstdStringUncompressed = + zstdMetadata.getColumnMetadataFor(STRING_RAW_COL).getUncompressedForwardIndexSizeBytes(); + + // 
Fixed-width int column: uncompressed size should be exactly the same regardless of codec + assertEquals(lz4IntUncompressed, zstdIntUncompressed, + "Uncompressed size for fixed-width int column should be identical across codecs"); + + // Variable-width string column: uncompressed sizes may differ slightly due to chunk layout + // but should be within a reasonable range (within 10%) + double stringDiffPercent = + Math.abs((double) (lz4StringUncompressed - zstdStringUncompressed)) / lz4StringUncompressed * 100; + assertTrue(stringDiffPercent < 10, + "Uncompressed string sizes should be similar across codecs. LZ4=" + lz4StringUncompressed + + " ZSTD=" + zstdStringUncompressed + " diff=" + stringDiffPercent + "%"); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/ForwardIndexWriterUncompressedSizeTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/ForwardIndexWriterUncompressedSizeTest.java new file mode 100644 index 000000000000..1951fb9d009b --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/creator/ForwardIndexWriterUncompressedSizeTest.java @@ -0,0 +1,319 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.segment.index.creator; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.UUID; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriter; +import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests that forward index writers correctly track uncompressed data size. + * + *

Verifies the {@code _uncompressedSize} field in both {@code BaseChunkForwardIndexWriter} + * (via {@link FixedByteChunkForwardIndexWriter}) and {@link VarByteChunkForwardIndexWriterV4} + * across multiple compression types. + * + *

Important: V4+ writers normalize numDocsPerChunk to the next power of 2. Most tests use + * power-of-2 doc counts so that every chunk is full. {@code testPartialChunkAccountedInClose} + * covers a non-aligned count and asserts that the full size is already reported before + * close(). + */ +public class ForwardIndexWriterUncompressedSizeTest { + // Use power-of-2 aligned counts so V4's chunk normalization doesn't create partial chunks + private static final int NUM_DOCS = 1024; + private static final int DOCS_PER_CHUNK = 128; // already power-of-2, no normalization needed + private File _tempDir; + + @BeforeMethod + public void setUp() + throws IOException { + _tempDir = new File(FileUtils.getTempDirectory(), + ForwardIndexWriterUncompressedSizeTest.class.getSimpleName() + "_" + UUID.randomUUID()); + FileUtils.forceMkdir(_tempDir); + } + + @AfterMethod + public void tearDown() { + FileUtils.deleteQuietly(_tempDir); + } + + @DataProvider(name = "compressionTypes") + public Object[][] compressionTypes() { + return new Object[][]{ + {ChunkCompressionType.LZ4}, + {ChunkCompressionType.ZSTANDARD}, + {ChunkCompressionType.SNAPPY}, + {ChunkCompressionType.PASS_THROUGH} + }; + } + + @Test(dataProvider = "compressionTypes") + public void testFixedByteWriterIntTracksUncompressedSize(ChunkCompressionType compressionType) + throws IOException { + File file = new File(_tempDir, "fixedInt_" + compressionType.name()); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, compressionType, NUM_DOCS, DOCS_PER_CHUNK, Integer.BYTES, 4)) { + for (int i = 0; i < NUM_DOCS; i++) { + writer.putInt(i); + } + // 1024 docs / 128 per chunk = 8 full chunks, all flushed before close + long expected = (long) NUM_DOCS * Integer.BYTES; + assertEquals(writer.getUncompressedSize(), expected, + "Uncompressed size should equal NUM_DOCS * INT_BYTES for " + compressionType); + } + } + + @Test(dataProvider = "compressionTypes") + public void 
testFixedByteWriterLongTracksUncompressedSize(ChunkCompressionType compressionType) + throws IOException { + File file = new File(_tempDir, "fixedLong_" + compressionType.name()); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, compressionType, NUM_DOCS, DOCS_PER_CHUNK, Long.BYTES, 4)) { + for (int i = 0; i < NUM_DOCS; i++) { + writer.putLong(i * 1000L); + } + long expected = (long) NUM_DOCS * Long.BYTES; + assertEquals(writer.getUncompressedSize(), expected, + "Uncompressed size should equal NUM_DOCS * LONG_BYTES for " + compressionType); + } + } + + @Test(dataProvider = "compressionTypes") + public void testFixedByteWriterDoubleTracksUncompressedSize(ChunkCompressionType compressionType) + throws IOException { + File file = new File(_tempDir, "fixedDouble_" + compressionType.name()); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, compressionType, NUM_DOCS, DOCS_PER_CHUNK, Double.BYTES, 4)) { + for (int i = 0; i < NUM_DOCS; i++) { + writer.putDouble(i * 0.5); + } + long expected = (long) NUM_DOCS * Double.BYTES; + assertEquals(writer.getUncompressedSize(), expected, + "Uncompressed size should equal NUM_DOCS * DOUBLE_BYTES for " + compressionType); + } + } + + @Test(dataProvider = "compressionTypes") + public void testVarByteV4WriterSVTracksUncompressedSize(ChunkCompressionType compressionType) + throws IOException { + File file = new File(_tempDir, "varByteSV_" + compressionType.name()); + String[] values = new String[NUM_DOCS]; + long totalRawBytes = 0; + for (int i = 0; i < NUM_DOCS; i++) { + values[i] = "test_string_" + i; + totalRawBytes += values[i].getBytes(StandardCharsets.UTF_8).length; + } + + try (VarByteChunkForwardIndexWriterV4 writer = + new VarByteChunkForwardIndexWriterV4(file, compressionType, 1024)) { + for (String value : values) { + writer.putString(value); + } + long uncompressedSize = writer.getUncompressedSize(); + assertTrue(uncompressedSize > 0, + 
"Uncompressed size should be > 0 for " + compressionType + ", got: " + uncompressedSize); + // V4 wraps each string in: 4-byte length prefix + raw bytes, so uncompressed size >= raw bytes + assertTrue(uncompressedSize >= totalRawBytes, + "Uncompressed size " + uncompressedSize + " should be >= raw string bytes " + totalRawBytes); + } + } + + @Test + public void testUncompressedSizeConsistentAcrossCompressionTypes() + throws IOException { + // Fixed-width INT column: uncompressed size must be EXACTLY the same regardless of compression type + long[] sizes = new long[4]; + ChunkCompressionType[] types = { + ChunkCompressionType.LZ4, ChunkCompressionType.ZSTANDARD, + ChunkCompressionType.SNAPPY, ChunkCompressionType.PASS_THROUGH + }; + + for (int t = 0; t < types.length; t++) { + File file = new File(_tempDir, "consistency_" + types[t].name()); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, types[t], NUM_DOCS, DOCS_PER_CHUNK, Integer.BYTES, 4)) { + for (int i = 0; i < NUM_DOCS; i++) { + writer.putInt(i * 7); + } + sizes[t] = writer.getUncompressedSize(); + } + } + + for (int t = 1; t < types.length; t++) { + assertEquals(sizes[t], sizes[0], + "Uncompressed size should be identical for " + types[t] + " and " + types[0]); + } + } + + @Test + public void testVarByteV4UncompressedSizeConsistentAcrossCompressionTypes() + throws IOException { + // Variable-width STRING column: uncompressed size must be the same regardless of compression type + long[] sizes = new long[4]; + ChunkCompressionType[] types = { + ChunkCompressionType.LZ4, ChunkCompressionType.ZSTANDARD, + ChunkCompressionType.SNAPPY, ChunkCompressionType.PASS_THROUGH + }; + String[] values = new String[NUM_DOCS]; + for (int i = 0; i < NUM_DOCS; i++) { + values[i] = "consistent_value_" + i; + } + + for (int t = 0; t < types.length; t++) { + File file = new File(_tempDir, "varByteConsistency_" + types[t].name()); + try (VarByteChunkForwardIndexWriterV4 writer = + new 
VarByteChunkForwardIndexWriterV4(file, types[t], 1024)) { + for (String value : values) { + writer.putString(value); + } + sizes[t] = writer.getUncompressedSize(); + } + } + + for (int t = 1; t < types.length; t++) { + assertEquals(sizes[t], sizes[0], + "VarByte V4 uncompressed size should be identical for " + types[t] + " and " + types[0]); + } + } + + @Test + public void testPassthroughCompressionRatioIsOne() + throws IOException { + // With PASS_THROUGH compression, uncompressed size should still be tracked correctly. + File file = new File(_tempDir, "passthrough"); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, ChunkCompressionType.PASS_THROUGH, NUM_DOCS, + DOCS_PER_CHUNK, Integer.BYTES, 4)) { + for (int i = 0; i < NUM_DOCS; i++) { + writer.putInt(i); + } + long uncompressedSize = writer.getUncompressedSize(); + assertEquals(uncompressedSize, (long) NUM_DOCS * Integer.BYTES, + "PASS_THROUGH uncompressed size should equal exact data size"); + } + // The file size should be >= uncompressed size (includes headers + data with no compression savings) + assertTrue(file.length() >= (long) NUM_DOCS * Integer.BYTES, + "PASS_THROUGH file size should be >= uncompressed data size"); + } + + @Test + public void testEmptyWriterHasZeroUncompressedSize() + throws IOException { + File file = new File(_tempDir, "empty"); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, ChunkCompressionType.LZ4, 0, DOCS_PER_CHUNK, Integer.BYTES, 4)) { + assertEquals(writer.getUncompressedSize(), 0, "Empty writer should have 0 uncompressed size"); + } + } + + @Test + public void testSingleDocUncompressedSize() + throws IOException { + // V4 normalizes numDocsPerChunk=1 → 1 (already power-of-2). + // After writing 1 doc, the chunk is full → flushed immediately → uncompressed size = 4. 
+ File file = new File(_tempDir, "singleDoc"); + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, ChunkCompressionType.LZ4, 1, 1, Integer.BYTES, 4)) { + writer.putInt(42); + assertEquals(writer.getUncompressedSize(), Integer.BYTES, + "Single INT doc should have uncompressed size = 4"); + } + } + + @Test + public void testMultipleChunksAccumulateCorrectly() + throws IOException { + // Use power-of-2 docs per chunk so each chunk boundary is predictable + File file = new File(_tempDir, "multiChunk"); + int docsPerChunk = 16; // power-of-2, no normalization + int totalDocs = 128; // 128 / 16 = 8 full chunks + try (FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, ChunkCompressionType.LZ4, totalDocs, + docsPerChunk, Integer.BYTES, 4)) { + for (int i = 0; i < totalDocs; i++) { + writer.putInt(i); + // After each full chunk, verify accumulated size + if ((i + 1) % docsPerChunk == 0) { + long expectedSoFar = (long) (i + 1) * Integer.BYTES; + assertEquals(writer.getUncompressedSize(), expectedSoFar, + "After " + (i + 1) + " docs, uncompressed size should be " + expectedSoFar); + } + } + assertEquals(writer.getUncompressedSize(), (long) totalDocs * Integer.BYTES); + } + } + + @Test + public void testVarByteV4MultiValueTracksUncompressedSize() + throws IOException { + File file = new File(_tempDir, "varByteMV"); + try (VarByteChunkForwardIndexWriterV4 writer = + new VarByteChunkForwardIndexWriterV4(file, ChunkCompressionType.LZ4, 4096)) { + for (int i = 0; i < 100; i++) { + String[] mvValues = {"value_" + i + "_a", "value_" + i + "_b", "value_" + i + "_c"}; + writer.putStringMV(mvValues); + } + assertTrue(writer.getUncompressedSize() > 0, + "MV writer should track non-zero uncompressed size"); + } + } + + @Test + public void testPartialChunkAccountedInClose() + throws IOException { + // Use non-aligned doc count so there's a partial chunk that's flushed during close() + // V4 normalizes 100 → 128 
docs per chunk. 500 docs / 128 = 3 full chunks + 116 remaining. + // Tracked size is 500 * 4 = 2000 bytes both before and after close, not just the 3 full chunks (1536 bytes). + File file = new File(_tempDir, "partialChunk"); + int totalDocs = 500; + int requestedDocsPerChunk = 100; // normalized to 128 by V4 + + FixedByteChunkForwardIndexWriter writer = + new FixedByteChunkForwardIndexWriter(file, ChunkCompressionType.LZ4, totalDocs, + requestedDocsPerChunk, Integer.BYTES, 4); + for (int i = 0; i < totalDocs; i++) { + writer.putInt(i); + } + + // With per-value tracking, all values are accounted for immediately (not per-chunk) + long expectedTotal = (long) totalDocs * Integer.BYTES; + assertEquals(writer.getUncompressedSize(), expectedTotal, + "Before close, all written values should be tracked"); + + // After close: same total — close flushes the chunk buffer but doesn't change uncompressed size + writer.close(); + assertEquals(writer.getUncompressedSize(), expectedTotal, + "After close, total uncompressed size should be unchanged"); + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexTypeTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexTypeTest.java index 43fde236a769..02bb40268b7d 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexTypeTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexTypeTest.java @@ -31,6 +31,7 @@ import org.apache.pinot.segment.spi.index.ForwardIndexConfig; import org.apache.pinot.segment.spi.index.StandardIndexes; import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.utils.JsonUtils; 
import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -495,4 +496,72 @@ public void testStandardIndex() { assertSame(StandardIndexes.forward(), StandardIndexes.forward(), "Standard index should use the same as " + "the ForwardIndexType static instance"); } + + @Test + public void testResolveCompressionType() { + // CLP codec → PASS_THROUGH + ForwardIndexConfig clpConfig = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.CLP) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(clpConfig, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.PASS_THROUGH, "CLP codec should resolve to PASS_THROUGH"); + + // CLPV2 codec → ZSTANDARD + ForwardIndexConfig clpv2Config = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.CLPV2) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(clpv2Config, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.ZSTANDARD, "CLPV2 codec should resolve to ZSTANDARD"); + + // CLPV2_ZSTD codec → ZSTANDARD + ForwardIndexConfig clpv2ZstdConfig = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.CLPV2_ZSTD) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(clpv2ZstdConfig, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.ZSTANDARD, "CLPV2_ZSTD codec should resolve to ZSTANDARD"); + + // CLPV2_LZ4 codec → LZ4 + ForwardIndexConfig clpv2Lz4Config = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.CLPV2_LZ4) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(clpv2Lz4Config, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.LZ4, "CLPV2_LZ4 codec should resolve to LZ4"); + + // Regular non-CLP codec (SNAPPY) → uses fwdConfig.getChunkCompressionType() + ForwardIndexConfig 
snappyConfig = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.SNAPPY) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(snappyConfig, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.SNAPPY, "SNAPPY codec should resolve to SNAPPY via getChunkCompressionType()"); + + // Regular non-CLP codec (ZSTANDARD) → uses fwdConfig.getChunkCompressionType() + ForwardIndexConfig zstdConfig = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .withCompressionCodec(FieldConfig.CompressionCodec.ZSTANDARD) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(zstdConfig, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.ZSTANDARD, "ZSTANDARD codec should resolve to ZSTANDARD via getChunkCompressionType()"); + + // No codec, no chunk compression type, fieldType=DIMENSION → falls back to getDefaultCompressionType (LZ4) + ForwardIndexConfig noneConfig = + new ForwardIndexConfig.Builder(FieldConfig.EncodingType.RAW) + .build(); + Assert.assertEquals(ForwardIndexType.resolveCompressionType(noneConfig, FieldSpec.FieldType.DIMENSION), + ChunkCompressionType.LZ4, + "No codec/compression with DIMENSION fieldType should fall back to LZ4 default"); + + // No codec, no chunk compression type, fieldType=METRIC → falls back to getDefaultCompressionType (PASS_THROUGH) + Assert.assertEquals(ForwardIndexType.resolveCompressionType(noneConfig, FieldSpec.FieldType.METRIC), + ChunkCompressionType.PASS_THROUGH, + "No codec/compression with METRIC fieldType should fall back to PASS_THROUGH default"); + + // No codec, no chunk compression type, null fieldType → returns null + Assert.assertNull(ForwardIndexType.resolveCompressionType(noneConfig, null), + "No codec/compression with null fieldType should return null"); + } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerCompressionStatsTest.java 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerCompressionStatsTest.java new file mode 100644 index 000000000000..f4ab5b2fd2c8 --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerCompressionStatsTest.java @@ -0,0 +1,390 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.segment.index.loader; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; +import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory; +import org.apache.pinot.segment.spi.ColumnMetadata; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; +import org.apache.pinot.segment.spi.store.SegmentDirectory; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.FieldConfig.CompressionCodec; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.ReadMode; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +/** + * Tests that compression stats metadata fields ({@code forwardIndex.compressionCodec} and + * {@code forwardIndex.uncompressedSizeBytes}) are correctly persisted during ForwardIndexHandler + * reload operations: + *

+ */ +public class ForwardIndexHandlerCompressionStatsTest { + private static final String RAW_TABLE_NAME = "compressionStatsReloadTest"; + private static final String SEGMENT_NAME = "compressionStatsReloadSegment"; + private static final File TEMP_DIR = + new File(FileUtils.getTempDirectory(), ForwardIndexHandlerCompressionStatsTest.class.getSimpleName()); + private static final File INDEX_DIR = new File(TEMP_DIR, SEGMENT_NAME); + + private static final String RAW_INT_COL = "rawIntCol"; + private static final String RAW_STRING_COL = "rawStringCol"; + private static final String DICT_INT_COL = "dictIntCol"; + private static final String DICT_STRING_COL = "dictStringCol"; + + // Use > 1024 rows to ensure multiple full chunks are flushed before close(). + // V4 writer normalizes numDocsPerChunk=1000 to 1024 (next power-of-2). + // With only 1000 rows, all data fits in one partial chunk (flushed only at close), + // so getUncompressedSize() called before close() would return 0. + private static final int NUM_ROWS = 5000; + private static final Random RANDOM = new Random(42); + + //@formatter:off + private static final Schema SCHEMA = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(RAW_INT_COL, DataType.INT) + .addSingleValueDimension(RAW_STRING_COL, DataType.STRING) + .addSingleValueDimension(DICT_INT_COL, DataType.INT) + .addSingleValueDimension(DICT_STRING_COL, DataType.STRING) + .build(); + //@formatter:on + + private static final List TEST_DATA; + + static { + TEST_DATA = new ArrayList<>(NUM_ROWS); + for (int i = 0; i < NUM_ROWS; i++) { + GenericRow row = new GenericRow(); + row.putValue(RAW_INT_COL, RANDOM.nextInt(100000)); + row.putValue(RAW_STRING_COL, "str_" + i + "_" + RANDOM.nextInt(10000)); + row.putValue(DICT_INT_COL, i % 100); + row.putValue(DICT_STRING_COL, "dict_" + (i % 50)); + TEST_DATA.add(row); + } + } + + private Set _noDictionaryColumns; + private Map _fieldConfigMap; + + @BeforeMethod + public void setUp() + 
throws Exception { + FileUtils.deleteQuietly(TEMP_DIR); + _noDictionaryColumns = new HashSet<>(List.of(RAW_INT_COL, RAW_STRING_COL)); + _fieldConfigMap = new HashMap<>(); + _fieldConfigMap.put(RAW_INT_COL, + new FieldConfig(RAW_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.SNAPPY, null)); + _fieldConfigMap.put(RAW_STRING_COL, + new FieldConfig(RAW_STRING_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.SNAPPY, null)); + buildSegment(); + } + + @AfterMethod + public void tearDown() { + FileUtils.deleteQuietly(TEMP_DIR); + } + + private void buildSegment() + throws Exception { + TableConfig tableConfig = createTableConfig(); + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, SCHEMA); + config.setOutDir(TEMP_DIR.getPath()); + config.setSegmentName(SEGMENT_NAME); + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + driver.init(config, new GenericRowRecordReader(TEST_DATA)); + driver.build(); + } + + private TableConfig createTableConfig() { + TableConfig config = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setNoDictionaryColumns(new ArrayList<>(_noDictionaryColumns)) + .setFieldConfigList(new ArrayList<>(_fieldConfigMap.values())) + .build(); + config.getIndexingConfig().setCompressionStatsEnabled(true); + return config; + } + + private IndexLoadingConfig createIndexLoadingConfig() { + return new IndexLoadingConfig(createTableConfig(), SCHEMA); + } + + @Test + public void testCompressionCodecPersistedOnCodecChange() + throws Exception { + // Change compression from SNAPPY to LZ4 for the raw int column + _fieldConfigMap.put(RAW_INT_COL, + new FieldConfig(RAW_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.LZ4, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new 
ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect compression change"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + // Validate that the new codec is persisted in metadata + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata colMeta = metadata.getColumnMetadataFor(RAW_INT_COL); + assertFalse(colMeta.hasDictionary()); + assertEquals(colMeta.getCompressionCodec(), "LZ4", + "Compression codec should be LZ4 after codec change"); + } + + @Test + public void testCompressionCodecPersistedOnMultipleCodecChanges() + throws Exception { + // First change: SNAPPY → LZ4 + _fieldConfigMap.put(RAW_INT_COL, + new FieldConfig(RAW_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.LZ4, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadata1 = new SegmentMetadataImpl(INDEX_DIR); + assertEquals(metadata1.getColumnMetadataFor(RAW_INT_COL).getCompressionCodec(), "LZ4"); + + // Second change: LZ4 → ZSTANDARD + _fieldConfigMap.put(RAW_INT_COL, + new FieldConfig(RAW_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.ZSTANDARD, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect LZ4 → ZSTANDARD change"); + handler.updateIndices(writer); + 
handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadata2 = new SegmentMetadataImpl(INDEX_DIR); + assertEquals(metadata2.getColumnMetadataFor(RAW_INT_COL).getCompressionCodec(), "ZSTANDARD", + "Compression codec should be ZSTANDARD after second codec change"); + } + + @Test + public void testDictToRawPersistsCodecAndUncompressedSize() + throws Exception { + // Convert DICT_INT_COL from dictionary to raw with LZ4 compression + _noDictionaryColumns.add(DICT_INT_COL); + _fieldConfigMap.put(DICT_INT_COL, + new FieldConfig(DICT_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.LZ4, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect dict-to-raw change"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + // Validate metadata + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata colMeta = metadata.getColumnMetadataFor(DICT_INT_COL); + assertFalse(colMeta.hasDictionary(), "Column should no longer have dictionary"); + assertEquals(colMeta.getCompressionCodec(), "LZ4", + "Compression codec should be LZ4 after dict-to-raw conversion"); + assertTrue(colMeta.getUncompressedForwardIndexSizeBytes() > 0, + "Uncompressed size should be > 0 after dict-to-raw conversion, got: " + + colMeta.getUncompressedForwardIndexSizeBytes()); + } + + @Test + public void testDictToRawStringColumnPersistsCodecAndUncompressedSize() + throws Exception { + // Convert DICT_STRING_COL from dictionary to raw with ZSTANDARD compression + _noDictionaryColumns.add(DICT_STRING_COL); + _fieldConfigMap.put(DICT_STRING_COL, + new FieldConfig(DICT_STRING_COL, FieldConfig.EncodingType.RAW, List.of(), 
CompressionCodec.ZSTANDARD, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect dict-to-raw change for string column"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata colMeta = metadata.getColumnMetadataFor(DICT_STRING_COL); + assertFalse(colMeta.hasDictionary(), "String column should no longer have dictionary"); + assertEquals(colMeta.getCompressionCodec(), "ZSTANDARD", + "Compression codec should be ZSTANDARD after dict-to-raw conversion"); + assertTrue(colMeta.getUncompressedForwardIndexSizeBytes() > 0, + "Uncompressed size should be > 0 for string column after dict-to-raw, got: " + + colMeta.getUncompressedForwardIndexSizeBytes()); + } + + @Test + public void testCompressionCodecNotPersistedForDictColumns() + throws Exception { + // Dictionary columns should NOT have compression codec in metadata + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + + ColumnMetadata dictIntMeta = metadata.getColumnMetadataFor(DICT_INT_COL); + assertTrue(dictIntMeta.hasDictionary()); + assertNull(dictIntMeta.getCompressionCodec(), + "Dictionary column should not have compression codec in metadata"); + assertEquals(dictIntMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Dictionary column should not have uncompressed forward index size"); + + ColumnMetadata dictStringMeta = metadata.getColumnMetadataFor(DICT_STRING_COL); + assertTrue(dictStringMeta.hasDictionary()); + assertNull(dictStringMeta.getCompressionCodec(), + "Dictionary string column should not have compression codec"); + 
assertEquals(dictStringMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Dictionary string column should not have uncompressed forward index size"); + } + + @Test + public void testCodecChangeForStringColumn() + throws Exception { + // Change compression from SNAPPY to ZSTANDARD for the raw string column + _fieldConfigMap.put(RAW_STRING_COL, + new FieldConfig(RAW_STRING_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.ZSTANDARD, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect SNAPPY → ZSTANDARD change"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata colMeta = metadata.getColumnMetadataFor(RAW_STRING_COL); + assertFalse(colMeta.hasDictionary()); + assertEquals(colMeta.getCompressionCodec(), "ZSTANDARD", + "String column compression codec should be ZSTANDARD after change"); + } + + @Test + public void testUnchangedColumnsRetainOriginalMetadata() + throws Exception { + // Change compression for RAW_INT_COL only, RAW_STRING_COL should be unaffected + _fieldConfigMap.put(RAW_INT_COL, + new FieldConfig(RAW_INT_COL, FieldConfig.EncodingType.RAW, List.of(), CompressionCodec.LZ4, null)); + + SegmentMetadataImpl metadataBefore = new SegmentMetadataImpl(INDEX_DIR); + int totalDocsBefore = metadataBefore.getColumnMetadataFor(RAW_STRING_COL).getTotalDocs(); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, 
createIndexLoadingConfig()); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadataAfter = new SegmentMetadataImpl(INDEX_DIR); + + // The unchanged column should still have the same metadata + ColumnMetadata unchangedMeta = metadataAfter.getColumnMetadataFor(RAW_STRING_COL); + assertFalse(unchangedMeta.hasDictionary()); + assertEquals(unchangedMeta.getTotalDocs(), totalDocsBefore, + "Unchanged column should retain original totalDocs"); + + // The changed column should have the new codec + ColumnMetadata changedMeta = metadataAfter.getColumnMetadataFor(RAW_INT_COL); + assertEquals(changedMeta.getCompressionCodec(), "LZ4", + "Changed column should have LZ4 codec"); + } + + @Test + public void testDefaultCodecPersistedOnDictToRaw() + throws Exception { + // Convert DICT_INT_COL from dictionary to raw WITHOUT specifying a compression codec. + // The handler should resolve and persist the default codec (LZ4 for DIMENSION columns). + _noDictionaryColumns.add(DICT_INT_COL); + // Use EncodingType.RAW but no explicit compression codec (null → default) + _fieldConfigMap.put(DICT_INT_COL, + new FieldConfig(DICT_INT_COL, FieldConfig.EncodingType.RAW, List.of(), null, null)); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect dict-to-raw change"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + SegmentMetadataImpl metadata = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata colMeta = metadata.getColumnMetadataFor(DICT_INT_COL); + assertFalse(colMeta.hasDictionary(), "Column should no longer have dictionary"); + assertEquals(colMeta.getCompressionCodec(), "LZ4", + "Default codec LZ4 should be 
persisted for DIMENSION column even when no explicit codec configured"); + assertTrue(colMeta.getUncompressedForwardIndexSizeBytes() > 0, + "Uncompressed size should be > 0 after dict-to-raw conversion"); + } + + @Test + public void testRawToDictClearsCompressionStats() + throws Exception { + // First, verify that the raw column has compression stats persisted + SegmentMetadataImpl metadataBefore = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata rawMeta = metadataBefore.getColumnMetadataFor(RAW_INT_COL); + assertFalse(rawMeta.hasDictionary(), "Column should start as raw"); + + // Convert RAW_INT_COL from raw to dictionary-encoded + _noDictionaryColumns.remove(RAW_INT_COL); + _fieldConfigMap.remove(RAW_INT_COL); + + try (SegmentDirectory segmentDirectory = new SegmentLocalFSDirectory(INDEX_DIR, ReadMode.mmap); + SegmentDirectory.Writer writer = segmentDirectory.createWriter()) { + ForwardIndexHandler handler = new ForwardIndexHandler(segmentDirectory, createIndexLoadingConfig()); + assertTrue(handler.needUpdateIndices(writer), "Handler should detect raw-to-dict change"); + handler.updateIndices(writer); + handler.postUpdateIndicesCleanup(writer); + } + + // Validate that compression stats metadata has been cleared + SegmentMetadataImpl metadataAfter = new SegmentMetadataImpl(INDEX_DIR); + ColumnMetadata dictMeta = metadataAfter.getColumnMetadataFor(RAW_INT_COL); + assertTrue(dictMeta.hasDictionary(), "Column should now have dictionary"); + assertNull(dictMeta.getCompressionCodec(), + "Compression codec should be cleared after raw-to-dict conversion"); + assertEquals(dictMeta.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Uncompressed forward index size should be cleared after raw-to-dict conversion"); + } +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java index f493db66dc5e..d4fd511afaa2 100644 --- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/ColumnMetadata.java @@ -20,6 +20,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import javax.annotation.Nullable; import org.apache.pinot.segment.spi.index.IndexType; import org.apache.pinot.spi.annotations.InterfaceAudience; import org.apache.pinot.spi.config.table.FieldConfig.EncodingType; @@ -64,4 +65,19 @@ public interface ColumnMetadata extends ColumnShape { default int getColumnMaxLength() { return getLengthOfLongestElement(); } + + /** + * Returns the uncompressed forward index size in bytes, or {@link #UNAVAILABLE} if not available. + */ + default long getUncompressedForwardIndexSizeBytes() { + return UNAVAILABLE; + } + + /** + * Returns the compression codec used for this column's forward index, or null if not available. + */ + @Nullable + default String getCompressionCodec() { + return null; + } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java index a6bb7da1103c..d990d0dc9084 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/V1Constants.java @@ -189,6 +189,8 @@ public static class Column { public static final String TOTAL_DOCS = "totalDocs"; public static final String COLUMN_PROPS_KEY_PREFIX = "column."; + public static final String FORWARD_INDEX_UNCOMPRESSED_SIZE = "forwardIndex.uncompressedSizeBytes"; + public static final String FORWARD_INDEX_COMPRESSION_CODEC = "forwardIndex.compressionCodec"; public static String getKeyFor(String column, String key) { return COLUMN_PROPS_KEY_PREFIX + column + "." 
+ key; diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java index 0eb838934bbd..123f784c2d43 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/IndexCreationContext.java @@ -105,6 +105,10 @@ default Object getSortedUniqueElementsArray() { return columnStatistics != null ? columnStatistics.getUniqueValuesSet() : null; } + default boolean isCompressionStatsEnabled() { + return false; + } + @SuppressWarnings("UnusedReturnValue") final class Builder { // Identity. Non-overridable shape accessors delegate to `_columnShape`. @@ -137,6 +141,7 @@ final class Builder { // Error handling. private boolean _continueOnError; + private boolean _compressionStatsEnabled; /// Segment-creation path. Shape values come from the freshly-collected [ColumnStatistics]; `hasDictionary` is /// supplied separately because it's a driver decision not carried on [ColumnStatistics]. `_tableNameWithType` @@ -260,6 +265,11 @@ public Builder withContinueOnError(boolean continueOnError) { return this; } + public Builder withCompressionStatsEnabled(boolean compressionStatsEnabled) { + _compressionStatsEnabled = compressionStatsEnabled; + return this; + } + public Common build() { return new Common(this); } @@ -295,6 +305,7 @@ final class Common implements IndexCreationContext { // Error handling. 
private final boolean _continueOnError; + private final boolean _compressionStatsEnabled; private Common(Builder builder) { _tableNameWithType = builder._tableNameWithType; @@ -315,6 +326,7 @@ private Common(Builder builder) { _mutableSegmentCompacted = builder._mutableSegmentCompacted; _mutableToImmutableDocIdMap = builder._mutableToImmutableDocIdMap; _continueOnError = builder._continueOnError; + _compressionStatsEnabled = builder._compressionStatsEnabled; } // Identity accessors. @@ -471,5 +483,10 @@ public int[] getMutableToImmutableDocIdMap() { public boolean isContinueOnError() { return _continueOnError; } + + @Override + public boolean isCompressionStatsEnabled() { + return _compressionStatsEnabled; + } } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java index 7c62a2e40a3f..0d5e3588eaf9 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/creator/SegmentGeneratorConfig.java @@ -128,6 +128,7 @@ public enum TimeColumnType { // Type of the instance (SERVER/MINION) that is trying to create the segment. private InstanceType _instanceType; + private boolean _compressionStatsEnabled; /** * Constructs the SegmentGeneratorConfig with table config and schema. 
@@ -172,6 +173,7 @@ public SegmentGeneratorConfig(TableConfig tableConfig, Schema schema) { setStarTreeIndexConfigs(indexingConfig.getStarTreeIndexConfigs()); setEnableDefaultStarTree(indexingConfig.isEnableDefaultStarTree()); _multiColumnTextIndexConfig = indexingConfig.getMultiColumnTextIndexConfig(); + _compressionStatsEnabled = indexingConfig.isCompressionStatsEnabled(); List fieldConfigs = tableConfig.getFieldConfigList(); if (fieldConfigs != null) { @@ -721,4 +723,12 @@ public InstanceType getInstanceType() { public void setInstanceType(InstanceType instanceType) { _instanceType = instanceType; } + + public boolean isCompressionStatsEnabled() { + return _compressionStatsEnabled; + } + + public void setCompressionStatsEnabled(boolean compressionStatsEnabled) { + _compressionStatsEnabled = compressionStatsEnabled; + } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/ForwardIndexCreator.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/ForwardIndexCreator.java index afb52dfeac4b..078674bcacc8 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/ForwardIndexCreator.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/creator/ForwardIndexCreator.java @@ -309,6 +309,20 @@ default void addBytesMV(byte[][] values, @Nullable int[] dictIds) { */ DataType getValueType(); + /** + * Returns the total uncompressed size of the forward index data written, or 0 if not tracked. + */ + default long getUncompressedSize() { + return 0; + } + + /** + * Controls whether the writer tracks uncompressed data size. When disabled, the writer skips + * the per-chunk size accumulation, providing zero overhead when compression stats are not needed. 
+ */ + default void setTrackUncompressedSize(boolean trackUncompressedSize) { + } + /** * DICTIONARY-ENCODED INDEX APIs */ diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java index 7fa13f93ab96..0c431e3e5d10 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java @@ -76,6 +76,8 @@ public class ColumnMetadataImpl implements ColumnMetadata { private final PartitionFunction _partitionFunction; private final Set _partitions; private final boolean _autoGenerated; + private final long _uncompressedForwardIndexSizeBytes; + private final String _compressionCodec; /// List of longs, each encodes: /// - 2 byte - numeric id of IndexType @@ -88,7 +90,8 @@ private ColumnMetadataImpl(FieldSpec fieldSpec, int totalDocs, int cardinality, EncodingType forwardIndexEncoding, boolean sorted, @Nullable Comparable minValue, @Nullable Comparable maxValue, boolean minMaxValueInvalid, int lengthOfShortestElement, int lengthOfLongestElement, boolean isAscii, int totalNumberOfEntries, int maxNumberOfMultiValues, int maxRowLengthInBytes, int bitsPerElement, - @Nullable PartitionFunction partitionFunction, @Nullable Set partitions, boolean autoGenerated) { + @Nullable PartitionFunction partitionFunction, @Nullable Set partitions, boolean autoGenerated, + long uncompressedForwardIndexSizeBytes, @Nullable String compressionCodec) { _fieldSpec = fieldSpec; _totalDocs = totalDocs; _cardinality = cardinality; @@ -108,6 +111,8 @@ private ColumnMetadataImpl(FieldSpec fieldSpec, int totalDocs, int cardinality, _partitionFunction = partitionFunction; _partitions = partitions; _autoGenerated = autoGenerated; + _uncompressedForwardIndexSizeBytes = uncompressedForwardIndexSizeBytes; + 
_compressionCodec = compressionCodec; } @Override @@ -253,6 +258,17 @@ private static long unpackIndexSize(long typeAndSize) { return typeAndSize & SIZE_MASK; } + @Override + public long getUncompressedForwardIndexSizeBytes() { + return _uncompressedForwardIndexSizeBytes; + } + + @Override + @Nullable + public String getCompressionCodec() { + return _compressionCodec; + } + @Override public boolean equals(Object o) { if (this == o) { @@ -374,6 +390,12 @@ public static ColumnMetadataImpl fromPropertiesConfiguration(PropertiesConfigura builder.setPartitions(extractPartitions(column, config)); } + // Read compression stats if available + builder.setUncompressedForwardIndexSizeBytes( + config.getLong(Column.getKeyFor(column, Column.FORWARD_INDEX_UNCOMPRESSED_SIZE), UNAVAILABLE)); + builder.setCompressionCodec( + config.getString(Column.getKeyFor(column, Column.FORWARD_INDEX_COMPRESSION_CODEC), null)); + return builder.build(); } @@ -507,6 +529,8 @@ public static class Builder { private PartitionFunction _partitionFunction; private Set _partitions; private boolean _autoGenerated; + private long _uncompressedForwardIndexSizeBytes = UNAVAILABLE; + private String _compressionCodec; public Builder setFieldSpec(FieldSpec fieldSpec) { _fieldSpec = fieldSpec; @@ -608,6 +632,16 @@ public Builder setAutoGenerated(boolean autoGenerated) { return this; } + public Builder setUncompressedForwardIndexSizeBytes(long uncompressedForwardIndexSizeBytes) { + _uncompressedForwardIndexSizeBytes = uncompressedForwardIndexSizeBytes; + return this; + } + + public Builder setCompressionCodec(@Nullable String compressionCodec) { + _compressionCodec = compressionCodec; + return this; + } + public ColumnMetadataImpl build() { // Canonicalize forward index encoding if (_forwardIndexEncoding == null) { @@ -647,7 +681,7 @@ public ColumnMetadataImpl build() { return new ColumnMetadataImpl(_fieldSpec, _totalDocs, _cardinality, _hasDictionary, _forwardIndexEncoding, _sorted, _minValue, _maxValue, 
_minMaxValueInvalid, _lengthOfShortestElement, _lengthOfLongestElement, _isAscii, _totalNumberOfEntries, _maxNumberOfMultiValues, _maxRowLengthInBytes, _bitsPerElement, - _partitionFunction, _partitions, _autoGenerated); + _partitionFunction, _partitions, _autoGenerated, _uncompressedForwardIndexSizeBytes, _compressionCodec); } } } diff --git a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImplTest.java b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImplTest.java index 0b994799933c..00fad17c8fb3 100644 --- a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImplTest.java +++ b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImplTest.java @@ -19,6 +19,7 @@ package org.apache.pinot.segment.spi.index.metadata; import org.apache.commons.configuration2.PropertiesConfiguration; +import org.apache.pinot.segment.spi.ColumnMetadata; import org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column; import org.apache.pinot.spi.config.table.FieldConfig.EncodingType; import org.apache.pinot.spi.data.FieldSpec.DataType; @@ -27,6 +28,7 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; @@ -106,6 +108,33 @@ public void honorsExplicitDictionaryEncoding() { assertEquals(metadata.getForwardIndexEncoding(), EncodingType.DICTIONARY); } + @Test + public void compressionStatsPersistedAndLoaded() { + PropertiesConfiguration config = baseConfig("rawCol"); + config.setProperty(Column.getKeyFor("rawCol", Column.HAS_DICTIONARY), false); + config.setProperty(Column.getKeyFor("rawCol", Column.FORWARD_INDEX_UNCOMPRESSED_SIZE), 4096L); + config.setProperty(Column.getKeyFor("rawCol", Column.FORWARD_INDEX_COMPRESSION_CODEC), "LZ4"); + + ColumnMetadataImpl metadata = 
ColumnMetadataImpl.fromPropertiesConfiguration(config, 1, "rawCol"); + + assertEquals(metadata.getUncompressedForwardIndexSizeBytes(), 4096L); + assertEquals(metadata.getCompressionCodec(), "LZ4"); + } + + @Test + public void compressionStatsDefaultToUnavailableOnOldSegment() { + PropertiesConfiguration config = baseConfig("col"); + config.setProperty(Column.getKeyFor("col", Column.HAS_DICTIONARY), false); + // Neither FORWARD_INDEX_UNCOMPRESSED_SIZE nor FORWARD_INDEX_COMPRESSION_CODEC set + + ColumnMetadataImpl metadata = ColumnMetadataImpl.fromPropertiesConfiguration(config, 1, "col"); + + assertEquals(metadata.getUncompressedForwardIndexSizeBytes(), ColumnMetadata.UNAVAILABLE, + "Old segments without compression stats should return UNAVAILABLE"); + assertNull(metadata.getCompressionCodec(), + "Old segments without compression codec should return null"); + } + private static PropertiesConfiguration baseConfig(String column) { PropertiesConfiguration config = new PropertiesConfiguration(); config.setProperty(Column.getKeyFor(column, Column.COLUMN_NAME), column); diff --git a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/SegmentMetadataUtilsTest.java b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/SegmentMetadataUtilsTest.java new file mode 100644 index 000000000000..a78482fc1675 --- /dev/null +++ b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/utils/SegmentMetadataUtilsTest.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.utils; + +import java.util.HashMap; +import java.util.Map; +import org.apache.commons.configuration2.PropertiesConfiguration; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Unit tests for the null-clearing behavior in {@link SegmentMetadataUtils#updateMetadataProperties}. + * + *

The method iterates over the supplied map and calls {@code clearProperty} for entries whose + * value is {@code null}, and {@code setProperty} for non-null values. This class verifies that + * branching logic directly against {@link PropertiesConfiguration}, which is the backing store + * used by {@code updateMetadataProperties}. + */ +public class SegmentMetadataUtilsTest { + + /** + * Replicates the null-dispatch loop from {@code updateMetadataProperties} and verifies that: + *

    + *
  • A non-null map entry value sets the property on the configuration.
  • + *
  • A null map entry value removes (clears) the property from the configuration.
  • + *
+ */ + @Test + public void testNullValueClearsProperty() { + PropertiesConfiguration config = new PropertiesConfiguration(); + + // Seed an existing property that will be cleared by a null map value. + config.setProperty("existingKey", "oldValue"); + + // Build the update map: one non-null entry (set) and one null entry (clear). + Map updates = new HashMap<>(); + updates.put("newKey", "newValue"); + updates.put("existingKey", null); + + // Apply the same branching logic as updateMetadataProperties. + for (Map.Entry entry : updates.entrySet()) { + if (entry.getValue() == null) { + config.clearProperty(entry.getKey()); + } else { + config.setProperty(entry.getKey(), entry.getValue()); + } + } + + // Non-null entry must be present with its value. + Assert.assertEquals(config.getString("newKey"), "newValue", + "Non-null map value should set the property"); + + // Null entry must have been removed from the configuration. + Assert.assertFalse(config.containsKey("existingKey"), + "Null map value should clear the property so it is no longer present"); + } + + /** + * Verifies that setting a property with a non-null value overwrites a previously stored value, + * which is the standard {@code setProperty} contract expected by the update path. 
+ */ + @Test + public void testNonNullValueOverwritesExistingProperty() { + PropertiesConfiguration config = new PropertiesConfiguration(); + config.setProperty("key", "original"); + + Map updates = new HashMap<>(); + updates.put("key", "updated"); + + for (Map.Entry entry : updates.entrySet()) { + if (entry.getValue() == null) { + config.clearProperty(entry.getKey()); + } else { + config.setProperty(entry.getKey(), entry.getValue()); + } + } + + Assert.assertEquals(config.getString("key"), "updated", + "Non-null map value should overwrite an existing property"); + } +} diff --git a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TableSizeResource.java b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TableSizeResource.java index 489d4a6867e6..2957b125b56e 100644 --- a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TableSizeResource.java +++ b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TableSizeResource.java @@ -28,7 +28,9 @@ import io.swagger.annotations.SecurityDefinition; import io.swagger.annotations.SwaggerDefinition; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import javax.inject.Inject; import javax.ws.rs.DefaultValue; import javax.ws.rs.GET; @@ -41,6 +43,8 @@ import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo; import org.apache.pinot.common.restlet.resources.ResourceUtils; import org.apache.pinot.common.restlet.resources.SegmentSizeInfo; import org.apache.pinot.common.restlet.resources.TableSizeInfo; @@ -49,8 +53,13 @@ import org.apache.pinot.core.data.manager.offline.ImmutableSegmentDataManager; import org.apache.pinot.segment.local.data.manager.SegmentDataManager; import org.apache.pinot.segment.local.data.manager.TableDataManager; +import 
org.apache.pinot.segment.spi.ColumnMetadata; import org.apache.pinot.segment.spi.ImmutableSegment; +import org.apache.pinot.segment.spi.SegmentMetadata; +import org.apache.pinot.segment.spi.index.IndexService; +import org.apache.pinot.segment.spi.index.StandardIndexes; import org.apache.pinot.server.starter.ServerInstance; +import org.apache.pinot.spi.config.table.TableConfig; import static org.apache.pinot.spi.utils.CommonConstants.DATABASE; import static org.apache.pinot.spi.utils.CommonConstants.SWAGGER_AUTHORIZATION_KEY; @@ -100,6 +109,12 @@ public String getTableSize( throw new WebApplicationException("Table: " + tableName + " is not found", Response.Status.NOT_FOUND); } + // Check feature flag — only collect per-column compression stats if enabled + Pair cachedPair = tableDataManager.getCachedTableConfigAndSchema(); + boolean compressionStatsEnabled = cachedPair != null && cachedPair.getLeft() != null + && cachedPair.getLeft().getIndexingConfig() != null + && cachedPair.getLeft().getIndexingConfig().isCompressionStatsEnabled(); + long tableSizeInBytes = 0L; List segmentSizeInfos = new ArrayList<>(); List segmentDataManagers = tableDataManager.acquireAllSegments(); @@ -109,7 +124,47 @@ public String getTableSize( ImmutableSegment immutableSegment = (ImmutableSegment) segmentDataManager.getSegment(); long segmentSizeBytes = immutableSegment.getSegmentSizeBytes(); if (detailed) { - segmentSizeInfos.add(new SegmentSizeInfo(immutableSegment.getSegmentName(), segmentSizeBytes)); + if (compressionStatsEnabled) { + long rawFwdIndexSize = 0; + long compressedFwdIndexSize = 0; + Map columnCompressionStats = null; + IndexService indexService = IndexService.getInstance(); + SegmentMetadata segmentMetadata = immutableSegment.getSegmentMetadata(); + for (ColumnMetadata colMeta : segmentMetadata.getColumnMetadataMap().values()) { + long uncompressed = colMeta.getUncompressedForwardIndexSizeBytes(); + if (uncompressed > 0) { + rawFwdIndexSize += uncompressed; + } + long 
fwdIndexSize = colMeta.getIndexSizeFor(StandardIndexes.forward()); + if (fwdIndexSize > 0) { + if (uncompressed > 0) { + compressedFwdIndexSize += fwdIndexSize; + } + // Skip old raw segments that lack a persisted compression codec + if (colMeta.getCompressionCodec() == null && !colMeta.hasDictionary()) { + continue; + } + double ratio = (uncompressed > 0) ? (double) uncompressed / fwdIndexSize : 0; + List indexNames = new ArrayList<>(); + for (int i = 0, n = colMeta.getNumIndexes(); i < n; i++) { + indexNames.add(indexService.get(colMeta.getIndexType(i)).getId()); + } + if (columnCompressionStats == null) { + columnCompressionStats = new HashMap<>(); + } + columnCompressionStats.put(colMeta.getColumnName(), + new ColumnCompressionStatsInfo(colMeta.getColumnName(), + uncompressed, fwdIndexSize, ratio, + colMeta.getCompressionCodec(), colMeta.hasDictionary(), + indexNames.isEmpty() ? null : indexNames)); + } + } + segmentSizeInfos.add(new SegmentSizeInfo(immutableSegment.getSegmentName(), segmentSizeBytes, + rawFwdIndexSize, compressedFwdIndexSize, immutableSegment.getTier(), columnCompressionStats)); + } else { + segmentSizeInfos.add(new SegmentSizeInfo(immutableSegment.getSegmentName(), segmentSizeBytes, + -1, -1, immutableSegment.getTier())); + } } tableSizeInBytes += segmentSizeBytes; } diff --git a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java index dd3409d77032..835b1df7629c 100644 --- a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java +++ b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java @@ -68,9 +68,12 @@ import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; import org.apache.pinot.common.metadata.segment.SegmentZKMetadataUtils; import org.apache.pinot.common.response.server.TableIndexMetadataResponse; +import 
org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo;
+import org.apache.pinot.common.restlet.resources.CompressionStatsSummary;
 import org.apache.pinot.common.restlet.resources.ResourceUtils;
 import org.apache.pinot.common.restlet.resources.SegmentConsumerInfo;
 import org.apache.pinot.common.restlet.resources.ServerSegmentsReloadCheckResponse;
+import org.apache.pinot.common.restlet.resources.StorageBreakdownInfo;
 import org.apache.pinot.common.restlet.resources.TableLLCSegmentUploadResponse;
 import org.apache.pinot.common.restlet.resources.TableMetadataInfo;
 import org.apache.pinot.common.restlet.resources.TableSegmentValidationInfo;
@@ -104,10 +107,12 @@
 import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.index.IndexService;
+import org.apache.pinot.segment.spi.index.StandardIndexes;
 import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
 import org.apache.pinot.server.access.AccessControlFactory;
 import org.apache.pinot.server.api.AdminApiApplication;
 import org.apache.pinot.server.starter.ServerInstance;
+import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.config.table.TableType;
 import org.apache.pinot.spi.data.FieldSpec;
 import org.apache.pinot.spi.stream.ConsumerPartitionState;
@@ -229,6 +234,21 @@ public String getSegmentMetadata(
     Map<String, Double> columnCardinalityMap = new HashMap<>();
     Map<String, Double> maxNumMultiValuesMap = new HashMap<>();
     Map<String, Map<String, Double>> columnIndexSizesMap = new HashMap<>();
+    // Per-column byte accumulators: [0]=raw uncompressed, [1]=raw-segment fwd index, [2]=dict-segment fwd index
+    Map<String, long[]> columnCompressionAccum = new HashMap<>();
+    Map<String, String> columnCodecMap = new HashMap<>();
+    // Track hasDictionary and index names per column for the compression stats DTO
+    Map<String, Boolean> columnHasDictMap = new HashMap<>();
+    Map<String, Set<String>> columnIndexNamesMap = new HashMap<>();
+    Map<String, long[]> tierAccum = new HashMap<>(); // [count, size]
+    int segmentsWithStats = 0;
+
+    // Check feature flag — only collect compression stats if enabled
+    Pair<TableConfig, ?> cachedPair = tableDataManager.getCachedTableConfigAndSchema();
+    boolean compressionStatsEnabled = cachedPair != null && cachedPair.getLeft() != null
+        && cachedPair.getLeft().getIndexingConfig() != null
+        && cachedPair.getLeft().getIndexingConfig().isCompressionStatsEnabled();
+
     try {
       for (SegmentDataManager segmentDataManager : segmentDataManagers) {
         if (segmentDataManager instanceof ImmutableSegmentDataManager) {
@@ -245,6 +265,8 @@ public String getSegmentMetadata(
           } else {
             columnSet.retainAll(segmentMetadata.getAllColumns());
           }
+          boolean segmentHasCompressionStats = false;
+          IndexService indexService = IndexService.getInstance();
           for (String column : columnSet) {
             ColumnMetadata columnMetadata = segmentMetadata.getColumnMetadataMap().get(column);
             int columnLength = columnMetadata.getLengthOfLongestElement();
@@ -270,7 +292,7 @@ public String getSegmentMetadata(
               maxNumMultiValuesMap.merge(column, (double) maxNumMultiValues, Double::sum);
             }
 
-            IndexService indexService = IndexService.getInstance();
+            List<String> indexNames = new ArrayList<>();
             for (int i = 0, n = columnMetadata.getNumIndexes(); i < n; i++) {
               String indexName = indexService.get(columnMetadata.getIndexType(i)).getId();
               long value = columnMetadata.getIndexSize(i);
@@ -279,8 +301,49 @@ public String getSegmentMetadata(
               Double indexSize = columnIndexSizes.getOrDefault(indexName, 0d) + value;
               columnIndexSizes.put(indexName, indexSize);
               columnIndexSizesMap.put(column, columnIndexSizes);
+
+              indexNames.add(indexName);
+            }
+
+            // Collect per-column compression stats when feature flag is enabled
+            if (compressionStatsEnabled) {
+              String codec = columnMetadata.getCompressionCodec();
+              long uncompressedSize = columnMetadata.getUncompressedForwardIndexSizeBytes();
+              long fwdIndexSize = columnMetadata.getIndexSizeFor(StandardIndexes.forward());
+              if (codec != null && uncompressedSize > 0) {
+                // Raw column with stats: include in both numerator and denominator
+                long[] accum = columnCompressionAccum.computeIfAbsent(column, k -> new long[3]);
+                accum[0] += uncompressedSize;
+                accum[1] += (fwdIndexSize > 0 ? fwdIndexSize : 0);
+                columnCodecMap.merge(column, codec,
+                    (existing, incoming) -> existing.equals(incoming) ? existing : "MIXED");
+                // Raw columns never have a dictionary; once any segment is raw, mark the column as no-dict
+                columnHasDictMap.merge(column, false, (existing, incoming) -> false);
+                columnIndexNamesMap.computeIfAbsent(column, k -> new HashSet<>()).addAll(indexNames);
+                segmentHasCompressionStats = true;
+              } else if (columnMetadata.hasDictionary() && fwdIndexSize > 0) {
+                // Dictionary-encoded segment: keep its forward index size in its own slot so raw ratios stay unskewed
+                long[] accum = columnCompressionAccum.computeIfAbsent(column, k -> new long[3]);
+                accum[2] += fwdIndexSize;
+                // Only set hasDictionary=true if not already seen as raw (raw wins)
+                columnHasDictMap.merge(column, true, (existing, incoming) -> existing && incoming);
+                columnIndexNamesMap.computeIfAbsent(column, k -> new HashSet<>()).addAll(indexNames);
+              }
+              // Old segments without stats (codec==null, uncompressed==INDEX_NOT_FOUND) are
+              // excluded entirely — not added to any accumulation maps
             }
           }
+
+          if (segmentHasCompressionStats) {
+            segmentsWithStats++;
+          }
+
+          // Accumulate storage breakdown by tier (always-on, not gated by compression flag)
+          String tier = immutableSegment.getTier();
+          String tierKey = tier != null ? tier : "default";
+          long[] tierVals = tierAccum.computeIfAbsent(tierKey, k -> new long[2]);
+          tierVals[0]++;
+          tierVals[1] += segmentSizeBytes;
         }
       }
     } finally {
@@ -301,10 +364,59 @@ public String getSegmentMetadata(
         (partition, primaryKeyCount) -> partitionToServerPrimaryKeyCountMap.put(partition,
             Map.of(instanceDataManager.getInstanceId(), primaryKeyCount)));
 
+    // Build per-column compression stats list if flag is enabled and any columns have stats
+    List<ColumnCompressionStatsInfo> columnCompressionStats = null;
+    CompressionStatsSummary compressionStatsSummary = null;
+    if (compressionStatsEnabled && !columnCompressionAccum.isEmpty()) {
+      columnCompressionStats = new ArrayList<>();
+      long totalRaw = 0;
+      long totalCompressed = 0;
+      int totalSegmentCount = segmentDataManagers.size();
+      for (Map.Entry<String, long[]> entry : columnCompressionAccum.entrySet()) {
+        String col = entry.getKey();
+        long[] accum = entry.getValue();
+        boolean hasDictionary = Boolean.TRUE.equals(columnHasDictMap.get(col));
+        // Dict columns have no raw forward index; report -1 to distinguish from 0-size raw columns
+        long uncompressed = (hasDictionary && accum[0] == 0) ? -1 : accum[0];
+        // Dict-only columns report their dictionary-encoded forward index size; raw and mixed columns report
+        // only the bytes of raw segments so that the ratio divides sizes measured over the same segments
+        long compressed = hasDictionary ? accum[2] : accum[1];
+        double ratio = (uncompressed > 0 && compressed > 0) ? (double) uncompressed / compressed : 0;
+        Set<String> idxNames = columnIndexNamesMap.get(col);
+        List<String> indexes = idxNames != null ? new ArrayList<>(idxNames) : null;
+        columnCompressionStats.add(new ColumnCompressionStatsInfo(
+            col, uncompressed, compressed, ratio, columnCodecMap.get(col), hasDictionary, indexes));
+        // Only include raw columns in the table-level summary
+        if (!hasDictionary && uncompressed > 0) {
+          totalRaw += uncompressed;
+          totalCompressed += compressed;
+        }
+      }
+      columnCompressionStats.sort((a, b) -> a.getColumn().compareTo(b.getColumn()));
+      // Build table-level compression summary (null if no raw columns have stats)
+      if (totalRaw > 0 || totalCompressed > 0) {
+        double summaryRatio = totalCompressed > 0 ? (double) totalRaw / totalCompressed : 0;
+        boolean isPartialCoverage = segmentsWithStats < totalSegmentCount;
+        compressionStatsSummary = new CompressionStatsSummary(totalRaw, totalCompressed, summaryRatio,
+            segmentsWithStats, totalSegmentCount, isPartialCoverage);
+      }
+    }
+
+    // Build storage breakdown from tier data accumulated inside the try block
+    StorageBreakdownInfo storageBreakdownInfo = null;
+    if (!tierAccum.isEmpty()) {
+      Map<String, StorageBreakdownInfo.TierInfo> tiers = new HashMap<>();
+      for (Map.Entry<String, long[]> entry : tierAccum.entrySet()) {
+        tiers.put(entry.getKey(), new StorageBreakdownInfo.TierInfo((int) entry.getValue()[0], entry.getValue()[1]));
+      }
+      storageBreakdownInfo = new StorageBreakdownInfo(tiers);
+    }
+
     TableMetadataInfo tableMetadataInfo =
         new TableMetadataInfo(tableDataManager.getTableName(), totalSegmentSizeBytes, segmentDataManagers.size(),
             totalNumRows, columnLengthMap, columnCardinalityMap, maxNumMultiValuesMap, columnIndexSizesMap,
-            partitionToServerPrimaryKeyCountMap);
+            partitionToServerPrimaryKeyCountMap, columnCompressionStats, compressionStatsSummary,
+            storageBreakdownInfo);
     return ResourceUtils.convertToJsonString(tableMetadataInfo);
   }
 
diff --git a/pinot-server/src/test/java/org/apache/pinot/server/api/TableSizeResourceTest.java b/pinot-server/src/test/java/org/apache/pinot/server/api/TableSizeResourceTest.java
index ed2b7c727349..0bd642716941 100644
--- a/pinot-server/src/test/java/org/apache/pinot/server/api/TableSizeResourceTest.java
+++ b/pinot-server/src/test/java/org/apache/pinot/server/api/TableSizeResourceTest.java
@@ -19,6 +19,7 @@
 package org.apache.pinot.server.api;
 
 import javax.ws.rs.core.Response;
+import org.apache.pinot.common.restlet.resources.SegmentSizeInfo;
 import org.apache.pinot.common.restlet.resources.TableSizeInfo;
 import org.apache.pinot.segment.spi.ImmutableSegment;
 import org.testng.Assert;
@@ -83,4 +84,19 @@ private void verifyTableSizeOldImpl(String expectedTableName, ImmutableSegment s
Assert.assertEquals(tableSizeInfo.getSegments().get(0).getDiskSizeInBytes(), segment.getSegmentSizeBytes());
     Assert.assertEquals(tableSizeInfo.getDiskSizeInBytes(), segment.getSegmentSizeBytes());
   }
+
+  @Test
+  public void testTableSizeDetailedCompressionStatsDisabled() {
+    String path = "/tables/" + OFFLINE_TABLE_NAME + "/size";
+    TableSizeInfo tableSizeInfo = _webTarget.path(path).request().get(TableSizeInfo.class);
+    Assert.assertNotNull(tableSizeInfo);
+    Assert.assertTrue(tableSizeInfo.getDiskSizeInBytes() > 0);
+    Assert.assertNotNull(tableSizeInfo.getSegments());
+    // Guard against a vacuous pass: the per-segment assertions below only run when segments are listed
+    Assert.assertFalse(tableSizeInfo.getSegments().isEmpty(), "expected at least one segment in the size response");
+    for (SegmentSizeInfo segmentSizeInfo : tableSizeInfo.getSegments()) {
+      Assert.assertNotNull(segmentSizeInfo.getSegmentName());
+      Assert.assertNull(segmentSizeInfo.getColumnCompressionStats());
+    }
+  }
 }
diff --git a/pinot-server/src/test/java/org/apache/pinot/server/api/TablesResourceTest.java b/pinot-server/src/test/java/org/apache/pinot/server/api/TablesResourceTest.java
index f43c2f52875f..ca3f99fcaec6 100644
--- a/pinot-server/src/test/java/org/apache/pinot/server/api/TablesResourceTest.java
+++ b/pinot-server/src/test/java/org/apache/pinot/server/api/TablesResourceTest.java
@@ -22,6 +22,8 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -30,6 +32,7 @@
 import org.apache.commons.configuration2.ex.ConfigurationException;
 import org.apache.commons.io.FileUtils;
 import org.apache.pinot.common.response.server.TableIndexMetadataResponse;
+import org.apache.pinot.common.restlet.resources.ColumnCompressionStatsInfo;
 import org.apache.pinot.common.restlet.resources.TableMetadataInfo;
 import org.apache.pinot.common.restlet.resources.TableSegments;
 import org.apache.pinot.common.restlet.resources.TablesList;
@@ -38,19 +41,28 @@
 import org.apache.pinot.common.utils.RoaringBitmapUtils;
 import org.apache.pinot.common.utils.TarCompressionUtils;
import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentImpl;
+import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
+import org.apache.pinot.segment.local.segment.creator.SegmentTestUtils;
+import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
 import org.apache.pinot.segment.local.upsert.PartitionUpsertMetadataManager;
 import org.apache.pinot.segment.spi.ImmutableSegment;
 import org.apache.pinot.segment.spi.IndexSegment;
 import org.apache.pinot.segment.spi.SegmentMetadata;
 import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
 import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.index.IndexService;
 import org.apache.pinot.segment.spi.index.StandardIndexes;
 import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
 import org.apache.pinot.segment.spi.index.mutable.ThreadSafeMutableRoaringBitmap;
 import org.apache.pinot.segment.spi.store.SegmentDirectoryPaths;
+import org.apache.pinot.spi.config.table.IndexingConfig;
+import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.config.table.TableType;
+import org.apache.pinot.spi.data.Schema;
 import org.apache.pinot.spi.utils.JsonUtils;
+import org.apache.pinot.spi.utils.ReadMode;
+import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
 import org.apache.pinot.spi.utils.builder.TableNameBuilder;
 import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
 import org.testng.Assert;
@@ -761,6 +773,110 @@ public void testOfflineTableSegmentMetadata()
     Assert.assertEquals(response.getStatus(), Response.Status.NOT_FOUND.getStatusCode());
   }
 
+  @Test
+  public void testGetTableMetadataCompressionStatsDisabled()
+      throws Exception {
+    String tableMetadataPath = "/tables/" + OFFLINE_TABLE_NAME + "/metadata";
+
+    JsonNode jsonResponse = JsonUtils.stringToJsonNode(_webTarget.path(tableMetadataPath)
+        .queryParam("columns", "column1")
+        .queryParam("columns", "column2")
+        .request()
+        .get(String.class));
+    TableMetadataInfo metadataInfo = JsonUtils.jsonNodeToObject(jsonResponse, TableMetadataInfo.class);
+
+    Assert.assertNotNull(metadataInfo);
+    Assert.assertNull(metadataInfo.getColumnCompressionStats(),
+        "columnCompressionStats should be null when compressionStatsEnabled is false");
+    Assert.assertNull(metadataInfo.getCompressionStats(),
+        "compressionStats should be null when compressionStatsEnabled is false");
+  }
+
+  @Test
+  public void testGetTableMetadataHasDictionaryRawWinsOverDict()
+      throws Exception {
+    // Regression test: when a table has mixed-age segments (some dict, some raw for the same column),
+    // the reported hasDictionary must be false once any segment is raw (raw wins).
+    String mixedTableName = "mixedDictRaw_OFFLINE";
+    List<ImmutableSegment> mixedSegments = new ArrayList<>();
+
+    // Everything that loads segments or registers the table runs inside the try block so that the
+    // finally clause releases whatever was created, even when a later setup step throws.
+    try {
+      // Segment 1: dict-encoded (default config — no noDictionaryColumns)
+      File tableDataDir = new File(TEMP_DIR, mixedTableName);
+      SegmentGeneratorConfig dictConfig =
+          SegmentTestUtils.getSegmentGeneratorConfigWithoutTimeColumn(_avroFile, tableDataDir, mixedTableName);
+      dictConfig.setSegmentNamePostfix("dict");
+      SegmentIndexCreationDriverImpl dictDriver = new SegmentIndexCreationDriverImpl();
+      dictDriver.init(dictConfig);
+      dictDriver.build();
+      ImmutableSegment dictSegment = ImmutableSegmentLoader.load(
+          new File(tableDataDir, dictDriver.getSegmentName()),
+          ReadMode.mmap);
+      mixedSegments.add(dictSegment);
+
+      // Segment 2: raw-encoded for column1 and column2
+      IndexingConfig rawIndexingConfig = new IndexingConfig();
+      rawIndexingConfig.setNoDictionaryColumns(Arrays.asList("column1", "column2"));
+      rawIndexingConfig.setCompressionStatsEnabled(true);
+      TableConfig rawTableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(mixedTableName).build();
+      rawTableConfig.setIndexingConfig(rawIndexingConfig);
+      Schema rawSchema = SegmentTestUtils.extractSchemaFromAvroWithoutTime(_avroFile);
+      SegmentGeneratorConfig rawConfig =
+          SegmentTestUtils.getSegmentGeneratorConfigWithSchema(_avroFile, tableDataDir, mixedTableName,
+              rawTableConfig, rawSchema);
+      rawConfig.setSegmentNamePostfix("raw");
+      SegmentIndexCreationDriverImpl rawDriver = new SegmentIndexCreationDriverImpl();
+      rawDriver.init(rawConfig);
+      rawDriver.build();
+      ImmutableSegment rawSegment = ImmutableSegmentLoader.load(
+          new File(tableDataDir, rawDriver.getSegmentName()),
+          ReadMode.mmap);
+      mixedSegments.add(rawSegment);
+
+      // Register the table with compressionStatsEnabled=true
+      addTable(mixedTableName);
+      IndexingConfig tableIndexingConfig = new IndexingConfig();
+      tableIndexingConfig.setCompressionStatsEnabled(true);
+      TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(mixedTableName).build();
+      tableConfig.setIndexingConfig(tableIndexingConfig);
+      _tableDataManagerMap.get(mixedTableName).updateCachedTableConfigAndSchema(tableConfig, null);
+      for (ImmutableSegment seg : mixedSegments) {
+        _tableDataManagerMap.get(mixedTableName).addSegment(seg);
+      }
+
+      JsonNode jsonResponse = JsonUtils.stringToJsonNode(_webTarget
+          .path("/tables/" + mixedTableName + "/metadata")
+          .queryParam("columns", "column1")
+          .queryParam("columns", "column2")
+          .request()
+          .get(String.class));
+      TableMetadataInfo metadataInfo = JsonUtils.jsonNodeToObject(jsonResponse, TableMetadataInfo.class);
+
+      Assert.assertNotNull(metadataInfo);
+      List<ColumnCompressionStatsInfo> ccs = metadataInfo.getColumnCompressionStats();
+      Assert.assertNotNull(ccs, "columnCompressionStats should be present when flag=ON and segments have stats");
+      List<String> checkedColumns = new ArrayList<>();
+      for (ColumnCompressionStatsInfo colStats : ccs) {
+        if ("column1".equals(colStats.getColumn()) || "column2".equals(colStats.getColumn())) {
+          Assert.assertFalse(colStats.hasDictionary(),
+              "column " + colStats.getColumn() + " should report hasDictionary=false (raw wins over dict)");
+          checkedColumns.add(colStats.getColumn());
+        }
+      }
+      // Without this check the loop above passes vacuously when neither column is reported
+      Assert.assertTrue(checkedColumns.containsAll(Arrays.asList("column1", "column2")),
+          "expected compression stats for column1 and column2 but only saw " + checkedColumns);
+    } finally {
+      for (ImmutableSegment seg : mixedSegments) {
+        seg.offload();
+        seg.destroy();
+      }
+      _tableDataManagerMap.remove(mixedTableName);
+    }
+  }
+
   // Override to use data with delete records
   @Override
   protected String getAvroFileName() {
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
index 6a927ded9551..5927f057d9d0 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java
@@ -112,6 +112,8 @@ public class IndexingConfig extends BaseJsonConfig {
 
   private MultiColumnTextIndexConfig _multiColumnTextIndexConfig;
 
+  private boolean _compressionStatsEnabled;
+
   @Nullable
   public List<String> getInvertedIndexColumns() {
     return _invertedIndexColumns;
@@ -420,6 +422,14 @@ public void setMultiColumnTextIndexConfig(MultiColumnTextIndexConfig multiColumn
     _multiColumnTextIndexConfig = multiColumnTextIndexConfig;
   }
 
+  public boolean isCompressionStatsEnabled() {
+    return _compressionStatsEnabled;
+  }
+
+  public void setCompressionStatsEnabled(boolean compressionStatsEnabled) {
+    _compressionStatsEnabled = compressionStatsEnabled;
+  }
+
   /**
    * Returns all columns referenced in the indexing config. This is useful to construct FieldIndexConfigs in
    * IndexLoadingConfig when schema is not provided. Only including the columns referenced by indexes supported in
diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/IndexingConfigCompressionFlagTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/IndexingConfigCompressionFlagTest.java
new file mode 100644
index 000000000000..119f4078222a
--- /dev/null
+++ b/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/IndexingConfigCompressionFlagTest.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.spi.config.table;
+
+import org.apache.pinot.spi.utils.JsonUtils;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+
+/**
+ * Unit tests for the {@code compressionStatsEnabled} flag exposed by {@link IndexingConfig}.
+ * Verifies the default value, the setter/getter pair, the JSON round-trip, and parsing of JSON that omits the flag.
+ */
+public class IndexingConfigCompressionFlagTest {
+
+  @Test
+  public void testDefaultValueIsFalse() {
+    boolean enabledByDefault = new IndexingConfig().isCompressionStatsEnabled();
+    assertFalse(enabledByDefault,
+        "Default value of compressionStatsEnabled should be false");
+  }
+
+  @Test
+  public void testSetAndGet() {
+    IndexingConfig indexingConfig = new IndexingConfig();
+    indexingConfig.setCompressionStatsEnabled(true);
+    assertTrue(indexingConfig.isCompressionStatsEnabled(),
+        "compressionStatsEnabled should be true after setting it to true");
+  }
+
+  @Test
+  public void testJsonSerializationRoundTrip()
+      throws Exception {
+    IndexingConfig source = new IndexingConfig();
+    source.setCompressionStatsEnabled(true);
+
+    String serialized = JsonUtils.objectToString(source);
+    IndexingConfig restored = JsonUtils.stringToObject(serialized, IndexingConfig.class);
+
+    assertTrue(restored.isCompressionStatsEnabled(),
+        "compressionStatsEnabled should remain true after JSON round-trip serialization");
+  }
+
+  @Test
+  public void testBackwardCompatDeserialization()
+      throws Exception {
+    // Config JSON written before the flag existed: the compressionStatsEnabled key is absent
+    String legacyJson = "{\"loadMode\":\"MMAP\"}";
+
+    IndexingConfig parsed = JsonUtils.stringToObject(legacyJson, IndexingConfig.class);
+
+    assertFalse(parsed.isCompressionStatsEnabled(),
+        "compressionStatsEnabled should default to false when missing from JSON (backward compatibility)");
+  }
+}