Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;

import com.google.common.collect.BoundType;
import com.google.common.collect.Range;
Expand All @@ -44,9 +45,12 @@
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.sql.type.SqlTypeUtil;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.RangeSets;
import org.apache.calcite.util.Sarg;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
Expand All @@ -66,7 +70,7 @@ public class FilterSelectivityEstimator extends RexVisitorImpl<Double> {
protected static final Logger LOG = LoggerFactory.getLogger(FilterSelectivityEstimator.class);

private final RelNode childRel;
private final double childCardinality;
private final double childCardinality;
private final RelMetadataQuery mq;
private final RexBuilder rexBuilder;

Expand Down Expand Up @@ -113,8 +117,6 @@ public Double visitCall(RexCall call) {
selectivity = computeConjunctionSelectivity(call);
break;
}
case SEARCH:
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.FALSE).transform().accept(this);
case OR: {
selectivity = computeDisjunctionSelectivity(call);
break;
Expand Down Expand Up @@ -159,14 +161,18 @@ public Double visitCall(RexCall call) {
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case GREATER_THAN: {
selectivity = computeRangePredicateSelectivity(call, call.getKind());
selectivity = computeComparisonPredicateSelectivity(call, call.getKind());
break;
}

case BETWEEN:
selectivity = computeBetweenPredicateSelectivity(call);
break;

case SEARCH:
selectivity = computeSearchSelectivity(call);
break;

Comment thread
rubenada marked this conversation as resolved.
Outdated
default:
if (HiveIn.INSTANCE.equals(call.op)) {
// TODO: 1) check for duplicates 2) We assume in clause values to be
Expand Down Expand Up @@ -226,7 +232,7 @@ public Double visitCall(RexCall call) {
* @return true if the expression is a removable cast, false otherwise
*/
private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) {
if(SqlKind.CAST != exp.getKind()) {
if (SqlKind.CAST != exp.getKind()) {
Comment thread
rubenada marked this conversation as resolved.
Outdated
return false;
}
RexCall cast = (RexCall) exp;
Expand Down Expand Up @@ -405,8 +411,10 @@ private static Range<Float> makeRange(float lower, float upper, BoundType upperT
return lower > upper ? Range.closedOpen(0f, 0f) : Range.range(lower, BoundType.CLOSED, upper, upperType);
}

private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
double defaultSelectivity = ((double) 1 / (double) 3);
private static final Double DEFAULT_COMPARISON_SELECTIVITY = ((double) 1 / (double) 3);
Comment thread
rubenada marked this conversation as resolved.
Outdated

private double computeComparisonPredicateSelectivity(RexCall call, SqlKind op) {
double defaultSelectivity = DEFAULT_COMPARISON_SELECTIVITY;
if (!(childRel instanceof HiveTableScan)) {
return defaultSelectivity;
}
Expand Down Expand Up @@ -440,34 +448,56 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
boundaryValues[boundaryIdx] = value;
inclusive[boundaryIdx] = openBound ? BoundType.OPEN : BoundType.CLOSED;
Range<Float> boundaries = Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1]);

// extract the column index from the other operator
final HiveTableScan scan = (HiveTableScan) childRel;
int inputRefOpIndex = 1 - literalOpIdx;
RexNode node = operands.get(inputRefOpIndex);
if (isRemovableCast(node, scan)) {
Range<Float> typeRange = getRangeOfType(node.getType());
boundaries = adjustRangeToType(boundaries, node.getType(), typeRange);
return computeRangePredicateSelectivity(() -> defaultSelectivity, node, boundaries);
}

private double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand,
Range<Float> boundaries) {
return computeRangePredicateSelectivity(defaultSelectivity, operand, boundaries, false);
}

/**
* Computes the selectivity of an operand in a certain range trying to leverage the histogram information.
* Returns the default selectivity if the histogram is not available.
*/
private double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand,
Range<Float> boundaries, boolean inverseBool /* true only for NOT_BETWEEN */ ) {
if (!(childRel instanceof HiveTableScan)) {
return defaultSelectivity.get();
}

node = RexUtil.removeCast(node);
final HiveTableScan scan = (HiveTableScan) childRel;
Range<Float> typeRange = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;
if (isRemovableCast(operand, scan)) {
typeRange = getRangeOfType(operand.getType());
boundaries = adjustRangeToType(boundaries, operand.getType(), typeRange);
operand = RexUtil.removeCast(operand);
}

int inputRefIndex = -1;
if (node.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) node).getIndex();
if (operand.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) operand).getIndex();
}

if (inputRefIndex < 0) {
return defaultSelectivity;
return defaultSelectivity.get();
}

final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
return defaultSelectivity;
return defaultSelectivity.get();
}

final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
double rawSelectivity = rangedSelectivity(kll, boundaries);
if (inverseBool) {
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
double typeRangeSelectivity = rangedSelectivity(kll, typeRange);
rawSelectivity = typeRangeSelectivity - rawSelectivity;
}
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
}

Expand Down Expand Up @@ -511,7 +541,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
Optional<Float> rightLiteral = extractLiteral(operands.get(3));

if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) {
final HiveTableScan scan = (HiveTableScan) childRel;
float leftValue = leftLiteral.get();
float rightValue = rightLiteral.get();

Expand All @@ -522,36 +551,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
}

Range<Float> rangeBoundaries = makeRange(leftValue, rightValue, BoundType.CLOSED);
Range<Float> typeBoundaries = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;

RexNode expr = operands.get(1); // expr to be checked by the BETWEEN
if (isRemovableCast(expr, scan)) {
typeBoundaries = getRangeOfType(expr.getType());
rangeBoundaries = adjustRangeToType(rangeBoundaries, expr.getType(), typeBoundaries);
expr = RexUtil.removeCast(expr);
}

int inputRefIndex = -1;
if (expr.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) expr).getIndex();
}

if (inputRefIndex < 0) {
return computeFunctionSelectivity(call);
}

final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
double rawSelectivity = rangedSelectivity(kll, rangeBoundaries);
if (inverseBool) {
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries);
rawSelectivity = typeRangeSelectivity - rawSelectivity;
}
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
}
return computeRangePredicateSelectivity(() -> computeFunctionSelectivity(call), expr, rangeBoundaries,
inverseBool);
}
return computeFunctionSelectivity(call);
}
Expand Down Expand Up @@ -603,6 +605,151 @@ private Optional<Float> extractLiteral(SqlTypeName typeName, Object boundValueOb
return Optional.of(value);
}

private double computeSearchSelectivity(RexCall search) {
return new SearchSelectivityHelper<>(search).compute();
}

/**
* Similar to {@link SearchTransformer}, but computing the selectivity of the expression.
*/
private class SearchSelectivityHelper<C extends Comparable<C>> {
private final RexNode ref;
private final Sarg<C> sarg;
private final RelDataType operandType;

private SearchSelectivityHelper(RexCall search) {
ref = search.getOperands().get(0);
RexLiteral literal = (RexLiteral) search.operands.get(1);
sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg");
operandType = literal.getType();
}

private RexNode makeLiteral(C value) {
return rexBuilder.makeLiteral(value, operandType, true, true);
}

private double compute() {
final List<Double> selectivityList = new ArrayList<>();
final List<RexNode> inLiterals = new ArrayList<>();

if (sarg.nullAs == RexUnknownAs.TRUE) {
selectivityList.add(
rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this));
}

RangeSets.forEach(sarg.rangeSet, new RangeSets.Consumer<C>() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the code could be simplified with for (Range<C> range : sarg.rangeSet.asRanges()) { ... }? It might be possible to treat a range as-is, without differentiating all the different kinds of ranges.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I have applied the suggested refactoring.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I agree that the refactored code is much smaller/simplified now, I feel the previous version was more organized and readable as it's now a huge method with multiple if else blocks.

Moreover, I see that implementing RangeSets.Consumer<C> is the preferred method both in Hive (org.apache.hadoop.hive.ql.optimizer.calcite.RangeConverter) and in Calcite (several places). If the new code is not significantly more performant than the earlier version, then maybe we should keep things familiar?

Another small benefit of implementing RangeSets.Consumer<C> is it will be easily searchable from IDE by looking for all subclasses.

BTW, I am willing to approve this as-is, but just wanted to hear both of your thoughts on this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no strong opinion here.
I agree that the original version with the RangeSets.Consumer<C> seemed a bit better in terms of code homogeneity and maintainability (easier to find if we ever need to apply adjustments to all rangeSet processing, like the separate, incoming HIVE-28911).
So I'm willing to move back to the Consumer approach, wdyt @thomasrebele ?

Copy link
Copy Markdown
Contributor

@thomasrebele thomasrebele May 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a few places in Calcite that iterate over sarg.rangeSet.asRanges() without the Consumer:

The places where a RangeSets.Consumer<C> is used in Calcite, there is an easy mapping from the different range types to a distinct action. Hive always uses the sarg.rangeSet with a RangeSets.Consumer. However, I could only find one usage, and it was introduced by @soumyakanti3578, so I'm not sure whether the opinion is unbiased :) The usages (including those from the Consumer) can be found in the IDE by looking at the usages of org.apache.calcite.util.Sarg#rangeSet.

I had a try simplifying the code a bit, see thomasrebele@29cb98b. It's a bit less efficient than Ruben's proposal. It might be a bit more readable.

Copy link
Copy Markdown
Contributor

@soumyakanti3578 soumyakanti3578 May 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks guys for your thoughts on this. Since this is a fairly localized change, either approach is fine, so we can keep the code as-is.
Thanks Thomas for the suggestion. It definitely looks cleaner but I agree that it's probably less efficient than the current implementation, so maybe we can keep the current version, or I will leave it up to Ruben.

P.S.: I can't take all the credit for the changes in RangeConverter as that was a group effort :D

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review @soumyakanti3578 and @thomasrebele ! I have pushed some minor changes, including the approach proposed by @thomasrebele , which makes the code less complex.

@Override
public void all() {
selectivityList.add(1.0);
}

@Override
public void singleton(C value) {
inLiterals.add(rexBuilder.makeLiteral(value, operandType, true, true));
}

@Override
public void atLeast(C lower) {
Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower));
if (lowerLiteral.isEmpty()) {
selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
} else {
processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
Range.range(lowerLiteral.get(), BoundType.CLOSED, Float.POSITIVE_INFINITY, BoundType.CLOSED));
}
}

@Override
public void atMost(C upper) {
Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper));
if (upperLiteral.isEmpty()) {
selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
} else {
processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, upperLiteral.get(), BoundType.CLOSED));
}
}

@Override
public void greaterThan(C lower) {
Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower));
if (lowerLiteral.isEmpty()) {
selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
} else {
processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
Range.range(lowerLiteral.get(), BoundType.OPEN, Float.POSITIVE_INFINITY, BoundType.CLOSED));
}
}

@Override
public void lessThan(C upper) {
Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper));
if (upperLiteral.isEmpty()) {
selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY);
} else {
processRange(() -> DEFAULT_COMPARISON_SELECTIVITY,
Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, upperLiteral.get(), BoundType.OPEN));
}
}

@Override
public void closed(C lower, C upper) {
processRange(lower, BoundType.CLOSED, upper, BoundType.CLOSED);
}

@Override
public void closedOpen(C lower, C upper) {
processRange(lower, BoundType.CLOSED, upper, BoundType.OPEN);
}

@Override
public void openClosed(C lower, C upper) {
processRange(lower, BoundType.OPEN, upper, BoundType.CLOSED);
}

@Override
public void open(C lower, C upper) {
processRange(lower, BoundType.OPEN, upper, BoundType.OPEN);
}

private void processRange(C lower, BoundType lowerBoundType, C upper, BoundType upperBoundType) {
RexNode lowerRexLiteral = makeLiteral(lower);
RexNode upperRexLiteral = makeLiteral(upper);
Supplier<Double> defaultSelectivity =
() -> computeFunctionSelectivity(List.of(ref, lowerRexLiteral, upperRexLiteral));
Optional<Float> lowerLiteral = extractLiteral(lowerRexLiteral);
Optional<Float> upperLiteral = extractLiteral(upperRexLiteral);
if (lowerLiteral.isEmpty() || upperLiteral.isEmpty()) {
selectivityList.add(defaultSelectivity.get());
} else {
processRange(defaultSelectivity,
Range.range(lowerLiteral.get(), lowerBoundType, upperLiteral.get(), upperBoundType));
}
}

private void processRange(Supplier<Double> defaultSelectivity, Range<Float> boundaries) {
selectivityList.add(computeRangePredicateSelectivity(defaultSelectivity, ref, boundaries));
}
});

switch (inLiterals.size()) {
case 0:
break;
case 1:
selectivityList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, inLiterals.get(0))
.accept(FilterSelectivityEstimator.this));
break;
default:
List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1);
operands.add(ref);
operands.addAll(inLiterals);
selectivityList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this));
}

return selectivityList.size() == 1 ? selectivityList.get(0) : computeDisjunctionSelectivity(selectivityList);
Comment thread
rubenada marked this conversation as resolved.
Outdated
}
}

/**
* NDV of "f1(x, y, z) != f2(p, q, r)" ->
* "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)".
Expand Down Expand Up @@ -633,7 +780,11 @@ private Double computeNotEqualitySelectivity(RexCall call) {
* @return
*/
private Double computeFunctionSelectivity(RexCall call) {
Double tmpNDV = getMaxNDV(call);
return computeFunctionSelectivity(call.getOperands());
}

private Double computeFunctionSelectivity(List<RexNode> operands) {
Double tmpNDV = getMaxNDV(operands);
if (tmpNDV == null) {
// Could not be computed
return null;
Expand All @@ -653,12 +804,20 @@ private Double computeFunctionSelectivity(RexCall call) {
* @return
*/
private Double computeDisjunctionSelectivity(RexCall call) {
List<Double> selectivityList = new ArrayList<>(call.getOperands().size());
for (RexNode dje : call.getOperands()) {
selectivityList.add(dje.accept(this));
}
return computeDisjunctionSelectivity(selectivityList);
}

private double computeDisjunctionSelectivity(List<Double> selectivityList) {
Double tmpCardinality;
Double tmpSelectivity;
double selectivity = 1;

for (RexNode dje : call.getOperands()) {
tmpSelectivity = dje.accept(this);
for (Double sel : selectivityList) {
tmpSelectivity = sel;
if (tmpSelectivity == null) {
tmpSelectivity = 0.99;
}
Expand Down Expand Up @@ -729,10 +888,14 @@ private long getMaxNulls(RexCall call, HiveTableScan t) {
}

private Double getMaxNDV(RexCall call) {
return getMaxNDV(call.getOperands());
}

private Double getMaxNDV(List<RexNode> operands) {
Double tmpNDV;
double maxNDV = 1.0;
InputReferencedVisitor irv;
for (RexNode op : call.getOperands()) {
for (RexNode op : operands) {
if (op instanceof RexInputRef) {
tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel, mq,
((RexInputRef) op).getIndex());
Expand Down
Loading
Loading