-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-29479: Improve histogram-based selectivity estimation for two-sided range predicates #6477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
75980cd
5164caf
0d9a543
294bf14
fad09cf
47366b3
470bf75
06f48c6
d7dee73
a0ad27e
5f0a503
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.function.Supplier; | ||
|
|
||
| import com.google.common.collect.BoundType; | ||
| import com.google.common.collect.Range; | ||
|
|
@@ -44,9 +45,12 @@ | |
| import org.apache.calcite.rex.RexUtil; | ||
| import org.apache.calcite.rex.RexVisitorImpl; | ||
| import org.apache.calcite.sql.SqlKind; | ||
| import org.apache.calcite.sql.fun.SqlStdOperatorTable; | ||
| import org.apache.calcite.sql.type.SqlTypeName; | ||
| import org.apache.calcite.sql.type.SqlTypeUtil; | ||
| import org.apache.calcite.util.ImmutableBitSet; | ||
| import org.apache.calcite.util.RangeSets; | ||
| import org.apache.calcite.util.Sarg; | ||
| import org.apache.datasketches.kll.KllFloatsSketch; | ||
| import org.apache.datasketches.memory.Memory; | ||
| import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; | ||
|
|
@@ -66,7 +70,7 @@ public class FilterSelectivityEstimator extends RexVisitorImpl<Double> { | |
| protected static final Logger LOG = LoggerFactory.getLogger(FilterSelectivityEstimator.class); | ||
|
|
||
| private final RelNode childRel; | ||
| private final double childCardinality; | ||
| private final double childCardinality; | ||
| private final RelMetadataQuery mq; | ||
| private final RexBuilder rexBuilder; | ||
|
|
||
|
|
@@ -113,8 +117,6 @@ public Double visitCall(RexCall call) { | |
| selectivity = computeConjunctionSelectivity(call); | ||
| break; | ||
| } | ||
| case SEARCH: | ||
| return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.FALSE).transform().accept(this); | ||
| case OR: { | ||
| selectivity = computeDisjunctionSelectivity(call); | ||
| break; | ||
|
|
@@ -159,14 +161,18 @@ public Double visitCall(RexCall call) { | |
| case GREATER_THAN_OR_EQUAL: | ||
| case LESS_THAN: | ||
| case GREATER_THAN: { | ||
| selectivity = computeRangePredicateSelectivity(call, call.getKind()); | ||
| selectivity = computeComparisonPredicateSelectivity(call, call.getKind()); | ||
| break; | ||
| } | ||
|
|
||
| case BETWEEN: | ||
| selectivity = computeBetweenPredicateSelectivity(call); | ||
| break; | ||
|
|
||
| case SEARCH: | ||
| selectivity = computeSearchSelectivity(call); | ||
| break; | ||
|
|
||
| default: | ||
| if (HiveIn.INSTANCE.equals(call.op)) { | ||
| // TODO: 1) check for duplicates 2) We assume in clause values to be | ||
|
|
@@ -226,7 +232,7 @@ public Double visitCall(RexCall call) { | |
| * @return true if the expression is a removable cast, false otherwise | ||
| */ | ||
| private boolean isRemovableCast(RexNode exp, HiveTableScan tableScan) { | ||
| if(SqlKind.CAST != exp.getKind()) { | ||
| if (SqlKind.CAST != exp.getKind()) { | ||
|
rubenada marked this conversation as resolved.
Outdated
|
||
| return false; | ||
| } | ||
| RexCall cast = (RexCall) exp; | ||
|
|
@@ -405,8 +411,10 @@ private static Range<Float> makeRange(float lower, float upper, BoundType upperT | |
| return lower > upper ? Range.closedOpen(0f, 0f) : Range.range(lower, BoundType.CLOSED, upper, upperType); | ||
| } | ||
|
|
||
| private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { | ||
| double defaultSelectivity = ((double) 1 / (double) 3); | ||
| private static final Double DEFAULT_COMPARISON_SELECTIVITY = ((double) 1 / (double) 3); | ||
|
rubenada marked this conversation as resolved.
Outdated
|
||
|
|
||
| private double computeComparisonPredicateSelectivity(RexCall call, SqlKind op) { | ||
| double defaultSelectivity = DEFAULT_COMPARISON_SELECTIVITY; | ||
| if (!(childRel instanceof HiveTableScan)) { | ||
| return defaultSelectivity; | ||
| } | ||
|
|
@@ -440,34 +448,56 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) { | |
| boundaryValues[boundaryIdx] = value; | ||
| inclusive[boundaryIdx] = openBound ? BoundType.OPEN : BoundType.CLOSED; | ||
| Range<Float> boundaries = Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1]); | ||
|
|
||
| // extract the column index from the other operator | ||
| final HiveTableScan scan = (HiveTableScan) childRel; | ||
| int inputRefOpIndex = 1 - literalOpIdx; | ||
| RexNode node = operands.get(inputRefOpIndex); | ||
| if (isRemovableCast(node, scan)) { | ||
| Range<Float> typeRange = getRangeOfType(node.getType()); | ||
| boundaries = adjustRangeToType(boundaries, node.getType(), typeRange); | ||
| return computeRangePredicateSelectivity(() -> defaultSelectivity, node, boundaries); | ||
| } | ||
|
|
||
| private double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand, | ||
| Range<Float> boundaries) { | ||
| return computeRangePredicateSelectivity(defaultSelectivity, operand, boundaries, false); | ||
| } | ||
|
|
||
| /** | ||
| * Computes the selectivity of an operand in a certain range trying to leverage the histogram information. | ||
| * Returns the default selectivity if the histogram is not available. | ||
| */ | ||
| private double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand, | ||
| Range<Float> boundaries, boolean inverseBool /* true only for NOT_BETWEEN */ ) { | ||
| if (!(childRel instanceof HiveTableScan)) { | ||
| return defaultSelectivity.get(); | ||
| } | ||
|
|
||
| node = RexUtil.removeCast(node); | ||
| final HiveTableScan scan = (HiveTableScan) childRel; | ||
| Range<Float> typeRange = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null; | ||
| if (isRemovableCast(operand, scan)) { | ||
| typeRange = getRangeOfType(operand.getType()); | ||
| boundaries = adjustRangeToType(boundaries, operand.getType(), typeRange); | ||
| operand = RexUtil.removeCast(operand); | ||
| } | ||
|
|
||
| int inputRefIndex = -1; | ||
| if (node.getKind().equals(SqlKind.INPUT_REF)) { | ||
| inputRefIndex = ((RexInputRef) node).getIndex(); | ||
| if (operand.getKind().equals(SqlKind.INPUT_REF)) { | ||
| inputRefIndex = ((RexInputRef) operand).getIndex(); | ||
| } | ||
|
|
||
| if (inputRefIndex < 0) { | ||
| return defaultSelectivity; | ||
| return defaultSelectivity.get(); | ||
| } | ||
|
|
||
| final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex)); | ||
| if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) { | ||
| return defaultSelectivity; | ||
| return defaultSelectivity.get(); | ||
| } | ||
|
|
||
| final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); | ||
| double rawSelectivity = rangedSelectivity(kll, boundaries); | ||
| if (inverseBool) { | ||
| // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted | ||
| // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast) | ||
| double typeRangeSelectivity = rangedSelectivity(kll, typeRange); | ||
| rawSelectivity = typeRangeSelectivity - rawSelectivity; | ||
| } | ||
| return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); | ||
| } | ||
|
|
||
|
|
@@ -511,7 +541,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { | |
| Optional<Float> rightLiteral = extractLiteral(operands.get(3)); | ||
|
|
||
| if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) { | ||
| final HiveTableScan scan = (HiveTableScan) childRel; | ||
| float leftValue = leftLiteral.get(); | ||
| float rightValue = rightLiteral.get(); | ||
|
|
||
|
|
@@ -522,36 +551,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) { | |
| } | ||
|
|
||
| Range<Float> rangeBoundaries = makeRange(leftValue, rightValue, BoundType.CLOSED); | ||
| Range<Float> typeBoundaries = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null; | ||
|
|
||
| RexNode expr = operands.get(1); // expr to be checked by the BETWEEN | ||
| if (isRemovableCast(expr, scan)) { | ||
| typeBoundaries = getRangeOfType(expr.getType()); | ||
| rangeBoundaries = adjustRangeToType(rangeBoundaries, expr.getType(), typeBoundaries); | ||
| expr = RexUtil.removeCast(expr); | ||
| } | ||
|
|
||
| int inputRefIndex = -1; | ||
| if (expr.getKind().equals(SqlKind.INPUT_REF)) { | ||
| inputRefIndex = ((RexInputRef) expr).getIndex(); | ||
| } | ||
|
|
||
| if (inputRefIndex < 0) { | ||
| return computeFunctionSelectivity(call); | ||
| } | ||
|
|
||
| final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex)); | ||
| if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) { | ||
| final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram())); | ||
| double rawSelectivity = rangedSelectivity(kll, rangeBoundaries); | ||
| if (inverseBool) { | ||
| // when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted | ||
| // if there's a cast, the inversion is with respect to its codomain (range of the values of the cast) | ||
| double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries); | ||
| rawSelectivity = typeRangeSelectivity - rawSelectivity; | ||
| } | ||
| return scaleSelectivityToNullableValues(kll, rawSelectivity, scan); | ||
| } | ||
| return computeRangePredicateSelectivity(() -> computeFunctionSelectivity(call), expr, rangeBoundaries, | ||
| inverseBool); | ||
| } | ||
| return computeFunctionSelectivity(call); | ||
| } | ||
|
|
@@ -603,6 +605,151 @@ private Optional<Float> extractLiteral(SqlTypeName typeName, Object boundValueOb | |
| return Optional.of(value); | ||
| } | ||
|
|
||
| private double computeSearchSelectivity(RexCall search) { | ||
| return new SearchSelectivityHelper<>(search).compute(); | ||
| } | ||
|
|
||
| /** | ||
| * Similar to {@link SearchTransformer}, but computing the selectivity of the expression. | ||
| */ | ||
| private class SearchSelectivityHelper<C extends Comparable<C>> { | ||
| private final RexNode ref; | ||
| private final Sarg<C> sarg; | ||
| private final RelDataType operandType; | ||
|
|
||
| private SearchSelectivityHelper(RexCall search) { | ||
| ref = search.getOperands().get(0); | ||
| RexLiteral literal = (RexLiteral) search.operands.get(1); | ||
| sarg = Objects.requireNonNull(literal.getValueAs(Sarg.class), "Sarg"); | ||
| operandType = literal.getType(); | ||
| } | ||
|
|
||
| private RexNode makeLiteral(C value) { | ||
| return rexBuilder.makeLiteral(value, operandType, true, true); | ||
| } | ||
|
|
||
| private double compute() { | ||
| final List<Double> selectivityList = new ArrayList<>(); | ||
| final List<RexNode> inLiterals = new ArrayList<>(); | ||
|
|
||
| if (sarg.nullAs == RexUnknownAs.TRUE) { | ||
| selectivityList.add( | ||
| rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this)); | ||
| } | ||
|
|
||
| RangeSets.forEach(sarg.rangeSet, new RangeSets.Consumer<C>() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe the code could be simplified with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point! I have applied the suggested refactoring.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I agree that the refactored code is much smaller/simplified now, I feel the previous version was more organized and readable as it's now a huge method with multiple Moreover, I see that implementing Another small benefit of implementing BTW, I am willing to approve this as-is, but just wanted to hear both of your thoughts on this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have no strong opinion here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are a few places in Calcite that iterate over The places where a I had a try simplifying the code a bit, see thomasrebele@29cb98b. It's a bit less efficient than Ruben's proposal. It might be a bit more readable.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks guys for your thoughts on this. Since this is a fairly localized change, either approach is fine, so we can keep the code as-is. P.S.: I can't take all the credit for the changes in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review @soumyakanti3578 and @thomasrebele ! I have pushed some minor changes, including the approach proposed by @thomasrebele , which makes the code less complex. |
||
| @Override | ||
| public void all() { | ||
| selectivityList.add(1.0); | ||
| } | ||
|
|
||
| @Override | ||
| public void singleton(C value) { | ||
| inLiterals.add(rexBuilder.makeLiteral(value, operandType, true, true)); | ||
| } | ||
|
|
||
| @Override | ||
| public void atLeast(C lower) { | ||
| Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower)); | ||
| if (lowerLiteral.isEmpty()) { | ||
| selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY); | ||
| } else { | ||
| processRange(() -> DEFAULT_COMPARISON_SELECTIVITY, | ||
| Range.range(lowerLiteral.get(), BoundType.CLOSED, Float.POSITIVE_INFINITY, BoundType.CLOSED)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void atMost(C upper) { | ||
| Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper)); | ||
| if (upperLiteral.isEmpty()) { | ||
| selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY); | ||
| } else { | ||
| processRange(() -> DEFAULT_COMPARISON_SELECTIVITY, | ||
| Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, upperLiteral.get(), BoundType.CLOSED)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void greaterThan(C lower) { | ||
| Optional<Float> lowerLiteral = extractLiteral(makeLiteral(lower)); | ||
| if (lowerLiteral.isEmpty()) { | ||
| selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY); | ||
| } else { | ||
| processRange(() -> DEFAULT_COMPARISON_SELECTIVITY, | ||
| Range.range(lowerLiteral.get(), BoundType.OPEN, Float.POSITIVE_INFINITY, BoundType.CLOSED)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void lessThan(C upper) { | ||
| Optional<Float> upperLiteral = extractLiteral(makeLiteral(upper)); | ||
| if (upperLiteral.isEmpty()) { | ||
| selectivityList.add(DEFAULT_COMPARISON_SELECTIVITY); | ||
| } else { | ||
| processRange(() -> DEFAULT_COMPARISON_SELECTIVITY, | ||
| Range.range(Float.NEGATIVE_INFINITY, BoundType.CLOSED, upperLiteral.get(), BoundType.OPEN)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void closed(C lower, C upper) { | ||
| processRange(lower, BoundType.CLOSED, upper, BoundType.CLOSED); | ||
| } | ||
|
|
||
| @Override | ||
| public void closedOpen(C lower, C upper) { | ||
| processRange(lower, BoundType.CLOSED, upper, BoundType.OPEN); | ||
| } | ||
|
|
||
| @Override | ||
| public void openClosed(C lower, C upper) { | ||
| processRange(lower, BoundType.OPEN, upper, BoundType.CLOSED); | ||
| } | ||
|
|
||
| @Override | ||
| public void open(C lower, C upper) { | ||
| processRange(lower, BoundType.OPEN, upper, BoundType.OPEN); | ||
| } | ||
|
|
||
| private void processRange(C lower, BoundType lowerBoundType, C upper, BoundType upperBoundType) { | ||
| RexNode lowerRexLiteral = makeLiteral(lower); | ||
| RexNode upperRexLiteral = makeLiteral(upper); | ||
| Supplier<Double> defaultSelectivity = | ||
| () -> computeFunctionSelectivity(List.of(ref, lowerRexLiteral, upperRexLiteral)); | ||
| Optional<Float> lowerLiteral = extractLiteral(lowerRexLiteral); | ||
| Optional<Float> upperLiteral = extractLiteral(upperRexLiteral); | ||
| if (lowerLiteral.isEmpty() || upperLiteral.isEmpty()) { | ||
| selectivityList.add(defaultSelectivity.get()); | ||
| } else { | ||
| processRange(defaultSelectivity, | ||
| Range.range(lowerLiteral.get(), lowerBoundType, upperLiteral.get(), upperBoundType)); | ||
| } | ||
| } | ||
|
|
||
| private void processRange(Supplier<Double> defaultSelectivity, Range<Float> boundaries) { | ||
| selectivityList.add(computeRangePredicateSelectivity(defaultSelectivity, ref, boundaries)); | ||
| } | ||
| }); | ||
|
|
||
| switch (inLiterals.size()) { | ||
| case 0: | ||
| break; | ||
| case 1: | ||
| selectivityList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, inLiterals.get(0)) | ||
| .accept(FilterSelectivityEstimator.this)); | ||
| break; | ||
| default: | ||
| List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1); | ||
| operands.add(ref); | ||
| operands.addAll(inLiterals); | ||
| selectivityList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this)); | ||
| } | ||
|
|
||
| return selectivityList.size() == 1 ? selectivityList.get(0) : computeDisjunctionSelectivity(selectivityList); | ||
|
rubenada marked this conversation as resolved.
Outdated
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * NDV of "f1(x, y, z) != f2(p, q, r)" -> | ||
| * "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)". | ||
|
|
@@ -633,7 +780,11 @@ private Double computeNotEqualitySelectivity(RexCall call) { | |
| * @return | ||
| */ | ||
| private Double computeFunctionSelectivity(RexCall call) { | ||
| Double tmpNDV = getMaxNDV(call); | ||
| return computeFunctionSelectivity(call.getOperands()); | ||
| } | ||
|
|
||
| private Double computeFunctionSelectivity(List<RexNode> operands) { | ||
| Double tmpNDV = getMaxNDV(operands); | ||
| if (tmpNDV == null) { | ||
| // Could not be computed | ||
| return null; | ||
|
|
@@ -653,12 +804,20 @@ private Double computeFunctionSelectivity(RexCall call) { | |
| * @return | ||
| */ | ||
| private Double computeDisjunctionSelectivity(RexCall call) { | ||
| List<Double> selectivityList = new ArrayList<>(call.getOperands().size()); | ||
| for (RexNode dje : call.getOperands()) { | ||
| selectivityList.add(dje.accept(this)); | ||
| } | ||
| return computeDisjunctionSelectivity(selectivityList); | ||
| } | ||
|
|
||
| private double computeDisjunctionSelectivity(List<Double> selectivityList) { | ||
| Double tmpCardinality; | ||
| Double tmpSelectivity; | ||
| double selectivity = 1; | ||
|
|
||
| for (RexNode dje : call.getOperands()) { | ||
| tmpSelectivity = dje.accept(this); | ||
| for (Double sel : selectivityList) { | ||
| tmpSelectivity = sel; | ||
| if (tmpSelectivity == null) { | ||
| tmpSelectivity = 0.99; | ||
| } | ||
|
|
@@ -729,10 +888,14 @@ private long getMaxNulls(RexCall call, HiveTableScan t) { | |
| } | ||
|
|
||
| private Double getMaxNDV(RexCall call) { | ||
| return getMaxNDV(call.getOperands()); | ||
| } | ||
|
|
||
| private Double getMaxNDV(List<RexNode> operands) { | ||
| Double tmpNDV; | ||
| double maxNDV = 1.0; | ||
| InputReferencedVisitor irv; | ||
| for (RexNode op : call.getOperands()) { | ||
| for (RexNode op : operands) { | ||
| if (op instanceof RexInputRef) { | ||
| tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel, mq, | ||
| ((RexInputRef) op).getIndex()); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.