/**
 * Determine the partition group that has the maximum intersection in terms of the estimated
 * ndv of the partition exprs with groupingExprs. That partition group is placed at the front
 * of partitionGroups, with its partition exprs reduced to the intersection, and the
 * intersecting groupingExprs are returned in inputPartitionExprs.
 */
private void computeInputPartitionExprs(List<PartitionGroup> partitionGroups,
    List<Expr> groupingExprs, int numNodes, List<Expr> inputPartitionExprs) {
  inputPartitionExprs.clear();
  // find partition group with maximum intersection
  long maxNdv = 0;
  PartitionGroup maxPg = null;
  List<Expr> maxGroupingExprs = null;
  for (PartitionGroup pg: partitionGroups) {
    List<Expr> l1 = Lists.newArrayList();
    List<Expr> l2 = Lists.newArrayList();
    Expr.intersect(analyzer_, pg.partitionByExprs, groupingExprs,
        analyzer_.getEquivClassSmap(), l1, l2);
    // TODO: also look at l2 and take the max?
    long ndv = Expr.getNumDistinctValues(l1);
    if (ndv < 0 || ndv < numNodes || ndv < maxNdv) continue;
    // found a better partition group
    maxPg = pg;
    maxPg.partitionByExprs = l1;
    maxGroupingExprs = l2;
    maxNdv = ndv;
  }

  if (maxNdv > numNodes) {
    Preconditions.checkNotNull(maxPg);
    // we found a partition group that gives us enough parallelism;
    // move it to the front
    partitionGroups.remove(maxPg);
    partitionGroups.add(0, maxPg);
    inputPartitionExprs.addAll(maxGroupingExprs);
  }
}
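// Illustrative sketch (not part of the planner): the selection heuristic above, reduced to
// plain NDV estimates instead of Exprs. The method name and signature are hypothetical; it
// only demonstrates the "largest intersection NDV, but at least numNodes" rule that
// computeInputPartitionExprs() applies when choosing a partition group.
private static int pickMaxNdvGroupExample(long[] intersectionNdvs, int numNodes) {
  long maxNdv = 0;
  int bestIdx = -1;
  for (int i = 0; i < intersectionNdvs.length; ++i) {
    long ndv = intersectionNdvs[i];
    // skip unusable estimates, groups that can't parallelize across all nodes,
    // and groups that are no better than the current best
    if (ndv < 0 || ndv < numNodes || ndv < maxNdv) continue;
    bestIdx = i;
    maxNdv = ndv;
  }
  // only commit to a group if it provides enough parallelism
  return maxNdv > numNodes ? bestIdx : -1;
}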
// Append a flattened version of this plan node, including all children, to 'container'.
private void treeToThriftHelper(TPlan container) {
  TPlanNode msg = new TPlanNode();
  msg.node_id = id_.asInt();
  msg.limit = limit_;

  TExecStats estimatedStats = new TExecStats();
  estimatedStats.setCardinality(cardinality_);
  estimatedStats.setMemory_used(perHostMemCost_);
  msg.setLabel(getDisplayLabel());
  msg.setLabel_detail(getDisplayLabelDetail());
  msg.setEstimated_stats(estimatedStats);

  msg.setRow_tuples(Lists.<Integer>newArrayListWithCapacity(tupleIds_.size()));
  msg.setNullable_tuples(Lists.<Boolean>newArrayListWithCapacity(tupleIds_.size()));
  for (TupleId tid: tupleIds_) {
    msg.addToRow_tuples(tid.asInt());
    msg.addToNullable_tuples(nullableTupleIds_.contains(tid));
  }
  for (Expr e: conjuncts_) {
    msg.addToConjuncts(e.treeToThrift());
  }
  toThrift(msg);
  container.addToNodes(msg);
  // For the purpose of the BE, consider ExchangeNodes to have no children.
  if (this instanceof ExchangeNode) {
    msg.num_children = 0;
    return;
  } else {
    msg.num_children = children_.size();
    for (PlanNode child: children_) {
      child.treeToThriftHelper(container);
    }
  }
}
/**
 * Create a predicate that checks if all exprs are equal or both sides are null.
 */
private Expr createNullMatchingEquals(List<Expr> exprs, TupleId inputTid,
    ExprSubstitutionMap bufferedSmap) {
  Preconditions.checkState(!exprs.isEmpty());
  Expr result = createNullMatchingEqualsAux(exprs, 0, inputTid, bufferedSmap);
  result.analyzeNoThrow(analyzer_);
  return result;
}
/**
 * Create SortInfo, including sort tuple, to sort entire input row on sortExprs.
 */
private SortInfo createSortInfo(PlanNode input, List<Expr> sortExprs,
    List<Boolean> isAsc, List<Boolean> nullsFirst) {
  // create tuple for sort output = the entire materialized input in a single tuple
  TupleDescriptor sortTupleDesc = analyzer_.getDescTbl().createTupleDescriptor("sort-tuple");
  ExprSubstitutionMap sortSmap = new ExprSubstitutionMap();
  List<Expr> sortSlotExprs = Lists.newArrayList();
  sortTupleDesc.setIsMaterialized(true);
  for (TupleId tid: input.getTupleIds()) {
    TupleDescriptor tupleDesc = analyzer_.getTupleDesc(tid);
    for (SlotDescriptor inputSlotDesc: tupleDesc.getSlots()) {
      if (!inputSlotDesc.isMaterialized()) continue;
      SlotDescriptor sortSlotDesc =
          analyzer_.copySlotDescriptor(inputSlotDesc, sortTupleDesc);
      // all output slots need to be materialized
      sortSlotDesc.setIsMaterialized(true);
      sortSmap.put(new SlotRef(inputSlotDesc), new SlotRef(sortSlotDesc));
      sortSlotExprs.add(new SlotRef(inputSlotDesc));
    }
  }

  SortInfo sortInfo = new SortInfo(
      Expr.substituteList(sortExprs, sortSmap, analyzer_, false), isAsc, nullsFirst);
  LOG.trace("sortinfo exprs: " + Expr.debugString(sortInfo.getOrderingExprs()));
  sortInfo.setMaterializedTupleInfo(sortTupleDesc, sortSlotExprs);
  return sortInfo;
}
/**
 * Marks all slots referenced in exprs as materialized.
 */
protected void markSlotsMaterialized(Analyzer analyzer, List<Expr> exprs) {
  List<SlotId> refdIdList = Lists.newArrayList();
  for (Expr expr: exprs) {
    expr.getIds(null, refdIdList);
  }
  analyzer.getDescTbl().markSlotsMaterialized(refdIdList);
}
/**
 * Compute the product of the selectivities of all conjuncts.
 */
protected double computeSelectivity() {
  double prod = 1.0;
  for (Expr e: conjuncts_) {
    if (e.getSelectivity() < 0) continue;
    prod *= e.getSelectivity();
  }
  return prod;
}
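// Worked example (hypothetical helper, not planner code): with conjunct selectivities
// {0.1, -1 (unknown), 0.5}, the unknown value is skipped and the combined selectivity
// is 0.1 * 0.5 = 0.05, i.e. the conjuncts are treated as independent.
private static double selectivityProductExample() {
  double[] selectivities = {0.1, -1.0, 0.5};
  double prod = 1.0;
  for (double s : selectivities) {
    if (s < 0) continue;  // unknown selectivity; ignore it
    prod *= s;
  }
  return prod;  // 0.05
}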
public String getExplainString(TExplainLevel explainLevel) {
  StringBuilder str = new StringBuilder();
  str.append(type.toString());
  if (!partitionExprs.isEmpty()) {
    List<String> strings = Lists.newArrayList();
    for (Expr expr: partitionExprs) {
      strings.add(expr.toSql());
    }
    str.append(": " + Joiner.on(", ").join(strings));
  }
  str.append("\n");
  return str.toString();
}
@Override
protected void toThrift(TPlanNode msg) {
  msg.node_type = TPlanNodeType.AGGREGATION_NODE;
  msg.agg_node = new TAggregationNode(
      Expr.treesToThrift(aggInfo.getAggregateExprs()),
      aggInfo.getAggTupleId().asInt(), needsFinalize);
  List<Expr> groupingExprs = aggInfo.getGroupingExprs();
  if (groupingExprs != null) {
    msg.agg_node.setGrouping_exprs(Expr.treesToThrift(groupingExprs));
  }
}
public TDataPartition toThrift() {
  TDataPartition result = new TDataPartition(type);
  if (partitionExprs != null) {
    result.setPartition_exprs(Expr.treesToThrift(partitionExprs));
  }
  return result;
}
protected String debugString() {
  // not using Objects.toStringHelper() because PlanNode.debugString() is embedded
  // by the debug strings of the subclasses
  StringBuilder output = new StringBuilder();
  output.append("preds=" + Expr.debugString(conjuncts_));
  output.append(" limit=" + Long.toString(limit_));
  return output.toString();
}
/**
 * Copy c'tor. Also passes in new id_.
 */
protected PlanNode(PlanNodeId id, PlanNode node, String displayName) {
  id_ = id;
  limit_ = node.limit_;
  tupleIds_ = Lists.newArrayList(node.tupleIds_);
  tblRefIds_ = Lists.newArrayList(node.tblRefIds_);
  nullableTupleIds_ = Sets.newHashSet(node.nullableTupleIds_);
  conjuncts_ = Expr.cloneList(node.conjuncts_);
  cardinality_ = -1;
  numNodes_ = -1;
  displayName_ = displayName;
}
/**
 * True if the partition exprs, ordering elements, and window of analyticExpr match ours.
 */
public boolean isCompatible(AnalyticExpr analyticExpr) {
  if (requiresIndependentEval(analyticExprs.get(0))
      || requiresIndependentEval(analyticExpr)) {
    return false;
  }

  if (!Expr.equalSets(analyticExpr.getPartitionExprs(), partitionByExprs)) {
    return false;
  }
  if (!analyticExpr.getOrderByElements().equals(orderByElements)) return false;
  if ((window == null) != (analyticExpr.getWindow() == null)) return false;
  if (window == null) return true;
  return analyticExpr.getWindow().equals(window);
}
public THdfsPartition toThrift() {
  List<TExpr> thriftExprs = Expr.treesToThrift(getPartitionValues());
  return new THdfsPartition(
      (byte) fileFormatDescriptor.getLineDelim(),
      (byte) fileFormatDescriptor.getFieldDelim(),
      (byte) fileFormatDescriptor.getCollectionDelim(),
      (byte) fileFormatDescriptor.getMapKeyDelim(),
      (byte) fileFormatDescriptor.getEscapeChar(),
      fileFormatDescriptor.getFileFormat().toThrift(), thriftExprs,
      fileFormatDescriptor.getBlockSize(), fileFormatDescriptor.getCompression());
}
/**
 * Return true if 'this' and other have compatible partition exprs and
 * our orderByElements are a prefix of other's.
 */
public boolean isPrefixOf(SortGroup other) {
  if (other.orderByElements.size() > orderByElements.size()) return false;
  if (!Expr.equalSets(partitionByExprs, other.partitionByExprs)) return false;
  for (int i = 0; i < other.orderByElements.size(); ++i) {
    OrderByElement ob = orderByElements.get(i);
    OrderByElement otherOb = other.orderByElements.get(i);
    // TODO: compare equiv classes by comparing each equiv class's placeholder
    // slotref
    if (!ob.getExpr().equals(otherOb.getExpr())) return false;
    if (ob.isAsc() != otherOb.isAsc()) return false;
    if (ob.nullsFirst() != otherOb.nullsFirst()) return false;
  }
  return true;
}
/**
 * Create an unanalyzed predicate that checks if elements >= i are equal or
 * both sides are null.
 *
 * The predicate has the form
 * ((lhs[i] is null && rhs[i] is null) ||
 *   (lhs[i] is not null && rhs[i] is not null && lhs[i] = rhs[i]))
 * && <createEqualsAux(i + 1)>
 */
private Expr createNullMatchingEqualsAux(List<Expr> elements, int i,
    TupleId inputTid, ExprSubstitutionMap bufferedSmap) {
  if (i > elements.size() - 1) return new BoolLiteral(true);

  // compare elements[i]
  Expr lhs = elements.get(i);
  Preconditions.checkState(lhs.isBound(inputTid));
  Expr rhs = lhs.substitute(bufferedSmap, analyzer_, false);

  Expr bothNull = new CompoundPredicate(Operator.AND,
      new IsNullPredicate(lhs, false), new IsNullPredicate(rhs, false));
  Expr lhsEqRhsNotNull = new CompoundPredicate(Operator.AND,
      new CompoundPredicate(Operator.AND,
          new IsNullPredicate(lhs, true), new IsNullPredicate(rhs, true)),
      new BinaryPredicate(BinaryPredicate.Operator.EQ, lhs, rhs));
  Expr remainder = createNullMatchingEqualsAux(elements, i + 1, inputTid, bufferedSmap);
  return new CompoundPredicate(CompoundPredicate.Operator.AND,
      new CompoundPredicate(Operator.OR, bothNull, lhsEqRhsNotNull),
      remainder);
}
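// Illustrative sketch (hypothetical helper, not planner code): the per-element semantics of
// the generated predicate, i.e. SQL null-safe equality ("both null" or "both non-null and
// equal"). Object stands in for an arbitrary slot value; the recursion in
// createNullMatchingEqualsAux() simply ANDs one such comparison per element.
private static boolean nullMatchingEqualsExample(Object lhs, Object rhs) {
  boolean bothNull = (lhs == null) && (rhs == null);
  boolean bothNotNullAndEqual = (lhs != null) && (rhs != null) && lhs.equals(rhs);
  return bothNull || bothNotNullAndEqual;
}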
/**
 * Coalesce partition groups for which the intersection of their partition exprs has an
 * ndv estimate > numNodes, so that the resulting plan still parallelizes across all nodes.
 */
private void mergePartitionGroups(List<PartitionGroup> partitionGroups, int numNodes) {
  boolean hasMerged = false;
  do {
    hasMerged = false;
    for (PartitionGroup pg1: partitionGroups) {
      for (PartitionGroup pg2: partitionGroups) {
        if (pg1 != pg2) {
          long ndv = Expr.getNumDistinctValues(
              Expr.intersect(pg1.partitionByExprs, pg2.partitionByExprs));
          if (ndv == -1 || ndv < 0 || ndv < numNodes) {
            // didn't get a usable value or the number of partitions is too small
            continue;
          }
          pg1.merge(pg2);
          partitionGroups.remove(pg2);
          hasMerged = true;
          break;
        }
      }
      if (hasMerged) break;
    }
  } while (hasMerged);
}
public String debugString() {
  return Objects.toStringHelper(this)
      .add("type", type)
      .addValue(Expr.debugString(partitionExprs))
      .toString();
}
/**
 * Create HdfsPartition objects corresponding to 'partitions'.
 *
 * If there are no partitions in the Hive metadata, a single partition is added with no
 * partition keys.
 *
 * For files that have not been changed, reuses file descriptors from oldFileDescMap.
 */
private void loadPartitions(
    List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
    org.apache.hadoop.hive.metastore.api.Table msTbl,
    Map<String, FileDescriptor> oldFileDescMap) throws IOException, CatalogException {
  partitions_.clear();
  hdfsBaseDir_ = msTbl.getSd().getLocation();
  List<FileDescriptor> newFileDescs = Lists.newArrayList();

  // INSERT statements need to refer to this if they try to write to new partitions.
  // Scans don't refer to this because by definition all partitions they refer to exist.
  addDefaultPartition(msTbl.getSd());

  if (msTbl.getPartitionKeysSize() == 0) {
    Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
    // This table has no partition key, which means it has no declared partitions.
    // We model partitions slightly differently to Hive - every file must exist in a
    // partition, so add a single partition with no keys which will get all the
    // files in the table's root directory.
    addPartition(msTbl.getSd(), null, new ArrayList<LiteralExpr>(), oldFileDescMap,
        newFileDescs);
    Path location = new Path(hdfsBaseDir_);
    if (DFS.exists(location)) {
      accessLevel_ = getAvailableAccessLevel(location);
    }
  } else {
    // keep track of distinct partition key values and how many nulls there are
    Set<String>[] uniquePartitionKeys = new HashSet[numClusteringCols_];
    long[] numNullKeys = new long[numClusteringCols_];
    for (int i = 0; i < numClusteringCols_; ++i) {
      uniquePartitionKeys[i] = new HashSet<String>();
      numNullKeys[i] = 0;
    }

    for (org.apache.hadoop.hive.metastore.api.Partition msPartition: msPartitions) {
      // load key values
      List<LiteralExpr> keyValues = Lists.newArrayList();
      int i = 0;
      for (String partitionKey: msPartition.getValues()) {
        uniquePartitionKeys[i].add(partitionKey);
        // Deal with Hive's special NULL partition key.
        if (partitionKey.equals(nullPartitionKeyValue_)) {
          keyValues.add(new NullLiteral());
          ++numNullKeys[i];
        } else {
          ColumnType type = colsByPos_.get(keyValues.size()).getType();
          try {
            Expr expr = LiteralExpr.create(partitionKey, type);
            // Force the literal to be of the type declared in the metadata.
            expr = expr.castTo(type);
            keyValues.add((LiteralExpr) expr);
          } catch (AnalysisException ex) {
            LOG.warn("Failed to create literal expression of type: " + type, ex);
            throw new InvalidStorageDescriptorException(ex);
          }
        }
        ++i;
      }

      HdfsPartition partition = addPartition(msPartition.getSd(), msPartition,
          keyValues, oldFileDescMap, newFileDescs);
      // If the partition is null, its HDFS path does not exist, and it was not added
      // to this table's partition list. Skip the partition.
      if (partition == null) continue;

      if (msPartition.getParameters() != null) {
        partition.setNumRows(getRowCount(msPartition.getParameters()));
      }
      if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
        // TODO: READ_ONLY isn't exactly correct because it's possible the
        // partition does not have READ permissions either. When we start checking
        // whether we can READ from a table, this should be updated to set the
        // table's access level to the "lowest" effective level across all
        // partitions. That is, if one partition has READ_ONLY and another has
        // WRITE_ONLY the table's access level should be NONE.
        accessLevel_ = TAccessLevel.READ_ONLY;
      }
    }

    // update col stats for partition key cols
    for (int i = 0; i < numClusteringCols_; ++i) {
      ColumnStats stats = colsByPos_.get(i).getStats();
      stats.setNumNulls(numNullKeys[i]);
      stats.setNumDistinctValues(uniquePartitionKeys[i].size());
      LOG.debug("#col=" + Integer.toString(i) + " stats=" + stats.toString());
    }
  }

  if (newFileDescs.size() > 0) {
    loadBlockMd(newFileDescs);
  }
  uniqueHostPortsCount_ = countUniqueDataNetworkLocations(partitions_);
}
/**
 * Create plan tree for the entire sort group, including all contained window groups.
 * Marks the SortNode as requiring its input to be partitioned if partitionExprs is not
 * null (partitionExprs represent the data partition of the entire partition group of
 * which this sort group is a part).
 */
private PlanNode createSortGroupPlan(PlanNode root, SortGroup sortGroup,
    List<Expr> partitionExprs) throws ImpalaException {
  List<Expr> partitionByExprs = sortGroup.partitionByExprs;
  List<OrderByElement> orderByElements = sortGroup.orderByElements;
  ExprSubstitutionMap sortSmap = null;
  TupleId sortTupleId = null;
  TupleDescriptor bufferedTupleDesc = null;
  // map from input to buffered tuple
  ExprSubstitutionMap bufferedSmap = new ExprSubstitutionMap();

  // sort on partition by (pb) + order by (ob) exprs and create pb/ob predicates
  if (!partitionByExprs.isEmpty() || !orderByElements.isEmpty()) {
    // first sort on partitionExprs (direction doesn't matter)
    List<Expr> sortExprs = Lists.newArrayList(partitionByExprs);
    List<Boolean> isAsc =
        Lists.newArrayList(Collections.nCopies(sortExprs.size(), new Boolean(true)));
    // TODO: utilize a direction and nulls first/last that has benefit
    // for subsequent sort groups
    List<Boolean> nullsFirst =
        Lists.newArrayList(Collections.nCopies(sortExprs.size(), new Boolean(true)));

    // then sort on orderByExprs
    for (OrderByElement orderByElement: sortGroup.orderByElements) {
      sortExprs.add(orderByElement.getExpr());
      isAsc.add(orderByElement.isAsc());
      nullsFirst.add(orderByElement.getNullsFirstParam());
    }

    SortInfo sortInfo = createSortInfo(root, sortExprs, isAsc, nullsFirst);
    SortNode sortNode = new SortNode(idGenerator_.getNextId(), root, sortInfo, false, 0);

    // if this sort group does not have partitioning exprs, we want the sort
    // to be executed like a regular distributed sort
    if (!partitionByExprs.isEmpty()) sortNode.setIsAnalyticSort(true);

    if (partitionExprs != null) {
      // create required input partition
      DataPartition inputPartition = DataPartition.UNPARTITIONED;
      if (!partitionExprs.isEmpty()) {
        inputPartition =
            new DataPartition(TPartitionType.HASH_PARTITIONED, partitionExprs);
      }
      sortNode.setInputPartition(inputPartition);
    }

    root = sortNode;
    root.init(analyzer_);
    sortSmap = sortNode.getOutputSmap();

    // create bufferedTupleDesc and bufferedSmap
    sortTupleId = sortNode.tupleIds_.get(0);
    bufferedTupleDesc =
        analyzer_.getDescTbl().copyTupleDescriptor(sortTupleId, "buffered-tuple");
    LOG.trace("desctbl: " + analyzer_.getDescTbl().debugString());

    List<SlotDescriptor> inputSlots = analyzer_.getTupleDesc(sortTupleId).getSlots();
    List<SlotDescriptor> bufferedSlots = bufferedTupleDesc.getSlots();
    for (int i = 0; i < inputSlots.size(); ++i) {
      bufferedSmap.put(
          new SlotRef(inputSlots.get(i)), new SlotRef(bufferedSlots.get(i)));
    }
  }

  // create one AnalyticEvalNode per window group
  for (WindowGroup windowGroup: sortGroup.windowGroups) {
    // Create partition-by (pb) and order-by (ob) less-than predicates between the
    // input tuple (the output of the preceding sort) and a buffered tuple that is
    // identical to the input tuple. We need a different tuple descriptor for the
    // buffered tuple because the generated predicates should compare two different
    // tuple instances from the same input stream (i.e., the predicates should be
    // evaluated over a row that is composed of the input and the buffered tuple).

    // we need to remap the pb/ob exprs to a) the sort output, b) our buffer of the
    // sort input
    Expr partitionByEq = null;
    if (!windowGroup.partitionByExprs.isEmpty()) {
      partitionByEq = createNullMatchingEquals(
          Expr.substituteList(windowGroup.partitionByExprs, sortSmap, analyzer_, false),
          sortTupleId, bufferedSmap);
      LOG.trace("partitionByEq: " + partitionByEq.debugString());
    }
    Expr orderByEq = null;
    if (!windowGroup.orderByElements.isEmpty()) {
      orderByEq = createNullMatchingEquals(
          OrderByElement.getOrderByExprs(OrderByElement.substitute(
              windowGroup.orderByElements, sortSmap, analyzer_)),
          sortTupleId, bufferedSmap);
      LOG.trace("orderByEq: " + orderByEq.debugString());
    }

    root = new AnalyticEvalNode(idGenerator_.getNextId(), root, stmtTupleIds_,
        windowGroup.analyticFnCalls, windowGroup.partitionByExprs,
        windowGroup.orderByElements, windowGroup.window,
        analyticInfo_.getOutputTupleDesc(), windowGroup.physicalIntermediateTuple,
        windowGroup.physicalOutputTuple, windowGroup.logicalToPhysicalSmap,
        partitionByEq, orderByEq, bufferedTupleDesc);
    root.init(analyzer_);
  }
  return root;
}
/**
 * True if the partition and ordering exprs of windowGroup match ours.
 */
public boolean isCompatible(WindowGroup windowGroup) {
  return Expr.equalSets(windowGroup.partitionByExprs, partitionByExprs)
      && windowGroup.orderByElements.equals(orderByElements);
}
/**
 * True if the partition exprs of sortGroup are compatible with ours.
 * For now that means equality.
 */
public boolean isCompatible(SortGroup sortGroup) {
  return Expr.equalSets(sortGroup.partitionByExprs, partitionByExprs);
}
/**
 * Merge 'other' into 'this':
 * - partitionByExprs is the intersection of the two
 * - sortGroups becomes the union
 */
public void merge(PartitionGroup other) {
  partitionByExprs = Expr.intersect(partitionByExprs, other.partitionByExprs);
  Preconditions.checkState(Expr.getNumDistinctValues(partitionByExprs) >= 0);
  sortGroups.addAll(other.sortGroups);
}
/**
 * Sets outputSmap_ to compose(existing smap, combined child smap). Also substitutes
 * conjuncts_ using the combined child smap.
 */
protected void createDefaultSmap(Analyzer analyzer) {
  ExprSubstitutionMap combinedChildSmap = getCombinedChildSmap();
  outputSmap_ =
      ExprSubstitutionMap.compose(outputSmap_, combinedChildSmap, analyzer);
  conjuncts_ = Expr.substituteList(conjuncts_, outputSmap_, analyzer, false);
}