@Override public void write(NullWritable nullWritable, V v) throws IOException { // if the batch is full, write it out. if (batch.size == batch.getMaxSize()) { writer.addRowBatch(batch); batch.reset(); } // add the new row int row = batch.size++; // skip over the OrcKey or OrcValue if (v instanceof OrcKey) { v = (V) ((OrcKey) v).key; } else if (v instanceof OrcValue) { v = (V) ((OrcValue) v).value; } if (isTopStruct) { for (int f = 0; f < schema.getChildren().size(); ++f) { OrcMapredRecordWriter.setColumn( schema.getChildren().get(f), batch.cols[f], row, ((OrcStruct) v).getFieldValue(f)); } } else { OrcMapredRecordWriter.setColumn(schema, batch.cols[0], row, v); } }
/**
 * Builds a two-column batch of {@code size} rows whose first column cycles through the
 * given input values; the second column is left zero-initialized.
 *
 * @param inputs values used (repeating) to fill the entire first column — assumed non-empty
 * @param size number of rows in the returned batch
 * @return a fully populated {@code VectorizedRowBatch}
 */
private VectorizedRowBatch getVectorizedRowBatch(int[] inputs, int size) {
  VectorizedRowBatch batch = new VectorizedRowBatch(2, size);
  LongColumnVector inputColumn = new LongColumnVector(size);
  // Wrap around the input array so every row gets a value.
  for (int row = 0; row < size; row++) {
    inputColumn.vector[row] = inputs[row % inputs.length];
  }
  batch.cols[0] = inputColumn;
  batch.cols[1] = new LongColumnVector(size);
  batch.size = size;
  return batch;
}
/**
 * Builds a two-column batch of {@code size} rows whose first column is filled with
 * pseudo-random ints from a seeded generator (deterministic per seed); the second
 * column is left zero-initialized.
 *
 * @param seed seed for the random generator, making the batch reproducible
 * @param size number of rows in the returned batch
 * @return a fully populated {@code VectorizedRowBatch}
 */
private VectorizedRowBatch getVectorizedRandomRowBatch(int seed, int size) {
  VectorizedRowBatch batch = new VectorizedRowBatch(2, size);
  LongColumnVector randomColumn = new LongColumnVector(size);
  Random rand = new Random(seed);
  for (int row = 0; row < size; row++) {
    randomColumn.vector[row] = rand.nextInt();
  }
  batch.cols[0] = randomColumn;
  batch.cols[1] = new LongColumnVector(size);
  batch.size = size;
  return batch;
}
/** * @param values * @return true if it is not done and can take more inputs */ private <E> boolean processVectors(Iterator<E> values, byte tag) throws HiveException { VectorizedRowBatch batch = batches[tag]; batch.reset(); /* deserialize key into columns */ VectorizedBatchUtil.addRowToBatchFrom(keyObject, keyStructInspector, 0, 0, batch, buffer); for (int i = 0; i < keysColumnOffset; i++) { VectorizedBatchUtil.setRepeatingColumn(batch, i); } int rowIdx = 0; try { while (values.hasNext()) { /* deserialize value into columns */ BytesWritable valueWritable = (BytesWritable) values.next(); Object valueObj = deserializeValue(valueWritable, tag); VectorizedBatchUtil.addRowToBatchFrom( valueObj, valueStructInspectors[tag], rowIdx, keysColumnOffset, batch, buffer); rowIdx++; if (rowIdx >= BATCH_SIZE) { VectorizedBatchUtil.setBatchSize(batch, rowIdx); reducer.process(batch, tag); rowIdx = 0; if (isLogInfoEnabled) { logMemoryInfo(); } } } if (rowIdx > 0) { VectorizedBatchUtil.setBatchSize(batch, rowIdx); reducer.process(batch, tag); } if (isLogInfoEnabled) { logMemoryInfo(); } } catch (Exception e) { String rowString = null; try { rowString = batch.toString(); } catch (Exception e2) { rowString = "[Error getting row data with exception " + StringUtils.stringifyException(e2) + " ]"; } throw new HiveException( "Error while processing vector batch (tag=" + tag + ") " + rowString, e); } return true; // give me more }
/**
 * Evaluates a logical OR of the child filter expressions over the batch.
 *
 * Each child is evaluated only over the rows NOT yet selected by an earlier child
 * (short-circuit semantics), and the surviving rows from every child are merged back,
 * in original row order, into the batch's selected vector.
 *
 * Note: mutates batch.selected / batch.size / batch.selectedInUse; relies on the
 * preallocated scratch arrays initialSelected, tmp, unselected, unselectedCopy and
 * difference (declared elsewhere in this class).
 */
@Override
public void evaluate(VectorizedRowBatch batch) {
  int n = batch.size;
  if (n <= 0) {
    return;
  }
  VectorExpression childExpr1 = this.childExpressions[0];
  boolean prevSelectInUse = batch.selectedInUse;

  // Save the original selected vector; if none was in use, materialize the identity
  // selection so every child can work through batch.selected uniformly.
  int[] sel = batch.selected;
  if (batch.selectedInUse) {
    System.arraycopy(sel, 0, initialSelected, 0, n);
  } else {
    for (int i = 0; i < n; i++) {
      initialSelected[i] = i;
      sel[i] = i;
    }
    batch.selectedInUse = true;
  }

  childExpr1.evaluate(batch);

  // Preserve the selected reference and size values generated
  // after the first child is evaluated.
  int sizeAfterFirstChild = batch.size;
  int[] selectedAfterFirstChild = batch.selected;

  // tmp[i] == 1 marks row i as accepted by some child so far; clear for the original
  // rows, then mark the first child's survivors.
  for (int j = 0; j < n; j++) {
    tmp[initialSelected[j]] = 0;
  }
  for (int j = 0; j < batch.size; j++) {
    tmp[selectedAfterFirstChild[j]] = 1;
  }

  // Collect (in original order) the rows the first child rejected.
  int unselectedSize = 0;
  for (int j = 0; j < n; j++) {
    int i = initialSelected[j];
    if (tmp[i] == 0) {
      unselected[unselectedSize++] = i;
    }
  }

  int newSize = sizeAfterFirstChild;

  // Point the batch at the still-unselected rows for the remaining children.
  batch.selected = unselected;
  batch.size = unselectedSize;
  if (unselectedSize > 0) {
    // Evaluate each subsequent child expression over unselected rows only.
    final int childrenCount = this.childExpressions.length;
    int childIndex = 1;
    while (true) {
      boolean isLastChild = (childIndex + 1 >= childrenCount);

      // When we have yet another child beyond the current one, save the unselected
      // rows before the child overwrites batch.selected in place.
      if (!isLastChild) {
        System.arraycopy(batch.selected, 0, unselectedCopy, 0, unselectedSize);
      }

      VectorExpression childExpr = this.childExpressions[childIndex];
      childExpr.evaluate(batch);

      // Merge the result of the last evaluate into the accepted-row marks.
      newSize += batch.size;
      for (int i = 0; i < batch.size; i++) {
        tmp[batch.selected[i]] = 1;
      }
      if (isLastChild) {
        break;
      }

      // Rows still unmatched = previously unselected minus this child's survivors.
      unselectedSize =
          subtract(unselectedCopy, unselectedSize, batch.selected, batch.size, difference);
      if (unselectedSize == 0) {
        break;
      }
      System.arraycopy(difference, 0, batch.selected, 0, unselectedSize);
      batch.size = unselectedSize;
      childIndex++;
    }
  }

  // Important: Restore the batch's selected array, rebuilding it in the original
  // row order from the accepted-row marks.
  batch.selected = selectedAfterFirstChild;
  int k = 0;
  for (int j = 0; j < n; j++) {
    int i = initialSelected[j];
    if (tmp[i] == 1) {
      batch.selected[k++] = i;
    }
  }
  batch.size = newSize;
  if (newSize == n) {
    // Filter didn't do anything: restore the caller's original selectedInUse flag.
    batch.selectedInUse = prevSelectInUse;
  }
}
/**
 * Processes one big-table batch for a vectorized outer map-join on a single String
 * key column: probes the hash map per row (optimizing runs of equal keys and a fully
 * repeating key column), then forwards the resulting selected rows.
 *
 * @param row the incoming {@code VectorizedRowBatch} (passed as Object per operator API)
 * @param tag input tag recorded into {@code alias}
 * @throws HiveException wrapping any failure during lookup or forwarding
 */
@Override
public void process(Object row, int tag) throws HiveException {
  try {
    VectorizedRowBatch batch = (VectorizedRowBatch) row;

    alias = (byte) tag;

    if (needCommonSetup) {
      // Our one time process method initialization.
      commonSetup(batch);

      // Initialize Single-Column String members for this specialized class.
      singleJoinColumn = bigTableKeyColumnMap[0];

      needCommonSetup = false;
    }

    if (needHashTableSetup) {
      // Setup our hash table specialization. It will be the first time the process
      // method is called, or after a Hybrid Grace reload.

      // Get our Single-Column String hash map information for this specialized class.
      hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable;

      needHashTableSetup = false;
    }

    batchCounter++;

    // Do the per-batch setup for an outer join.
    outerPerBatchSetup(batch);

    // For outer join, DO NOT apply filters yet. It is incorrect for outer join to
    // apply the filter before hash table matching.
    final int inputLogicalSize = batch.size;

    if (inputLogicalSize == 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Perform any key expressions. Results will go into scratch columns.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression ve : bigTableKeyExpressions) {
        ve.evaluate(batch);
      }
    }

    // We rebuild in-place the selected array with rows destined to be forwarded.
    int numSel = 0;

    // The one join column for this specialized class.
    BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn];
    byte[][] vector = joinColVector.vector;
    int[] start = joinColVector.start;
    int[] length = joinColVector.length;

    // Check the single key column for repeating.
    boolean allKeyInputColumnsRepeating = joinColVector.isRepeating;

    if (allKeyInputColumnsRepeating) {

      // All key input columns are repeating. Generate key once. Lookup once.
      // Since the key is repeated, we must use entry 0 regardless of selectedInUse.

      JoinUtil.JoinResult joinResult;
      if (!joinColVector.noNulls && joinColVector.isNull[0]) {
        // Null key is no match for whole batch.
        joinResult = JoinUtil.JoinResult.NOMATCH;
      } else {
        // Handle *repeated* join key, if found.
        byte[] keyBytes = vector[0];
        int keyStart = start[0];
        int keyLength = length[0];
        joinResult = hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[0]);
      }

      // Common repeated join result processing.
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
      }
      numSel = finishOuterRepeated(batch, joinResult, hashMapResults[0], scratch1);
    } else {

      // NOT repeating: probe per row, collapsing runs of equal keys into one lookup.

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      int selected[] = batch.selected;
      boolean selectedInUse = batch.selectedInUse;

      // For outer join we must apply the filter after match and cause some matches to become
      // non-matches, we do not track non-matches here. Instead we remember all non spilled rows
      // and compute non matches later in finishOuter.
      int hashMapResultCount = 0;
      int matchCount = 0;
      int nonSpillCount = 0;
      int spillCount = 0;

      // We optimize performance by only looking up the first key in a series of equal keys.
      int saveKeyBatchIndex = -1;
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Logical loop over the rows in the batch since the batch may have selected in use.
      for (int logical = 0; logical < inputLogicalSize; logical++) {
        int batchIndex = (selectedInUse ? selected[logical] : logical);

        // Single-Column String outer null detection.
        boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex];

        if (isNull) {

          // Have that the NULL does not interfere with the current equal key series, if there
          // is one. We do not set saveJoinResult.
          //
          // Let a current MATCH equal key series keep going, or
          // Let a current SPILL equal key series keep going, or
          // Let a current NOMATCH keep not matching.

          // Remember non-matches for Outer Join.
          nonSpills[nonSpillCount++] = batchIndex;
        } else {

          // Equal key series checking: a new series starts when there is no saved key or
          // the current key's bytes differ from the saved key's bytes.
          if (!haveSaveKey
              || StringExpr.compare(
                      vector[saveKeyBatchIndex],
                      start[saveKeyBatchIndex],
                      length[saveKeyBatchIndex],
                      vector[batchIndex],
                      start[batchIndex],
                      length[batchIndex])
                  != 0) {

            // New key.

            if (haveSaveKey) {
              // Move on with our count(s): a finished MATCH/SPILL series consumed one
              // hashMapResults slot.
              switch (saveJoinResult) {
                case MATCH:
                case SPILL:
                  hashMapResultCount++;
                  break;
                case NOMATCH:
                  break;
              }
            }

            // Regardless of our matching result, we keep that information to make multiple use
            // of it for a possible series of equal keys.
            haveSaveKey = true;

            saveKeyBatchIndex = batchIndex;

            // Look up this key once for the whole equal-key series.
            byte[] keyBytes = vector[batchIndex];
            int keyStart = start[batchIndex];
            int keyLength = length[batchIndex];
            saveJoinResult =
                hashMap.lookup(keyBytes, keyStart, keyLength, hashMapResults[hashMapResultCount]);
          } else {
            // Key continues the current equal key series; reuse saveJoinResult.
          }

          // Common outer join result processing.
          switch (saveJoinResult) {
            case MATCH:
              matchs[matchCount] = batchIndex;
              matchHashMapResultIndices[matchCount] = hashMapResultCount;
              matchCount++;
              nonSpills[nonSpillCount++] = batchIndex;
              break;

            case SPILL:
              spills[spillCount] = batchIndex;
              spillHashMapResultIndices[spillCount] = hashMapResultCount;
              spillCount++;
              break;

            case NOMATCH:
              nonSpills[nonSpillCount++] = batchIndex;
              break;
          }
        }
      }

      if (haveSaveKey) {
        // Account for last equal key sequence.
        switch (saveJoinResult) {
          case MATCH:
          case SPILL:
            hashMapResultCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            CLASS_NAME
                + " batch #"
                + batchCounter
                + " matchs "
                + intArrayToRangesString(matchs, matchCount)
                + " matchHashMapResultIndices "
                + intArrayToRangesString(matchHashMapResultIndices, matchCount)
                + " nonSpills "
                + intArrayToRangesString(nonSpills, nonSpillCount)
                + " spills "
                + intArrayToRangesString(spills, spillCount)
                + " spillHashMapResultIndices "
                + intArrayToRangesString(spillHashMapResultIndices, spillCount)
                + " hashMapResults "
                + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
      }

      // We will generate results for all matching and non-matching rows.
      // Note that scratch1 is undefined at this point -- it's preallocated storage.
      numSel =
          finishOuter(
              batch,
              matchs,
              matchHashMapResultIndices,
              matchCount,
              nonSpills,
              nonSpillCount,
              spills,
              spillHashMapResultIndices,
              spillCount,
              hashMapResults,
              hashMapResultCount,
              scratch1);
    }

    batch.selectedInUse = true;
    batch.size = numSel;

    if (batch.size > 0) {
      // Forward any remaining selected rows.
      forwardBigTableBatch(batch);
    }
    // NOTE(review): the IOException catch below is redundant — the Exception catch
    // wraps identically — but it is kept byte-identical here.
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (Exception e) {
    throw new HiveException(e);
  }
}