public JoinUtil.JoinResult setFromOutput(Output output) {

  aliasFilter =
      hashMap.getValueResult(output.getData(), 0, output.getLength(), hashMapResult);
  dummyRow = null;
  if (hashMapResult.hasRows()) {
    return JoinUtil.JoinResult.MATCH;
  } else {
    aliasFilter = (byte) 0xff;
    return JoinUtil.JoinResult.NOMATCH;
  }
}
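/*
 * Hypothetical usage sketch (not part of the original source): a caller that has
 * serialized a probe key into an Output can ask setFromOutput whether the hash map
 * holds rows for it.  The helper name probeKey is illustrative only; setFromOutput,
 * Output, and JoinUtil.JoinResult are the same members used above.
 */
private boolean probeKey(Output serializedKey) {
  // setFromOutput records the alias filter and positions hashMapResult as a side effect.
  return setFromOutput(serializedKey) == JoinUtil.JoinResult.MATCH;
}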
@Override
public void process(Object row, int tag) throws HiveException {

  try {
    VectorizedRowBatch batch = (VectorizedRowBatch) row;

    alias = (byte) tag;

    if (needCommonSetup) {
      // Our one time process method initialization.
      commonSetup(batch);

      /*
       * Initialize Multi-Key members for this specialized class.
       */

      keyVectorSerializeWrite = new VectorSerializeRow(
          new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
      keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);

      currentKeyOutput = new Output();
      saveKeyOutput = new Output();

      needCommonSetup = false;
    }

    if (needHashTableSetup) {
      // Setup our hash table specialization.  It will be the first time the process
      // method is called, or after a Hybrid Grace reload.

      /*
       * Get our Multi-Key hash map information for this specialized class.
       */

      hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable;

      needHashTableSetup = false;
    }

    batchCounter++;

    final int inputLogicalSize = batch.size;

    if (inputLogicalSize == 0) {
      if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Do the per-batch setup for an outer join.

    outerPerBatchSetup(batch);

    // For outer join, remember our input rows before ON expression filtering or before
    // hash table matching so we can generate results for all rows (matching and non matching)
    // later.
    boolean inputSelectedInUse = batch.selectedInUse;
    if (inputSelectedInUse) {
      // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) {
      //   throw new HiveException("batch.selected is not in sort order and unique");
      // }
      System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize);
    }

    // Filtering for outer join just removes rows available for hash table matching.
    boolean someRowsFilteredOut = false;
    if (bigTableFilterExpressions.length > 0) {
      // Evaluate the ON expression filters; they may shrink batch.selected.
      for (VectorExpression ve : bigTableFilterExpressions) {
        ve.evaluate(batch);
      }
      someRowsFilteredOut = (batch.size != inputLogicalSize);
      if (isLogDebugEnabled) {
        if (batch.selectedInUse) {
          if (inputSelectedInUse) {
            LOG.debug(CLASS_NAME +
                " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) +
                " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size));
          } else {
            LOG.debug(CLASS_NAME +
                " inputLogicalSize " + inputLogicalSize +
                " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size));
          }
        }
      }
    }

    // Perform any key expressions.  Results will go into scratch columns.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression ve : bigTableKeyExpressions) {
        ve.evaluate(batch);
      }
    }

    /*
     * Multi-Key specific declarations.
     */

    // None.

    /*
     * Multi-Key check for repeating.
     */

    // If all BigTable input columns to key expressions are isRepeating, then
    // calculate the key once and look it up once.
    // Also determine if any nulls are present since for a join that means no match.
    boolean allKeyInputColumnsRepeating;
    boolean someKeyInputColumnIsNull = false;  // Only valid if allKeyInputColumnsRepeating is true.
    if (bigTableKeyColumnMap.length == 0) {
      allKeyInputColumnsRepeating = false;
    } else {
      allKeyInputColumnsRepeating = true;
      for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
        ColumnVector colVector = batch.cols[bigTableKeyColumnMap[i]];
        if (!colVector.isRepeating) {
          allKeyInputColumnsRepeating = false;
          break;
        }
        if (!colVector.noNulls && colVector.isNull[0]) {
          someKeyInputColumnIsNull = true;
        }
      }
    }

    if (allKeyInputColumnsRepeating) {

      /*
       * Repeating.
       */

      // All key input columns are repeating.  Generate the key once.  Look it up once.
      // Since the key is repeated, we must use entry 0 regardless of selectedInUse.

      /*
       * Multi-Key specific repeated lookup.
       */

      JoinUtil.JoinResult joinResult;
      if (batch.size == 0) {
        // The whole repeated key batch was filtered out.
        joinResult = JoinUtil.JoinResult.NOMATCH;
      } else if (someKeyInputColumnIsNull) {
        // Any (repeated) null key column means no match for the whole batch.
        joinResult = JoinUtil.JoinResult.NOMATCH;
      } else {
        // All key input columns are repeating.  Generate the key once.  Look it up once.
        keyVectorSerializeWrite.setOutput(currentKeyOutput);
        keyVectorSerializeWrite.serializeWrite(batch, 0);
        byte[] keyBytes = currentKeyOutput.getData();
        int keyLength = currentKeyOutput.getLength();
        joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]);
      }

      /*
       * Common repeated join result processing.
       */

      if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter +
            " repeated joinResult " + joinResult.name());
      }
      finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut,
          inputSelectedInUse, inputLogicalSize);
    } else {

      /*
       * NOT Repeating.
       */

      if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      int[] selected = batch.selected;
      boolean selectedInUse = batch.selectedInUse;

      int hashMapResultCount = 0;
      int allMatchCount = 0;
      int equalKeySeriesCount = 0;
      int spillCount = 0;

      boolean atLeastOneNonMatch = someRowsFilteredOut;

      /*
       * Multi-Key specific variables.
       */

      Output temp;

      // We optimize performance by only looking up the first key in a series of equal keys.
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Logical loop over the rows in the batch since the batch may have selected in use.
      for (int logical = 0; logical < batch.size; logical++) {
        int batchIndex = (selectedInUse ? selected[logical] : logical);
        // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " +
        //     getOperatorId() + " candidate " + CLASS_NAME + " batch");

        /*
         * Multi-Key outer null detection.
         */

        // Generate binary sortable key for current row in vectorized row batch.
        keyVectorSerializeWrite.setOutput(currentKeyOutput);
        keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
        if (keyVectorSerializeWrite.getHasAnyNulls()) {

          // Make sure the NULL does not interfere with the current equal key series, if there
          // is one.  We do not set saveJoinResult.
          //
          //    Let a current MATCH equal key series keep going, or
          //    Let a current SPILL equal key series keep going, or
          //    Let a current NOMATCH keep not matching.

          atLeastOneNonMatch = true;

          // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL");
        } else {

          /*
           * Multi-Key outer get key.
           */

          // Generated earlier to get possible null(s).

          /*
           * Equal key series checking.
           */

          if (!haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
            // New key.

            if (haveSaveKey) {
              // Move on with our counts.
              switch (saveJoinResult) {
              case MATCH:
                hashMapResultCount++;
                equalKeySeriesCount++;
                break;
              case SPILL:
                hashMapResultCount++;
                break;
              case NOMATCH:
                break;
              }
            }

            // Regardless of our matching result, we keep that information to make multiple use
            // of it for a possible series of equal keys.
            haveSaveKey = true;

            /*
             * Multi-Key specific save key.
             */

            temp = saveKeyOutput;
            saveKeyOutput = currentKeyOutput;
            currentKeyOutput = temp;

            /*
             * Multi-Key specific lookup key.
             */

            byte[] keyBytes = saveKeyOutput.getData();
            int keyLength = saveKeyOutput.getLength();
            saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength,
                hashMapResults[hashMapResultCount]);

            /*
             * Common outer join result processing.
             */

            switch (saveJoinResult) {
            case MATCH:
              equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
              equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
              equalKeySeriesIsSingleValue[equalKeySeriesCount] =
                  hashMapResults[hashMapResultCount].isSingleRow();
              equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
              allMatchs[allMatchCount++] = batchIndex;
              // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME +
              //     " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] +
              //     " currentKey " + currentKey);
              break;

            case SPILL:
              spills[spillCount] = batchIndex;
              spillHashMapResultIndices[spillCount] = hashMapResultCount;
              spillCount++;
              break;

            case NOMATCH:
              atLeastOneNonMatch = true;
              // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME +
              //     " NOMATCH" + " currentKey " + currentKey);
              break;
            }
          } else {
            // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex +
            //     " Key Continues " + saveKey + " " + saveJoinResult.name());

            // Series of equal keys.

            switch (saveJoinResult) {
            case MATCH:
              equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
              allMatchs[allMatchCount++] = batchIndex;
              // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME +
              //     " MATCH duplicate");
              break;

            case SPILL:
              spills[spillCount] = batchIndex;
              spillHashMapResultIndices[spillCount] = hashMapResultCount;
              spillCount++;
              break;

            case NOMATCH:
              // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME +
              //     " NOMATCH duplicate");
              break;
            }
          }
          // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) {
          //   throw new HiveException("allMatchs is not in sort order and unique");
          // }
        }
      }

      if (haveSaveKey) {
        // Update our counts for the last key.
        switch (saveJoinResult) {
        case MATCH:
          hashMapResultCount++;
          equalKeySeriesCount++;
          break;
        case SPILL:
          hashMapResultCount++;
          break;
        case NOMATCH:
          break;
        }
      }

      if (isLogDebugEnabled) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter +
            " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) +
            " equalKeySeriesHashMapResultIndices " +
                intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) +
            " equalKeySeriesAllMatchIndices " +
                intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) +
            " equalKeySeriesIsSingleValue " +
                Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) +
            " equalKeySeriesDuplicateCounts " +
                Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) +
            " atLeastOneNonMatch " + atLeastOneNonMatch +
            " inputSelectedInUse " + inputSelectedInUse +
            " inputLogicalSize " + inputLogicalSize +
            " spills " + intArrayToRangesString(spills, spillCount) +
            " spillHashMapResultIndices " +
                intArrayToRangesString(spillHashMapResultIndices, spillCount) +
            " hashMapResults " +
                Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
      }

      // We will generate results for all matching and non-matching rows.
      finishOuter(batch, allMatchCount, equalKeySeriesCount, atLeastOneNonMatch,
          inputSelectedInUse, inputLogicalSize, spillCount, hashMapResultCount);
    }

    if (batch.size > 0) {
      // Forward any remaining selected rows.
      forwardBigTableBatch(batch);
    }
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (Exception e) {
    throw new HiveException(e);
  }
}
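/*
 * Illustrative sketch (not part of the original operator): the equal-key-series
 * optimization used in process(), in isolation.  When consecutive rows carry equal
 * keys, only the first row in the series pays for a hash map lookup; later rows in
 * the series reuse the saved join result.  The method name countSeriesLookups and
 * the Predicate-based stand-in for the hash map are assumptions for this sketch.
 */
private static int countSeriesLookups(byte[][] keys,
    java.util.function.Predicate<byte[]> hashMapLookup) {
  int lookups = 0;
  byte[] saveKey = null;
  for (byte[] key : keys) {
    if (saveKey == null || !java.util.Arrays.equals(saveKey, key)) {
      // New equal-key series: do the one lookup that all duplicates will share.
      saveKey = key;
      hashMapLookup.test(key);
      lookups++;
    }
    // Rows whose key equals saveKey fall through and reuse the saved result.
  }
  return lookups;
}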
private void testBinarySortableFast(
    SerdeRandomRowSource source, Object[][] rows,
    boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker,
    AbstractSerDe serde, StructObjectInspector rowOI,
    AbstractSerDe serde_fewer, StructObjectInspector writeRowOI,
    boolean ascending, PrimitiveTypeInfo[] primitiveTypeInfos,
    boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {

  int rowCount = rows.length;
  int columnCount = primitiveTypeInfos.length;

  boolean[] columnsToInclude = null;
  if (useIncludeColumns) {
    columnsToInclude = new boolean[columnCount];
    for (int i = 0; i < columnCount; i++) {
      columnsToInclude[i] = r.nextBoolean();
    }
  }

  int writeColumnCount = columnCount;
  if (doWriteFewerColumns) {
    writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
  }

  BinarySortableSerializeWrite binarySortableSerializeWrite =
      new BinarySortableSerializeWrite(
          columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker);

  // Try to serialize.

  // One Writable per row.
  BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];

  int[][] perFieldWriteLengthsArray = new int[rowCount][];
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    Output output = new Output();
    binarySortableSerializeWrite.set(output);

    int[] perFieldWriteLengths = new int[columnCount];
    for (int index = 0; index < writeColumnCount; index++) {
      Writable writable = (Writable) row[index];
      VerifyFast.serializeWrite(binarySortableSerializeWrite, primitiveTypeInfos[index], writable);
      perFieldWriteLengths[index] = output.getLength();
    }
    perFieldWriteLengthsArray[i] = perFieldWriteLengths;

    BytesWritable bytesWritable = new BytesWritable();
    bytesWritable.set(output.getData(), 0, output.getLength());
    serializeWriteBytes[i] = bytesWritable;
    if (i > 0) {
      int compareResult = serializeWriteBytes[i - 1].compareTo(serializeWriteBytes[i]);
      if ((compareResult < 0 && !ascending) || (compareResult > 0 && ascending)) {
        System.out.println("Test failed in " + (ascending ? "ascending" : "descending") +
            " order with " + (i - 1) + " and " + i);
        System.out.println("serialized data [" + (i - 1) + "] = " +
            TestBinarySortableSerDe.hexString(serializeWriteBytes[i - 1]));
        System.out.println("serialized data [" + i + "] = " +
            TestBinarySortableSerDe.hexString(serializeWriteBytes[i]));
        fail("Sort order of serialized " + (i - 1) + " and " + i + " are reversed!");
      }
    }
  }

  // Try to deserialize using DeserializeRead our Writable row objects created by SerializeWrite.
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    BinarySortableDeserializeRead binarySortableDeserializeRead =
        new BinarySortableDeserializeRead(
            primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);

    BytesWritable bytesWritable = serializeWriteBytes[i];
    binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());

    for (int index = 0; index < columnCount; index++) {
      if (useIncludeColumns && !columnsToInclude[index]) {
        binarySortableDeserializeRead.skipNextField();
      } else if (index >= writeColumnCount) {
        // Should come back as a null.
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], null);
      } else {
        Writable writable = (Writable) row[index];
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], writable);
      }
    }
    if (writeColumnCount == columnCount) {
      TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached());
    }

    /*
     * Clip off one byte and expect to get an EOFException on the last written field.
     */
    BinarySortableDeserializeRead binarySortableDeserializeRead2 =
        new BinarySortableDeserializeRead(
            primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);

    binarySortableDeserializeRead2.set(
        bytesWritable.getBytes(), 0, bytesWritable.getLength() - 1);  // One fewer byte.

    for (int index = 0; index < writeColumnCount; index++) {
      Writable writable = (Writable) row[index];
      if (index == writeColumnCount - 1) {
        boolean threw = false;
        try {
          VerifyFast.verifyDeserializeRead(
              binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
        } catch (EOFException e) {
          // debugDetailedReadPositionString =
          //     binarySortableDeserializeRead2.getDetailedReadPositionString();
          // debugStackTrace = e.getStackTrace();
          threw = true;
        }
        TestCase.assertTrue(threw);
      } else {
        if (useIncludeColumns && !columnsToInclude[index]) {
          binarySortableDeserializeRead2.skipNextField();
        } else {
          VerifyFast.verifyDeserializeRead(
              binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
        }
      }
    }
  }

  // Try to deserialize using the SerDe class our Writable row objects created by SerializeWrite.
  for (int i = 0; i < rowCount; i++) {
    BytesWritable bytesWritable = serializeWriteBytes[i];

    // Note that the regular SerDe doesn't tolerate fewer columns.
    List<Object> deserializedRow;
    if (doWriteFewerColumns) {
      deserializedRow = (List<Object>) serde_fewer.deserialize(bytesWritable);
    } else {
      deserializedRow = (List<Object>) serde.deserialize(bytesWritable);
    }

    Object[] row = rows[i];
    for (int index = 0; index < writeColumnCount; index++) {
      Object expected = row[index];
      Object object = deserializedRow.get(index);
      if (expected == null || object == null) {
        if (expected != null || object != null) {
          fail("SerDe deserialized NULL column mismatch");
        }
      } else {
        if (!object.equals(expected)) {
          fail("SerDe deserialized value does not match (expected " +
              expected.getClass().getName() + " " + expected.toString() +
              ", actual " + object.getClass().getName() + " " + object.toString() + ")");
        }
      }
    }
  }

  // One Writable per row.
  BytesWritable[] serdeBytes = new BytesWritable[rowCount];

  // Serialize using the SerDe, then below deserialize using DeserializeRead.
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];

    // Since the SerDe reuses memory, we will need to make a copy.
    BytesWritable serialized;
    if (doWriteFewerColumns) {
      serialized = (BytesWritable) serde_fewer.serialize(row, rowOI);
    } else {
      serialized = (BytesWritable) serde.serialize(row, rowOI);
    }
    BytesWritable bytesWritable = new BytesWritable();
    bytesWritable.set(serialized);
    byte[] serDeOutput =
        Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength());

    byte[] serializeWriteExpected = Arrays.copyOfRange(
        serializeWriteBytes[i].getBytes(), 0, serializeWriteBytes[i].getLength());

    if (!Arrays.equals(serDeOutput, serializeWriteExpected)) {
      int mismatchPos = -1;
      if (serDeOutput.length != serializeWriteExpected.length) {
        for (int b = 0; b < Math.min(serDeOutput.length, serializeWriteExpected.length); b++) {
          if (serDeOutput[b] != serializeWriteExpected[b]) {
            mismatchPos = b;
            break;
          }
        }
        fail("Different byte array lengths: serDeOutput.length " + serDeOutput.length +
            ", serializeWriteExpected.length " + serializeWriteExpected.length +
            " mismatchPos " + mismatchPos +
            " perFieldWriteLengths " + Arrays.toString(perFieldWriteLengthsArray[i]));
      }

      List<Integer> differentPositions = new ArrayList<Integer>();
      for (int b = 0; b < serDeOutput.length; b++) {
        if (serDeOutput[b] != serializeWriteExpected[b]) {
          differentPositions.add(b);
        }
      }
      if (differentPositions.size() > 0) {
        List<String> serializeWriteExpectedFields = new ArrayList<String>();
        List<String> serDeFields = new ArrayList<String>();
        int f = 0;
        int lastBegin = 0;
        for (int b = 0; b < serDeOutput.length; b++) {
          int writeLength = perFieldWriteLengthsArray[i][f];
          if (b + 1 == writeLength) {
            serializeWriteExpectedFields.add(
                displayBytes(serializeWriteExpected, lastBegin, writeLength - lastBegin));
            serDeFields.add(displayBytes(serDeOutput, lastBegin, writeLength - lastBegin));
            f++;
            lastBegin = b + 1;
          }
        }
        fail("SerializeWrite and SerDe serialization does not match at positions " +
            differentPositions.toString() +
            "\n(SerializeWrite: " + serializeWriteExpectedFields.toString() +
            "\nSerDe: " + serDeFields.toString() +
            "\nperFieldWriteLengths " + Arrays.toString(perFieldWriteLengthsArray[i]) +
            "\nprimitiveTypeInfos " + Arrays.toString(primitiveTypeInfos) +
            "\nrow " + Arrays.toString(row));
      }
    }
    serdeBytes[i] = bytesWritable;
  }

  // Try to deserialize using DeserializeRead our Writable row objects created by the SerDe.
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    BinarySortableDeserializeRead binarySortableDeserializeRead =
        new BinarySortableDeserializeRead(
            primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);

    BytesWritable bytesWritable = serdeBytes[i];
    binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());

    for (int index = 0; index < columnCount; index++) {
      if (useIncludeColumns && !columnsToInclude[index]) {
        binarySortableDeserializeRead.skipNextField();
      } else if (index >= writeColumnCount) {
        // Should come back as a null.
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], null);
      } else {
        Writable writable = (Writable) row[index];
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], writable);
      }
    }
    if (writeColumnCount == columnCount) {
      TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached());
    }
  }
}
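/*
 * A minimal order-preservation sketch distilled from the test above: serialize two
 * single-column rows and check that unsigned byte comparison of the serialized forms
 * agrees with value order.  The helper name checkOrderPreserved is hypothetical and
 * it assumes the passed serializeWrite was configured for ascending order; every call
 * it makes (set, VerifyFast.serializeWrite, BytesWritable.compareTo) already appears
 * in the test above.
 */
private static void checkOrderPreserved(
    BinarySortableSerializeWrite serializeWrite,
    PrimitiveTypeInfo typeInfo, Writable smaller, Writable larger) throws Exception {

  // Serialize the smaller value into its own buffer.
  Output smallOut = new Output();
  serializeWrite.set(smallOut);
  VerifyFast.serializeWrite(serializeWrite, typeInfo, smaller);
  BytesWritable smallBytes = new BytesWritable();
  smallBytes.set(smallOut.getData(), 0, smallOut.getLength());

  // Serialize the larger value into its own buffer.
  Output largeOut = new Output();
  serializeWrite.set(largeOut);
  VerifyFast.serializeWrite(serializeWrite, typeInfo, larger);
  BytesWritable largeBytes = new BytesWritable();
  largeBytes.set(largeOut.getData(), 0, largeOut.getLength());

  // BinarySortable's contract: byte order of the serialized keys matches value order.
  TestCase.assertTrue(smallBytes.compareTo(largeBytes) < 0);
}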