예제 #1
0
    /**
     * Looks up the bytes held in {@code output} in the hash map and reports whether any rows
     * were found.
     *
     * <p>Side effects: {@code aliasFilter} receives the value returned by
     * {@code getValueResult} (and is reset to {@code 0xff} when nothing is found), and
     * {@code dummyRow} is cleared.
     *
     * @param output buffer whose first {@code getLength()} bytes are used for the lookup
     *     (presumably the serialized join key -- confirm against callers)
     * @return {@code MATCH} when {@code hashMapResult} has rows, otherwise {@code NOMATCH}
     */
    public JoinUtil.JoinResult setFromOutput(Output output) {
      final byte[] lookupBytes = output.getData();
      final int lookupLength = output.getLength();

      aliasFilter = hashMap.getValueResult(lookupBytes, 0, lookupLength, hashMapResult);
      dummyRow = null;

      if (!hashMapResult.hasRows()) {
        // Nothing found: fall back to the all-bits-set filter.
        aliasFilter = (byte) 0xff;
        return JoinUtil.JoinResult.NOMATCH;
      }
      return JoinUtil.JoinResult.MATCH;
    }
  /**
   * Processes one big-table batch for this Multi-Key outer map join.
   *
   * <p>Steps, in order: one-time common and hash-table setup; per-batch outer setup; saving the
   * incoming selection; applying big-table filter expressions; evaluating key expressions; then
   * either a single lookup (when every key column is repeating) or a per-row lookup in which a
   * run of equal adjacent keys reuses the previous lookup result. Results are handed to
   * {@code finishOuterRepeated} or {@code finishOuter}, and any rows still left in the batch are
   * forwarded.
   *
   * @param row the input batch; cast to {@link VectorizedRowBatch}
   * @param tag stored (narrowed to a byte) in {@code alias}
   * @throws HiveException wrapping any exception raised while processing the batch
   */
  @Override
  public void process(Object row, int tag) throws HiveException {

    try {
      VectorizedRowBatch batch = (VectorizedRowBatch) row;

      alias = (byte) tag;

      if (needCommonSetup) {
        // Our one time process method initialization.
        commonSetup(batch);

        /*
         * Initialize Multi-Key members for this specialized class.
         */

        keyVectorSerializeWrite =
            new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
        keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);

        // Two buffers so the current row's key can be compared against the saved key and then
        // swapped in without copying (see the equal key series logic below).
        currentKeyOutput = new Output();
        saveKeyOutput = new Output();

        needCommonSetup = false;
      }

      if (needHashTableSetup) {
        // Setup our hash table specialization.  It will be the first time the process
        // method is called, or after a Hybrid Grace reload.

        /*
         * Get our Multi-Key hash map information for this specialized class.
         */

        hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable;

        needHashTableSetup = false;
      }

      batchCounter++;

      final int inputLogicalSize = batch.size;

      if (inputLogicalSize == 0) {
        if (isLogDebugEnabled) {
          LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
        }
        return;
      }

      // Do the per-batch setup for an outer join.

      outerPerBatchSetup(batch);

      // For outer join, remember our input rows before ON expression filtering or before
      // hash table matching so we can generate results for all rows (matching and non matching)
      // later.
      boolean inputSelectedInUse = batch.selectedInUse;
      if (inputSelectedInUse) {
        // if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) {
        //   throw new HiveException("batch.selected is not in sort order and unique");
        // }
        System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize);
      }

      // Filtering for outer join just removes rows available for hash table matching.
      boolean someRowsFilteredOut = false;
      if (bigTableFilterExpressions.length > 0) {
        // The input selection was saved above, so the filter expressions are free to change
        // batch.size (and batch.selected); a size change is detected just below.
        for (VectorExpression ve : bigTableFilterExpressions) {
          ve.evaluate(batch);
        }
        someRowsFilteredOut = (batch.size != inputLogicalSize);
        if (isLogDebugEnabled) {
          if (batch.selectedInUse) {
            if (inputSelectedInUse) {
              LOG.debug(
                  CLASS_NAME
                      + " inputSelected "
                      + intArrayToRangesString(inputSelected, inputLogicalSize)
                      + " filtered batch.selected "
                      + intArrayToRangesString(batch.selected, batch.size));
            } else {
              LOG.debug(
                  CLASS_NAME
                      + " inputLogicalSize "
                      + inputLogicalSize
                      + " filtered batch.selected "
                      + intArrayToRangesString(batch.selected, batch.size));
            }
          }
        }
      }

      // Perform any key expressions.  Results will go into scratch columns.
      if (bigTableKeyExpressions != null) {
        for (VectorExpression ve : bigTableKeyExpressions) {
          ve.evaluate(batch);
        }
      }

      /*
       * Multi-Key specific declarations.
       */

      // None.

      /*
       * Multi-Key check for repeating.
       */

      // If all BigTable input columns to key expressions are isRepeating, then
      // calculate key once; lookup once.
      // Also determine if any nulls are present since for a join that means no match.
      boolean allKeyInputColumnsRepeating;
      boolean someKeyInputColumnIsNull =
          false; // Only valid if allKeyInputColumnsRepeating is true.
      if (bigTableKeyColumnMap.length == 0) {
        allKeyInputColumnsRepeating = false;
      } else {
        allKeyInputColumnsRepeating = true;
        for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
          ColumnVector colVector = batch.cols[bigTableKeyColumnMap[i]];
          if (!colVector.isRepeating) {
            allKeyInputColumnsRepeating = false;
            break;
          }
          // For a repeating column only entry 0 is inspected.
          if (!colVector.noNulls && colVector.isNull[0]) {
            someKeyInputColumnIsNull = true;
          }
        }
      }

      if (allKeyInputColumnsRepeating) {

        /*
         * Repeating.
         */

        // All key input columns are repeating.  Generate key once.  Lookup once.
        // Since the key is repeated, we must use entry 0 regardless of selectedInUse.

        /*
         * Multi-Key specific repeated lookup.
         */

        JoinUtil.JoinResult joinResult;
        if (batch.size == 0) {
          // Whole repeated key batch was filtered out.
          joinResult = JoinUtil.JoinResult.NOMATCH;
        } else if (someKeyInputColumnIsNull) {
          // Any (repeated) null key column is no match for whole batch.
          joinResult = JoinUtil.JoinResult.NOMATCH;
        } else {

          // All key input columns are repeating.  Generate key once.  Lookup once.
          keyVectorSerializeWrite.setOutput(currentKeyOutput);
          keyVectorSerializeWrite.serializeWrite(batch, 0);
          byte[] keyBytes = currentKeyOutput.getData();
          int keyLength = currentKeyOutput.getLength();
          joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]);
        }

        /*
         * Common repeated join result processing.
         */

        if (isLogDebugEnabled) {
          LOG.debug(
              CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
        }
        finishOuterRepeated(
            batch,
            joinResult,
            hashMapResults[0],
            someRowsFilteredOut,
            inputSelectedInUse,
            inputLogicalSize);
      } else {

        /*
         * NOT Repeating.
         */

        if (isLogDebugEnabled) {
          LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
        }

        int selected[] = batch.selected;
        boolean selectedInUse = batch.selectedInUse;

        // Counters for the parallel result arrays filled in by the row loop below.
        int hashMapResultCount = 0;
        int allMatchCount = 0;
        int equalKeySeriesCount = 0;
        int spillCount = 0;

        // Rows removed by the filter above already count as non-matches.
        boolean atLeastOneNonMatch = someRowsFilteredOut;

        /*
         * Multi-Key specific variables.
         */

        Output temp;

        // We optimize performance by only looking up the first key in a series of equal keys.
        boolean haveSaveKey = false;
        JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

        // Logical loop over the rows in the batch since the batch may have selected in use.
        for (int logical = 0; logical < batch.size; logical++) {
          int batchIndex = (selectedInUse ? selected[logical] : logical);

          // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " +
          // getOperatorId() + " candidate " + CLASS_NAME + " batch");

          /*
           * Multi-Key outer null detection.
           */

          // Generate binary sortable key for current row in vectorized row batch.
          keyVectorSerializeWrite.setOutput(currentKeyOutput);
          keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
          if (keyVectorSerializeWrite.getHasAnyNulls()) {

            // Make sure the NULL does not interfere with the current equal key series, if there
            // is one. We do not set saveJoinResult.
            //
            //    Let a current MATCH equal key series keep going, or
            //    Let a current SPILL equal key series keep going, or
            //    Let a current NOMATCH keep not matching.

            atLeastOneNonMatch = true;

            // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + "
            // NULL");
          } else {

            /*
             * Multi-Key outer get key.
             */

            // Generated earlier to get possible null(s).

            /*
             * Equal key series checking.
             */

            if (!haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {

              // New key.

              if (haveSaveKey) {
                // Move on with our counts.
                // A NOMATCH does not advance hashMapResultCount, so its hashMapResults slot is
                // reused by the next lookup.
                switch (saveJoinResult) {
                  case MATCH:
                    hashMapResultCount++;
                    equalKeySeriesCount++;
                    break;
                  case SPILL:
                    hashMapResultCount++;
                    break;
                  case NOMATCH:
                    break;
                }
              }

              // Regardless of our matching result, we keep that information to make multiple use
              // of it for a possible series of equal keys.
              haveSaveKey = true;

              /*
               * Multi-Key specific save key.
               */

              // Swap the two buffers: the just-serialized key becomes the saved key, and the old
              // saved buffer becomes the target for the next row's serialization.
              temp = saveKeyOutput;
              saveKeyOutput = currentKeyOutput;
              currentKeyOutput = temp;

              /*
               * Multi-Key specific lookup key.
               */

              byte[] keyBytes = saveKeyOutput.getData();
              int keyLength = saveKeyOutput.getLength();
              saveJoinResult =
                  hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]);

              /*
               * Common outer join result processing.
               */

              switch (saveJoinResult) {
                case MATCH:
                  equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
                  equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
                  equalKeySeriesIsSingleValue[equalKeySeriesCount] =
                      hashMapResults[hashMapResultCount].isSingleRow();
                  equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
                  allMatchs[allMatchCount++] = batchIndex;
                  // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH
                  // isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + "
                  // currentKey " + currentKey);
                  break;

                case SPILL:
                  spills[spillCount] = batchIndex;
                  spillHashMapResultIndices[spillCount] = hashMapResultCount;
                  spillCount++;
                  break;

                case NOMATCH:
                  atLeastOneNonMatch = true;
                  // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + "
                  // NOMATCH" + " currentKey " + currentKey);
                  break;
              }
            } else {
              // LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " Key
              // Continues " + saveKey + " " + saveJoinResult.name());

              // Series of equal keys.

              switch (saveJoinResult) {
                case MATCH:
                  equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
                  allMatchs[allMatchCount++] = batchIndex;
                  // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH
                  // duplicate");
                  break;

                case SPILL:
                  spills[spillCount] = batchIndex;
                  spillHashMapResultIndices[spillCount] = hashMapResultCount;
                  spillCount++;
                  break;

                case NOMATCH:
                  // atLeastOneNonMatch was already set when this series' first key was looked up.
                  // VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + "
                  // NOMATCH duplicate");
                  break;
              }
            }
            // if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) {
            //   throw new HiveException("allMatchs is not in sort order and unique");
            // }
          }
        }

        if (haveSaveKey) {
          // Update our counts for the last key.
          switch (saveJoinResult) {
            case MATCH:
              hashMapResultCount++;
              equalKeySeriesCount++;
              break;
            case SPILL:
              hashMapResultCount++;
              break;
            case NOMATCH:
              break;
          }
        }

        if (isLogDebugEnabled) {
          LOG.debug(
              CLASS_NAME
                  + " batch #"
                  + batchCounter
                  + " allMatchs "
                  + intArrayToRangesString(allMatchs, allMatchCount)
                  + " equalKeySeriesHashMapResultIndices "
                  + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount)
                  + " equalKeySeriesAllMatchIndices "
                  + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount)
                  + " equalKeySeriesIsSingleValue "
                  + Arrays.toString(
                      Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount))
                  + " equalKeySeriesDuplicateCounts "
                  + Arrays.toString(
                      Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount))
                  + " atLeastOneNonMatch "
                  + atLeastOneNonMatch
                  + " inputSelectedInUse "
                  + inputSelectedInUse
                  + " inputLogicalSize "
                  + inputLogicalSize
                  + " spills "
                  + intArrayToRangesString(spills, spillCount)
                  + " spillHashMapResultIndices "
                  + intArrayToRangesString(spillHashMapResultIndices, spillCount)
                  + " hashMapResults "
                  + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
        }

        // We will generate results for all matching and non-matching rows.
        finishOuter(
            batch,
            allMatchCount,
            equalKeySeriesCount,
            atLeastOneNonMatch,
            inputSelectedInUse,
            inputLogicalSize,
            spillCount,
            hashMapResultCount);
      }

      if (batch.size > 0) {
        // Forward any remaining selected rows.
        forwardBigTableBatch(batch);
      }

    } catch (IOException e) {
      // Both handlers wrap the failure in a HiveException with the original as its cause.
      throw new HiveException(e);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }
예제 #3
0
  /**
   * Round-trips {@code rows} through the fast BinarySortable writer/reader and the regular SerDe
   * and checks that the two implementations agree.
   *
   * <p>Phases, in order:
   *
   * <ol>
   *   <li>Serialize every row with {@link BinarySortableSerializeWrite} and check that adjacent
   *       serialized rows compare consistently with {@code ascending}.
   *   <li>Deserialize those bytes with {@link BinarySortableDeserializeRead}, then again with the
   *       last byte clipped off, expecting an {@link EOFException} on the last written field.
   *   <li>Deserialize those bytes with the regular SerDe and compare field by field.
   *   <li>Serialize every row with the regular SerDe and require byte-for-byte equality with the
   *       SerializeWrite output.
   *   <li>Deserialize the SerDe bytes with {@link BinarySortableDeserializeRead}.
   * </ol>
   *
   * @param source not read by this method
   * @param rows the rows to round-trip; each element is cast to {@code Writable}
   * @param columnSortOrderIsDesc per-column sort direction passed to the writer and reader
   * @param columnNullMarker per-column null marker bytes passed to the writer
   * @param columnNotNullMarker per-column not-null marker bytes passed to the writer
   * @param serde SerDe used when all columns are written
   * @param rowOI object inspector handed to {@code serialize} for both SerDes
   * @param serde_fewer SerDe used when {@code doWriteFewerColumns} is set
   * @param writeRowOI supplies the written column count when {@code doWriteFewerColumns} is set
   * @param ascending expected ordering of consecutive serialized rows
   * @param primitiveTypeInfos type of each column; its length is the full column count
   * @param useIncludeColumns when set, a random subset of columns is skipped while reading
   * @param doWriteFewerColumns when set, only the leading columns of each row are written
   * @param r random source for the include-columns choice
   * @throws Throwable if any verification step fails
   */
  private void testBinarySortableFast(
      SerdeRandomRowSource source,
      Object[][] rows,
      boolean[] columnSortOrderIsDesc,
      byte[] columnNullMarker,
      byte[] columnNotNullMarker,
      AbstractSerDe serde,
      StructObjectInspector rowOI,
      AbstractSerDe serde_fewer,
      StructObjectInspector writeRowOI,
      boolean ascending,
      PrimitiveTypeInfo[] primitiveTypeInfos,
      boolean useIncludeColumns,
      boolean doWriteFewerColumns,
      Random r)
      throws Throwable {

    int rowCount = rows.length;
    int columnCount = primitiveTypeInfos.length;

    // Stays null unless useIncludeColumns is set; the read helper relies on that.
    boolean[] columnsToInclude = null;
    if (useIncludeColumns) {
      columnsToInclude = new boolean[columnCount];
      for (int i = 0; i < columnCount; i++) {
        columnsToInclude[i] = r.nextBoolean();
      }
    }

    int writeColumnCount = columnCount;
    if (doWriteFewerColumns) {
      writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
    }

    BinarySortableSerializeWrite binarySortableSerializeWrite =
        new BinarySortableSerializeWrite(
            columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker);

    // Try to serialize

    // One Writable per row.
    BytesWritable serializeWriteBytes[] = new BytesWritable[rowCount];

    // Cumulative output length after each field, kept for failure diagnostics.
    int[][] perFieldWriteLengthsArray = new int[rowCount][];
    for (int i = 0; i < rowCount; i++) {
      Object[] row = rows[i];
      Output output = new Output();
      binarySortableSerializeWrite.set(output);

      int[] perFieldWriteLengths = new int[columnCount];
      for (int index = 0; index < writeColumnCount; index++) {

        Writable writable = (Writable) row[index];

        VerifyFast.serializeWrite(
            binarySortableSerializeWrite, primitiveTypeInfos[index], writable);
        perFieldWriteLengths[index] = output.getLength();
      }
      perFieldWriteLengthsArray[i] = perFieldWriteLengths;

      BytesWritable bytesWritable = new BytesWritable();
      bytesWritable.set(output.getData(), 0, output.getLength());
      serializeWriteBytes[i] = bytesWritable;
      if (i > 0) {
        int compareResult = serializeWriteBytes[i - 1].compareTo(serializeWriteBytes[i]);
        if ((compareResult < 0 && !ascending) || (compareResult > 0 && ascending)) {
          System.out.println(
              "Test failed in "
                  + (ascending ? "ascending" : "descending")
                  + " order with "
                  + (i - 1)
                  + " and "
                  + i);
          System.out.println(
              "serialized data ["
                  + (i - 1)
                  + "] = "
                  + TestBinarySortableSerDe.hexString(serializeWriteBytes[i - 1]));
          System.out.println(
              "serialized data ["
                  + i
                  + "] = "
                  + TestBinarySortableSerDe.hexString(serializeWriteBytes[i]));
          fail("Sort order of serialized " + (i - 1) + " and " + i + " are reversed!");
        }
      }
    }

    // Try to deserialize using DeserializeRead our Writable row objects created by SerializeWrite.
    for (int i = 0; i < rowCount; i++) {
      Object[] row = rows[i];
      BytesWritable bytesWritable = serializeWriteBytes[i];

      verifyDeserializeReadOfRow(
          row,
          bytesWritable,
          primitiveTypeInfos,
          columnSortOrderIsDesc,
          columnsToInclude,
          writeColumnCount);

      /*
       * Clip off one byte and expect to get an EOFException on the last written field.
       */
      BinarySortableDeserializeRead binarySortableDeserializeRead2 =
          new BinarySortableDeserializeRead(
              primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);

      binarySortableDeserializeRead2.set(
          bytesWritable.getBytes(), 0, bytesWritable.getLength() - 1); // One fewer byte.

      for (int index = 0; index < writeColumnCount; index++) {
        Writable writable = (Writable) row[index];
        if (index == writeColumnCount - 1) {
          // The last written field is always read (never skipped) so the truncation is hit.
          boolean threw = false;
          try {
            VerifyFast.verifyDeserializeRead(
                binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
          } catch (EOFException e) {
            threw = true;
          }
          TestCase.assertTrue(threw);
        } else {
          if (useIncludeColumns && !columnsToInclude[index]) {
            binarySortableDeserializeRead2.skipNextField();
          } else {
            VerifyFast.verifyDeserializeRead(
                binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
          }
        }
      }
    }

    // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite.
    for (int i = 0; i < rowCount; i++) {
      BytesWritable bytesWritable = serializeWriteBytes[i];

      // Note that regular SerDe doesn't tolerate fewer columns.
      List<Object> deserializedRow;
      if (doWriteFewerColumns) {
        deserializedRow = (List<Object>) serde_fewer.deserialize(bytesWritable);
      } else {
        deserializedRow = (List<Object>) serde.deserialize(bytesWritable);
      }

      Object[] row = rows[i];
      for (int index = 0; index < writeColumnCount; index++) {
        Object expected = row[index];
        Object object = deserializedRow.get(index);
        if (expected == null || object == null) {
          if (expected != null || object != null) {
            fail("SerDe deserialized NULL column mismatch");
          }
        } else {
          if (!object.equals(expected)) {
            fail(
                "SerDe deserialized value does not match (expected "
                    + expected.getClass().getName()
                    + " "
                    + expected.toString()
                    + ", actual "
                    + object.getClass().getName()
                    + " "
                    + object.toString()
                    + ")");
          }
        }
      }
    }

    // One Writable per row.
    BytesWritable serdeBytes[] = new BytesWritable[rowCount];

    // Serialize using the SerDe, then below deserialize using DeserializeRead.
    for (int i = 0; i < rowCount; i++) {
      Object[] row = rows[i];

      // Since SerDe reuses memory, we will need to make a copy.
      BytesWritable serialized;
      if (doWriteFewerColumns) {
        serialized = (BytesWritable) serde_fewer.serialize(row, rowOI);
      } else {
        serialized = (BytesWritable) serde.serialize(row, rowOI);
      }
      BytesWritable bytesWritable = new BytesWritable();
      bytesWritable.set(serialized);
      byte[] serDeOutput =
          Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength());

      byte[] serializeWriteExpected =
          Arrays.copyOfRange(
              serializeWriteBytes[i].getBytes(), 0, serializeWriteBytes[i].getLength());
      if (!Arrays.equals(serDeOutput, serializeWriteExpected)) {
        int mismatchPos = -1;
        if (serDeOutput.length != serializeWriteExpected.length) {
          for (int b = 0; b < Math.min(serDeOutput.length, serializeWriteExpected.length); b++) {
            if (serDeOutput[b] != serializeWriteExpected[b]) {
              mismatchPos = b;
              break;
            }
          }
          fail(
              "Different byte array lengths: serDeOutput.length "
                  + serDeOutput.length
                  + ", serializeWriteExpected.length "
                  + serializeWriteExpected.length
                  + " mismatchPos "
                  + mismatchPos
                  + " perFieldWriteLengths "
                  + Arrays.toString(perFieldWriteLengthsArray[i]));
        }
        // Lengths are equal from here on (fail() above does not return otherwise).
        List<Integer> differentPositions = new ArrayList<Integer>();
        for (int b = 0; b < serDeOutput.length; b++) {
          if (serDeOutput[b] != serializeWriteExpected[b]) {
            differentPositions.add(b);
          }
        }
        if (differentPositions.size() > 0) {
          // Split both byte arrays at the recorded field boundaries for a readable message.
          List<String> serializeWriteExpectedFields = new ArrayList<String>();
          List<String> serDeFields = new ArrayList<String>();
          int f = 0;
          int lastBegin = 0;
          for (int b = 0; b < serDeOutput.length; b++) {
            int writeLength = perFieldWriteLengthsArray[i][f];
            if (b + 1 == writeLength) {
              serializeWriteExpectedFields.add(
                  displayBytes(serializeWriteExpected, lastBegin, writeLength - lastBegin));
              serDeFields.add(displayBytes(serDeOutput, lastBegin, writeLength - lastBegin));
              f++;
              lastBegin = b + 1;
            }
          }
          fail(
              "SerializeWrite and SerDe serialization does not match at positions "
                  + differentPositions.toString()
                  + "\n(SerializeWrite: "
                  + serializeWriteExpectedFields.toString()
                  + "\nSerDe: "
                  + serDeFields.toString()
                  + "\nperFieldWriteLengths "
                  + Arrays.toString(perFieldWriteLengthsArray[i])
                  + "\nprimitiveTypeInfos "
                  + Arrays.toString(primitiveTypeInfos)
                  + "\nrow "
                  + Arrays.toString(row));
        }
      }
      serdeBytes[i] = bytesWritable;
    }

    // Try to deserialize using DeserializeRead our Writable row objects created by SerDe.
    for (int i = 0; i < rowCount; i++) {
      verifyDeserializeReadOfRow(
          rows[i],
          serdeBytes[i],
          primitiveTypeInfos,
          columnSortOrderIsDesc,
          columnsToInclude,
          writeColumnCount);
    }
  }

  /**
   * Reads one serialized row back with a fresh {@link BinarySortableDeserializeRead} and verifies
   * each column against {@code row}.
   *
   * <p>Columns excluded by {@code columnsToInclude} are skipped, columns at or beyond
   * {@code writeColumnCount} are verified against {@code null}, and when every column was
   * written the reader must have reached end of input.
   *
   * @param row expected column values; read elements are cast to {@code Writable}
   * @param bytesWritable the serialized form of {@code row}
   * @param primitiveTypeInfos type of each column; its length is the full column count
   * @param columnSortOrderIsDesc per-column sort direction passed to the reader
   * @param columnsToInclude per-column include flags, or {@code null} to read every column
   * @param writeColumnCount number of leading columns that were actually serialized
   * @throws Throwable if reading or verification fails
   */
  private void verifyDeserializeReadOfRow(
      Object[] row,
      BytesWritable bytesWritable,
      PrimitiveTypeInfo[] primitiveTypeInfos,
      boolean[] columnSortOrderIsDesc,
      boolean[] columnsToInclude,
      int writeColumnCount)
      throws Throwable {

    int columnCount = primitiveTypeInfos.length;

    BinarySortableDeserializeRead binarySortableDeserializeRead =
        new BinarySortableDeserializeRead(
            primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);

    binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());

    for (int index = 0; index < columnCount; index++) {
      if (columnsToInclude != null && !columnsToInclude[index]) {
        binarySortableDeserializeRead.skipNextField();
      } else if (index >= writeColumnCount) {
        // Should come back a null.
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], null);
      } else {
        Writable writable = (Writable) row[index];
        VerifyFast.verifyDeserializeRead(
            binarySortableDeserializeRead, primitiveTypeInfos[index], writable);
      }
    }
    if (writeColumnCount == columnCount) {
      TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached());
    }
  }