public int performRCFileReadFirstAndLastColumnTest(
      FileSystem fs, Path file, int allColumnsNumber, boolean chechCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (chechCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
    readCols.add(Integer.valueOf(0));
    readCols.add(Integer.valueOf(allColumnsNumber - 1));
    ColumnProjectionUtils.appendReadColumns(conf, readCols);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && (checkRow.get(0).equals(cols.get(0)));
        ok = ok && checkRow.get(allColumnsNumber - 1).equals(cols.get(allColumnsNumber - 1));
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }
Exemple #2
0
  public long doRead(MultiFileReader reader, Logger log)
      throws IOException, SerDeException, InterruptedException {
    long ts;
    totalRowReadTimeInNano = 0;
    totalInitializationTimeInNano = 0;
    totalCalculateSizeTimeInNano = 0;
    totalDataReadTimeInNano = 0;
    // assert totalRowDeserializationTimeInNano == 0;

    ts = System.nanoTime();
    Map<String, BytesRefArrayWritable> ret = new HashMap<String, BytesRefArrayWritable>();
    List<ColumnFileGroup> groups = reader.getColumnFileGroups();
    Map<String, List<Integer>> readColumns = reader.getReadColumns();
    for (ColumnFileGroup group : groups) {
      if (!readColumns.keySet().contains(group.getName())) {
        continue;
      }
      BytesRefArrayWritable braw = new BytesRefArrayWritable(group.getColumns().size());
      braw.resetValid(group.getColumns().size());
      ret.put(group.getName(), braw);
    }
    LongWritable rowID = new LongWritable();
    rowCount = 0;
    long totalSerializedDataSize = 0;
    totalInitializationTimeInNano = (System.nanoTime() - ts);

    long start = System.nanoTime();
    while (reader.next(rowID)) {
      ts = System.nanoTime();
      reader.getCurrentRow(ret);
      totalRowReadTimeInNano += (System.nanoTime() - ts);

      ts = System.nanoTime();
      for (Entry<String, BytesRefArrayWritable> entry : ret.entrySet()) {
        String groupName = entry.getKey();
        BytesRefArrayWritable braw = entry.getValue();
        // ts = System.nanoTime();
        // ColumnarSerDeBase serde = reader.getGroupSerDe(groupName);
        // serde.deserialize(braw);
        // totalRowDeserializationTimeInNano += System.nanoTime() - ts;
        for (Integer col : readColumns.get(groupName)) {
          totalSerializedDataSize += braw.get(col).getLength();
        }
      }
      totalCalculateSizeTimeInNano += (System.nanoTime() - ts);

      rowCount++;
      if (thinkTime > 0 && rowCount % thinkInterval == 0) {
        Thread.sleep(thinkTime);
      }
    }
    totalDataReadTimeInNano = System.nanoTime() - start;
    ts = System.nanoTime();
    reader.close();
    readerCloseTimeInNano = System.nanoTime() - ts;

    log.info("Row count : " + rowCount);
    log.info("Total serialized data size: " + totalSerializedDataSize);
    return totalSerializedDataSize;
  }
    public void reduce(
        Text key,
        Iterator<BytesRefArrayWritable> values,
        OutputCollector<LongWritable, BytesRefArrayWritable> output,
        Reporter reporter)
        throws IOException {

      String keyString = key.toString();
      String keyPrefix = keyString.substring(0, keyString.length() - 1);

      newKey.set(counter++);

      if (keyPrefix.equals(lagPrefixKey) == false) {
        lagPrefixKey = keyPrefix;
        if (values.hasNext()) {
          BytesRefArrayWritable value = values.next();
          BytesRefWritable cell = value.get(0);
          String cellStr =
              new String(cell.getData())
                  .substring(cell.getStart(), cell.getStart() + cell.getLength());

          output.collect(newKey, value);
        }
      }
    }
 private void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes) {
   bytes.resetValid(row.length);
   for (int i = 0; i < row.length; i++) {
     int len = Math.abs(randColLenGenerator.nextInt(columnMaxSize));
     row[i] = new byte[len];
     for (int j = 0; j < len; j++) {
       row[i][j] = getRandomChar(randomCharGenerator);
     }
     bytes.get(i).set(row[i], 0, len);
   }
 }
    /**
     * Builds Thrift object from the raw bytes returned by RCFile reader.
     *
     * @throws TException
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public TBase<?, ?> getCurrentThriftValue()
        throws IOException, InterruptedException, TException {

      BytesRefArrayWritable byteRefs = getCurrentBytesRefArrayWritable();

      if (byteRefs == null) {
        return null;
      }

      TBase tObj = tDesc.newThriftObject();

      for (int i = 0; i < knownRequiredFields.size(); i++) {
        BytesRefWritable buf = byteRefs.get(columnsBeingRead.get(i));
        if (buf.getLength() > 0) {
          memTransport.reset(buf.getData(), buf.getStart(), buf.getLength());

          Field field = knownRequiredFields.get(i);
          tObj.setFieldValue(field.getFieldIdEnum(), ThriftUtils.readFieldNoTag(tProto, field));
        }
        // else no need to set default value since any default value
        // would have been serialized when this record was written.
      }

      // parse unknowns column if required
      if (readUnknownsColumn) {
        int last = columnsBeingRead.get(columnsBeingRead.size() - 1);
        BytesRefWritable buf = byteRefs.get(last);
        if (buf.getLength() > 0) {
          memTransport.reset(buf.getData(), buf.getStart(), buf.getLength());
          tObj.read(tProto);
        }
      }

      return tObj;
    }
    public void map(
        LongWritable key,
        BytesRefArrayWritable value,
        OutputCollector<Text, BytesRefArrayWritable> output,
        Reporter reporter)
        throws IOException {

      StringBuilder keyBuilder = new StringBuilder();
      for (int i = 0; i < primaryKeyIndexes.length; i++) {
        BytesRefWritable cell = value.get(primaryKeyIndexes[i]);
        String cellStr =
            new String(cell.getData())
                .substring(cell.getStart(), cell.getStart() + cell.getLength());
        // We are only looking for a perfect match
        keyBuilder.append(cellStr + zeroChar);
      }

      keyBuilder.append(isDeltaChar);
      newKey.set(keyBuilder.toString());

      output.collect(newKey, value);
    }