/**
 * Reads an RCFile while projecting only the first and the last column and
 * counts the rows returned by the reader.
 *
 * <p>When {@code chechCorrect} is set, the random generators are reset and a
 * row is regenerated with {@code nextRandomRow} for every row read; the first
 * and last columns of the regenerated row must equal the columns read.
 *
 * @param fs file system used to open {@code file}
 * @param file RCFile to read
 * @param allColumnsNumber total number of columns per row
 * @param chechCorrect whether the projected columns are verified
 * @return the number of rows read
 * @throws IOException if the reader fails
 * @throws IllegalStateException if a verified column does not match
 */
public int performRCFileReadFirstAndLastColumnTest(
    FileSystem fs, Path file, int allColumnsNumber, boolean chechCorrect) throws IOException {
  byte[][] checkBytes = null;
  BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  if (chechCorrect) {
    resetRandomGenerators();
    checkBytes = new byte[allColumnsNumber][];
  }

  final int lastColumn = allColumnsNumber - 1;
  java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  readCols.add(Integer.valueOf(0));
  readCols.add(Integer.valueOf(lastColumn));
  ColumnProjectionUtils.appendReadColumns(conf, readCols);

  int actualReadCount = 0;
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  try {
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        // The last column is only compared when the first one already matched.
        boolean matches = checkRow.get(0).equals(cols.get(0))
            && checkRow.get(lastColumn).equals(cols.get(lastColumn));
        if (!matches) {
          throw new IllegalStateException("Compare read and write error.");
        }
      }
      actualReadCount++;
    }
  } finally {
    // Release the reader on both the normal and the failure path.
    reader.close();
  }
  return actualReadCount;
}
public long doRead(MultiFileReader reader, Logger log) throws IOException, SerDeException, InterruptedException { long ts; totalRowReadTimeInNano = 0; totalInitializationTimeInNano = 0; totalCalculateSizeTimeInNano = 0; totalDataReadTimeInNano = 0; // assert totalRowDeserializationTimeInNano == 0; ts = System.nanoTime(); Map<String, BytesRefArrayWritable> ret = new HashMap<String, BytesRefArrayWritable>(); List<ColumnFileGroup> groups = reader.getColumnFileGroups(); Map<String, List<Integer>> readColumns = reader.getReadColumns(); for (ColumnFileGroup group : groups) { if (!readColumns.keySet().contains(group.getName())) { continue; } BytesRefArrayWritable braw = new BytesRefArrayWritable(group.getColumns().size()); braw.resetValid(group.getColumns().size()); ret.put(group.getName(), braw); } LongWritable rowID = new LongWritable(); rowCount = 0; long totalSerializedDataSize = 0; totalInitializationTimeInNano = (System.nanoTime() - ts); long start = System.nanoTime(); while (reader.next(rowID)) { ts = System.nanoTime(); reader.getCurrentRow(ret); totalRowReadTimeInNano += (System.nanoTime() - ts); ts = System.nanoTime(); for (Entry<String, BytesRefArrayWritable> entry : ret.entrySet()) { String groupName = entry.getKey(); BytesRefArrayWritable braw = entry.getValue(); // ts = System.nanoTime(); // ColumnarSerDeBase serde = reader.getGroupSerDe(groupName); // serde.deserialize(braw); // totalRowDeserializationTimeInNano += System.nanoTime() - ts; for (Integer col : readColumns.get(groupName)) { totalSerializedDataSize += braw.get(col).getLength(); } } totalCalculateSizeTimeInNano += (System.nanoTime() - ts); rowCount++; if (thinkTime > 0 && rowCount % thinkInterval == 0) { Thread.sleep(thinkTime); } } totalDataReadTimeInNano = System.nanoTime() - start; ts = System.nanoTime(); reader.close(); readerCloseTimeInNano = System.nanoTime() - ts; log.info("Row count : " + rowCount); log.info("Total serialized data size: " + totalSerializedDataSize); return 
totalSerializedDataSize; }
/**
 * Emits the first value of a key whose prefix differs from the prefix seen in
 * the previous call; all other values are dropped.
 *
 * <p>The prefix is the key text without its final character. The output key is
 * set from {@code counter}, which advances on every call whether or not a
 * value is emitted.
 *
 * @param key incoming key; must contain at least one character
 * @param values values grouped under {@code key}; at most the first is used
 * @param output collector that receives the emitted row
 * @param reporter unused
 * @throws IOException if the collector fails
 */
public void reduce(
    Text key,
    Iterator<BytesRefArrayWritable> values,
    OutputCollector<LongWritable, BytesRefArrayWritable> output,
    Reporter reporter)
    throws IOException {
  String keyString = key.toString();
  String keyPrefix = keyString.substring(0, keyString.length() - 1);
  newKey.set(counter++);

  // Same prefix as the previous call: nothing to emit.
  if (keyPrefix.equals(lagPrefixKey)) {
    return;
  }
  lagPrefixKey = keyPrefix;

  // The value is forwarded untouched; no cell needs to be decoded here.
  if (values.hasNext()) {
    output.collect(newKey, values.next());
  }
}
/**
 * Regenerates every column of {@code row} with random content and points the
 * corresponding entry of {@code bytes} at the new column data.
 *
 * @param row receives one newly allocated byte array per column
 * @param bytes reset to {@code row.length} valid entries, each set to the
 *     matching array in {@code row}
 */
private void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes) {
  final int columnCount = row.length;
  bytes.resetValid(columnCount);
  for (int column = 0; column < columnCount; column++) {
    // Column length is drawn first, then each byte of the column.
    final int length = Math.abs(randColLenGenerator.nextInt(columnMaxSize));
    byte[] data = new byte[length];
    row[column] = data;
    int pos = 0;
    while (pos < length) {
      data[pos] = getRandomChar(randomCharGenerator);
      pos++;
    }
    bytes.get(column).set(data, 0, length);
  }
}
/** * Builds Thrift object from the raw bytes returned by RCFile reader. * * @throws TException */ @SuppressWarnings({"unchecked", "rawtypes"}) public TBase<?, ?> getCurrentThriftValue() throws IOException, InterruptedException, TException { BytesRefArrayWritable byteRefs = getCurrentBytesRefArrayWritable(); if (byteRefs == null) { return null; } TBase tObj = tDesc.newThriftObject(); for (int i = 0; i < knownRequiredFields.size(); i++) { BytesRefWritable buf = byteRefs.get(columnsBeingRead.get(i)); if (buf.getLength() > 0) { memTransport.reset(buf.getData(), buf.getStart(), buf.getLength()); Field field = knownRequiredFields.get(i); tObj.setFieldValue(field.getFieldIdEnum(), ThriftUtils.readFieldNoTag(tProto, field)); } // else no need to set default value since any default value // would have been serialized when this record was written. } // parse unknowns column if required if (readUnknownsColumn) { int last = columnsBeingRead.get(columnsBeingRead.size() - 1); BytesRefWritable buf = byteRefs.get(last); if (buf.getLength() > 0) { memTransport.reset(buf.getData(), buf.getStart(), buf.getLength()); tObj.read(tProto); } } return tObj; }
public void map( LongWritable key, BytesRefArrayWritable value, OutputCollector<Text, BytesRefArrayWritable> output, Reporter reporter) throws IOException { StringBuilder keyBuilder = new StringBuilder(); for (int i = 0; i < primaryKeyIndexes.length; i++) { BytesRefWritable cell = value.get(primaryKeyIndexes[i]); String cellStr = new String(cell.getData()) .substring(cell.getStart(), cell.getStart() + cell.getLength()); // We are only looking for a perfect match keyBuilder.append(cellStr + zeroChar); } keyBuilder.append(isDeltaChar); newKey.set(keyBuilder.toString()); output.collect(newKey, value); }