public long doRead(MultiFileReader reader, Logger log)
    throws IOException, SerDeException, InterruptedException {
  long ts;
  totalRowReadTimeInNano = 0;
  totalInitializationTimeInNano = 0;
  totalCalculateSizeTimeInNano = 0;
  totalDataReadTimeInNano = 0;
  // assert totalRowDeserializationTimeInNano == 0;

  ts = System.nanoTime();
  Map<String, BytesRefArrayWritable> ret = new HashMap<String, BytesRefArrayWritable>();
  List<ColumnFileGroup> groups = reader.getColumnFileGroups();
  Map<String, List<Integer>> readColumns = reader.getReadColumns();
  for (ColumnFileGroup group : groups) {
    if (!readColumns.containsKey(group.getName())) {
      continue;
    }
    BytesRefArrayWritable braw = new BytesRefArrayWritable(group.getColumns().size());
    braw.resetValid(group.getColumns().size());
    ret.put(group.getName(), braw);
  }
  LongWritable rowID = new LongWritable();
  rowCount = 0;
  long totalSerializedDataSize = 0;
  totalInitializationTimeInNano = (System.nanoTime() - ts);

  long start = System.nanoTime();
  while (reader.next(rowID)) {
    ts = System.nanoTime();
    reader.getCurrentRow(ret);
    totalRowReadTimeInNano += (System.nanoTime() - ts);

    ts = System.nanoTime();
    for (Entry<String, BytesRefArrayWritable> entry : ret.entrySet()) {
      String groupName = entry.getKey();
      BytesRefArrayWritable braw = entry.getValue();
      // ts = System.nanoTime();
      // ColumnarSerDeBase serde = reader.getGroupSerDe(groupName);
      // serde.deserialize(braw);
      // totalRowDeserializationTimeInNano += System.nanoTime() - ts;
      for (Integer col : readColumns.get(groupName)) {
        totalSerializedDataSize += braw.get(col).getLength();
      }
    }
    totalCalculateSizeTimeInNano += (System.nanoTime() - ts);

    rowCount++;
    if (thinkTime > 0 && rowCount % thinkInterval == 0) {
      Thread.sleep(thinkTime);
    }
  }
  totalDataReadTimeInNano = System.nanoTime() - start;

  ts = System.nanoTime();
  reader.close();
  readerCloseTimeInNano = System.nanoTime() - ts;

  log.info("Row count : " + rowCount);
  log.info("Total serialized data size: " + totalSerializedDataSize);
  return totalSerializedDataSize;
}
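// A small reporting sketch for the nanosecond counters gathered in doRead()
// above. This is a hypothetical helper, not part of the original class; it
// assumes the same timing fields are in scope and uses the standard JDK
// java.util.concurrent.TimeUnit for the conversion.
private void logTimings(Logger log) {
  log.info("initialization: "
      + TimeUnit.NANOSECONDS.toMillis(totalInitializationTimeInNano) + " ms");
  log.info("row reads: "
      + TimeUnit.NANOSECONDS.toMillis(totalRowReadTimeInNano) + " ms");
  log.info("size calculation: "
      + TimeUnit.NANOSECONDS.toMillis(totalCalculateSizeTimeInNano) + " ms");
  log.info("total data read: "
      + TimeUnit.NANOSECONDS.toMillis(totalDataReadTimeInNano) + " ms");
  log.info("reader close: "
      + TimeUnit.NANOSECONDS.toMillis(readerCloseTimeInNano) + " ms");
}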
public void reduce(
    Text key,
    Iterator<BytesRefArrayWritable> values,
    OutputCollector<LongWritable, BytesRefArrayWritable> output,
    Reporter reporter) throws IOException {
  String keyString = key.toString();
  // Strip the trailing delta marker appended by the mapper, so rows that
  // differ only in that flag compare equal.
  String keyPrefix = keyString.substring(0, keyString.length() - 1);
  newKey.set(counter++);
  if (!keyPrefix.equals(lagPrefixKey)) {
    lagPrefixKey = keyPrefix;
    // Emit only the first value for each distinct key prefix.
    if (values.hasNext()) {
      output.collect(newKey, values.next());
    }
  }
}
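// The map method later in this section decodes individual cells to Strings.
// A minimal helper sketch for doing that safely (hypothetical name, not part
// of the original classes): it decodes only the cell's active byte range.
// Note that BytesRefWritable.getData() declares throws IOException because
// the underlying bytes may be materialized lazily.
private static String decodeCell(BytesRefWritable cell) throws IOException {
  return new String(cell.getData(), cell.getStart(), cell.getLength(), "UTF-8");
}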
private void writeRCFileTest(
    FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec)
    throws IOException {
  fs.delete(file, true);
  resetRandomGenerators();
  RCFileOutputFormat.setColumnNumber(conf, columnNum);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);

  byte[][] columnRandom = new byte[columnNum][];
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
  for (int i = 0; i < columnNum; i++) {
    bytes.set(i, new BytesRefWritable());
  }
  for (int i = 0; i < rowCount; i++) {
    nextRandomRow(columnRandom, bytes);
    writer.append(bytes);
  }
  writer.close();
}
private void writeSeqenceFileTest(
    FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec)
    throws IOException {
  resetRandomGenerators();
  byte[][] columnRandom = new byte[columnNum][];
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
  for (int i = 0; i < columnNum; i++) {
    bytes.set(i, new BytesRefWritable());
  }

  // A zero-length key is not allowed by the block-compress writer, so we use
  // a ByteWritable instead.
  ByteWritable key = new ByteWritable();
  SequenceFile.Writer seqWriter = SequenceFile.createWriter(
      fs, conf, file, ByteWritable.class, BytesRefArrayWritable.class,
      CompressionType.BLOCK, codec);
  for (int i = 0; i < rowCount; i++) {
    nextRandomRow(columnRandom, bytes);
    seqWriter.append(key, bytes);
  }
  seqWriter.close();
}
public int performRCFileFullyReadColumnTest(
    FileSystem fs, Path file, int allColumnsNumber, boolean checkCorrect)
    throws IOException {
  byte[][] checkBytes = null;
  BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  if (checkCorrect) {
    resetRandomGenerators();
    checkBytes = new byte[allColumnsNumber][];
  }

  int actualReadCount = 0;
  ColumnProjectionUtils.setReadAllColumns(conf);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    if (checkCorrect) {
      // Regenerate the same pseudo-random row and compare it with what was read.
      nextRandomRow(checkBytes, checkRow);
      if (!checkRow.equals(cols)) {
        throw new IllegalStateException("Compare read and write error.");
      }
    }
    actualReadCount++;
  }
  reader.close();
  return actualReadCount;
}
private void writeTest(FileSystem fs, int count, Path file, byte[][] fieldsData)
    throws IOException, SerDeException {
  fs.delete(file, true);
  RCFileOutputFormat.setColumnNumber(conf, fieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());

  BytesRefArrayWritable bytes = new BytesRefArrayWritable(fieldsData.length);
  for (int i = 0; i < fieldsData.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(fieldsData[i], 0, fieldsData[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < count; i++) {
    writer.append(bytes);
  }
  writer.close();

  long fileLen = fs.getFileStatus(file).getLen();
  System.out.println("The file size of RCFile with " + bytes.size()
      + " columns and " + count + " rows is " + fileLen);
}
private void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes) {
  bytes.resetValid(row.length);
  for (int i = 0; i < row.length; i++) {
    // nextInt(bound) already returns a non-negative value.
    int len = randColLenGenerator.nextInt(columnMaxSize);
    row[i] = new byte[len];
    for (int j = 0; j < len; j++) {
      row[i][j] = getRandomChar(randomCharGenerator);
    }
    bytes.get(i).set(row[i], 0, len);
  }
}
public void testReadCorruptFile() throws IOException, SerDeException {
  fs.delete(file, true);

  byte[][] record = {null, null, null, null, null, null, null, null};

  RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
  final int recCount = 100;
  Random rand = new Random();
  for (int recIdx = 0; recIdx < recCount; recIdx++) {
    for (int i = 0; i < record.length; i++) {
      record[i] = Integer.toString(rand.nextInt()).getBytes("UTF-8");
    }
    for (int i = 0; i < record.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
  }
  writer.close();

  // Insert junk in the middle of the file. Assumes the file is on local disk.
  RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
  long corruptOffset = raf.length() / 2;
  LOG.info("corrupting " + raf + " at offset " + corruptOffset);
  raf.seek(corruptOffset);
  raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
  raf.close();

  // Set the option for tolerating corruptions. The read should succeed.
  Configuration tmpConf = new Configuration(conf);
  tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
  RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);

  LongWritable rowID = new LongWritable();
  while (reader.next(rowID)) {
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    reader.getCurrentRow(cols);
    cols.resetValid(8);
  }
  reader.close();
}
private void writeThenReadByRecordReader(
    int intervalRecordCount, int writeCount, int splitNumber, long minSplitSize,
    CompressionCodec codec) throws IOException {
  Path testDir =
      new Path(System.getProperty("test.data.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);

  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(RCFile.RECORD_INTERVAL_CONF_STR, intervalRecordCount);
  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);

  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    if (i == intervalRecordCount) {
      System.out.println("write position:" + writer.getLength());
    }
    writer.append(bytes);
  }
  writer.close();

  RCFileInputFormat inputFormat = new RCFileInputFormat();
  JobConf jobConf = new JobConf(cloneConf);
  jobConf.set("mapred.input.dir", testDir.toString());
  jobConf.setLong("mapred.min.split.size", minSplitSize);
  InputSplit[] splits = inputFormat.getSplits(jobConf, splitNumber);
  // assertEquals takes (message, expected, actual).
  assertEquals("splits length should be " + splitNumber, splitNumber, splits.length);

  int readCount = 0;
  for (int i = 0; i < splits.length; i++) {
    int previousReadCount = readCount;
    RecordReader rr = inputFormat.getRecordReader(splits[i], jobConf, Reporter.NULL);
    Object key = rr.createKey();
    Object value = rr.createValue();
    while (rr.next(key, value)) {
      readCount++;
    }
    rr.close();
    System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
  }
  assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
private void partialReadTest(FileSystem fs, int count, Path file)
    throws IOException, SerDeException {
  LOG.debug("reading " + count + " records");
  long start = System.currentTimeMillis();

  java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  readCols.add(Integer.valueOf(2));
  readCols.add(Integer.valueOf(3));
  ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

  LongWritable rowID = new LongWritable();
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    cols.resetValid(8);
    Object row = serDe.deserialize(cols);

    StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
    List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
    assertEquals("Field size should be 8", 8, fieldRefs.size());

    for (int i : readCols) {
      Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
      Object standardWritableData = ObjectInspectorUtils.copyToStandardObject(
          fieldData, fieldRefs.get(i).getFieldObjectInspector(),
          ObjectInspectorCopyOption.WRITABLE);
      assertEquals("Field " + i, standardWritableData, expectedPartitalFieldsData[i]);
    }

    assertEquals("Class of the serialized object should be BytesRefArrayWritable",
        BytesRefArrayWritable.class, serDe.getSerializedClass());
    BytesRefArrayWritable serializedBytes = (BytesRefArrayWritable) serDe.serialize(row, oi);
    assertEquals("Serialized data", patialS, serializedBytes);
  }
  reader.close();

  long cost = System.currentTimeMillis() - start;
  LOG.debug("reading partially costs:" + cost + " milliseconds");
}
public void fullyReadTest(FileSystem fs, int count, Path file)
    throws IOException, SerDeException {
  LOG.debug("reading " + count + " records");
  long start = System.currentTimeMillis();
  ColumnProjectionUtils.setFullyReadColumns(conf);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

  LongWritable rowID = new LongWritable();
  int actualRead = 0;
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    cols.resetValid(8);
    Object row = serDe.deserialize(cols);

    StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
    List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
    assertEquals("Field size should be 8", 8, fieldRefs.size());
    for (int i = 0; i < fieldRefs.size(); i++) {
      Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
      Object standardWritableData = ObjectInspectorUtils.copyToStandardObject(
          fieldData, fieldRefs.get(i).getFieldObjectInspector(),
          ObjectInspectorCopyOption.WRITABLE);
      assertEquals("Field " + i, standardWritableData, expectedFieldsData[i]);
    }

    // Serialize
    assertEquals("Class of the serialized object should be BytesRefArrayWritable",
        BytesRefArrayWritable.class, serDe.getSerializedClass());
    BytesRefArrayWritable serializedText = (BytesRefArrayWritable) serDe.serialize(row, oi);
    assertEquals("Serialized data", s, serializedText);
    actualRead++;
  }
  reader.close();
  // assertEquals takes (message, expected, actual).
  assertEquals("Expect " + count + " rows, actual read " + actualRead, count, actualRead);

  long cost = System.currentTimeMillis() - start;
  LOG.debug("reading fully costs:" + cost + " milliseconds");
}
public int performRCFileReadFirstAndLastColumnTest(
    FileSystem fs, Path file, int allColumnsNumber, boolean checkCorrect)
    throws IOException {
  byte[][] checkBytes = null;
  BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  if (checkCorrect) {
    resetRandomGenerators();
    checkBytes = new byte[allColumnsNumber][];
  }

  int actualReadCount = 0;
  java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  readCols.add(Integer.valueOf(0));
  readCols.add(Integer.valueOf(allColumnsNumber - 1));
  ColumnProjectionUtils.appendReadColumns(conf, readCols);
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

  LongWritable rowID = new LongWritable();
  BytesRefArrayWritable cols = new BytesRefArrayWritable();
  while (reader.next(rowID)) {
    reader.getCurrentRow(cols);
    if (checkCorrect) {
      // Only the projected first and last columns are expected to match.
      nextRandomRow(checkBytes, checkRow);
      boolean ok = checkRow.get(0).equals(cols.get(0))
          && checkRow.get(allColumnsNumber - 1).equals(cols.get(allColumnsNumber - 1));
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
    }
    actualReadCount++;
  }
  reader.close();
  return actualReadCount;
}
/**
 * Builds a Thrift object from the raw bytes returned by the RCFile reader.
 *
 * @throws TException
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public TBase<?, ?> getCurrentThriftValue() throws IOException, InterruptedException, TException {
  BytesRefArrayWritable byteRefs = getCurrentBytesRefArrayWritable();
  if (byteRefs == null) {
    return null;
  }

  TBase tObj = tDesc.newThriftObject();
  for (int i = 0; i < knownRequiredFields.size(); i++) {
    BytesRefWritable buf = byteRefs.get(columnsBeingRead.get(i));
    if (buf.getLength() > 0) {
      memTransport.reset(buf.getData(), buf.getStart(), buf.getLength());
      Field field = knownRequiredFields.get(i);
      tObj.setFieldValue(field.getFieldIdEnum(), ThriftUtils.readFieldNoTag(tProto, field));
    }
    // else: no need to set a default value, since any default would have been
    // serialized when this record was written.
  }

  // Parse the unknowns column if required.
  if (readUnknownsColumn) {
    int last = columnsBeingRead.get(columnsBeingRead.size() - 1);
    BytesRefWritable buf = byteRefs.get(last);
    if (buf.getLength() > 0) {
      memTransport.reset(buf.getData(), buf.getStart(), buf.getLength());
      tObj.read(tProto);
    }
  }
  return tObj;
}
@Override
public HiveReadableRecord parse(Writable value, HiveReadableRecord record) throws IOException {
  final BytesRefArrayWritable braw = (BytesRefArrayWritable) value;
  final ArrayRecord arrayRecord = (ArrayRecord) record;
  arrayRecord.reset();
  for (int i = 0; i < columnIndexes.length; i++) {
    final int column = columnIndexes[i];
    final BytesRefWritable fieldData = braw.unCheckedGet(column);
    final byte[] bytes = fieldData.getData();
    final int start = fieldData.getStart();
    final int length = fieldData.getLength();
    // Hive serializes SQL NULL as the two-byte sequence "\N".
    if (length == "\\N".length() && bytes[start] == '\\' && bytes[start + 1] == 'N') {
      arrayRecord.setNull(column, true);
    } else {
      parsePrimitiveColumn(column, bytes, start, length);
    }
  }
  return arrayRecord;
}
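// The null check above, factored out for clarity: a field is null when its
// serialized bytes are exactly the two characters '\' and 'N'. A standalone
// sketch with a hypothetical helper name, not part of the original class:
private static boolean isNullSentinel(byte[] bytes, int start, int length) {
  return length == 2 && bytes[start] == '\\' && bytes[start + 1] == 'N';
}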
public void map(
    LongWritable key,
    BytesRefArrayWritable value,
    OutputCollector<Text, BytesRefArrayWritable> output,
    Reporter reporter) throws IOException {
  StringBuilder keyBuilder = new StringBuilder();
  for (int i = 0; i < primaryKeyIndexes.length; i++) {
    BytesRefWritable cell = value.get(primaryKeyIndexes[i]);
    // Decode only the cell's active byte range; decoding the whole backing
    // array and then substring-ing by byte offsets breaks on multi-byte
    // characters.
    String cellStr = new String(cell.getData(), cell.getStart(), cell.getLength(), "UTF-8");
    // We are only looking for a perfect match.
    keyBuilder.append(cellStr).append(zeroChar);
  }
  keyBuilder.append(isDeltaChar);
  newKey.set(keyBuilder.toString());
  output.collect(newKey, value);
}
static {
  try {
    bytesArray = new byte[][] {
        "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"),
        "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
        "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8")
    };
    s = new BytesRefArrayWritable(bytesArray.length);
    s.set(0, new BytesRefWritable("123".getBytes("UTF-8")));
    s.set(1, new BytesRefWritable("456".getBytes("UTF-8")));
    s.set(2, new BytesRefWritable("789".getBytes("UTF-8")));
    s.set(3, new BytesRefWritable("1000".getBytes("UTF-8")));
    s.set(4, new BytesRefWritable("5.3".getBytes("UTF-8")));
    s.set(5, new BytesRefWritable("hive and hadoop".getBytes("UTF-8")));
    s.set(6, new BytesRefWritable("NULL".getBytes("UTF-8")));
    s.set(7, new BytesRefWritable("NULL".getBytes("UTF-8")));

    // Expected row for the partial-read test: only columns 2 and 3 are
    // projected, so every other column serializes as NULL.
    patialS.set(0, new BytesRefWritable("NULL".getBytes("UTF-8")));
    patialS.set(1, new BytesRefWritable("NULL".getBytes("UTF-8")));
    patialS.set(2, new BytesRefWritable("789".getBytes("UTF-8")));
    patialS.set(3, new BytesRefWritable("1000".getBytes("UTF-8")));
    patialS.set(4, new BytesRefWritable("NULL".getBytes("UTF-8")));
    patialS.set(5, new BytesRefWritable("NULL".getBytes("UTF-8")));
    patialS.set(6, new BytesRefWritable("NULL".getBytes("UTF-8")));
    patialS.set(7, new BytesRefWritable("NULL".getBytes("UTF-8")));
  } catch (UnsupportedEncodingException e) {
    // UTF-8 support is mandated by the Java platform, so this is unreachable.
    throw new AssertionError(e);
  }
}
public void testSimpleReadAndWrite() throws IOException, SerDeException {
  fs.delete(file, true);

  byte[][] record_1 = {
      "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
      "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8")
  };
  byte[][] record_2 = {
      "100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
      "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8")
  };

  RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
  for (int i = 0; i < record_1.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  bytes.clear();
  for (int i = 0; i < record_2.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  writer.close();

  Object[] expectedRecord_1 = {
      new ByteWritable((byte) 123), new ShortWritable((short) 456), new IntWritable(789),
      new LongWritable(1000), new DoubleWritable(5.3), new Text("hive and hadoop"),
      null, null
  };
  Object[] expectedRecord_2 = {
      new ByteWritable((byte) 100), new ShortWritable((short) 200), new IntWritable(123),
      new LongWritable(1000), new DoubleWritable(5.3), new Text("hive and hadoop"),
      null, null
  };

  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  for (int i = 0; i < 2; i++) {
    reader.next(rowID);
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    reader.getCurrentRow(cols);
    cols.resetValid(8);
    Object row = serDe.deserialize(cols);

    StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
    List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
    assertEquals("Field size should be 8", 8, fieldRefs.size());
    for (int j = 0; j < fieldRefs.size(); j++) {
      Object fieldData = oi.getStructFieldData(row, fieldRefs.get(j));
      Object standardWritableData = ObjectInspectorUtils.copyToStandardObject(
          fieldData, fieldRefs.get(j).getFieldObjectInspector(),
          ObjectInspectorCopyOption.WRITABLE);
      if (i == 0) {
        assertEquals("Record " + i + ", field " + j, standardWritableData, expectedRecord_1[j]);
      } else {
        assertEquals("Record " + i + ", field " + j, standardWritableData, expectedRecord_2[j]);
      }
    }
  }
  reader.close();
}