/**
 * Prints a textual description of each rubix file: its path, schema, partition and sort keys,
 * block serialization type and block list, followed by the total block count over all files.
 *
 * <p>When {@code numRows} is positive, each block is additionally opened with a
 * {@code RubixRecordReader}, up to {@code numRows} values are printed, and the block's record
 * count plus the running record count for the current file are reported. When {@code numRows}
 * is zero or negative, no block is read and no record counts are printed.
 *
 * @param rfiles  the rubix files to describe
 * @param output  not used by this method
 * @param numRows maximum number of rows to print per block
 */
private static void dumpText(List<RubixFile<Tuple, Object>> rfiles, String output, int numRows)
        throws IOException, InterruptedException, ClassNotFoundException,
        InstantiationException, IllegalAccessException
{
    Configuration conf = new JobConf();
    int totalBlocks = 0;

    for (RubixFile<Tuple, Object> rfile : rfiles)
    {
        Path path = rfile.path;
        List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

        print.f("--- %s", path.toString());
        print.f("Schema: %s", rfile.getSchema().toString());
        print.f("PartitionKeys: %s", Arrays.toString(rfile.getPartitionKeys()));
        print.f("SortKeys %s", Arrays.toString(rfile.getSortKeys()));
        print.f("Block Serialization Type: %s", rfile.getBlockSerializationType());
        print.f("Number of blocks: %d", keyDataList.size());

        totalBlocks += keyDataList.size();

        // Running record count for this file; long so that large files cannot overflow it.
        long cumrows = 0;

        for (KeyData<Tuple> keyData : keyDataList)
        {
            long blockId = keyData.getBlockId();

            // The upper 32 bits of the block id are reported as the reducer and the lower
            // 32 bits as the index.
            print.f("Block %s. BlockId: %d (Reducer: %d Index:%d)",
                    keyData,
                    blockId,
                    (blockId >> 32),
                    (blockId & (((long) 1 << 32) - 1)));

            if (numRows <= 0)
            {
                continue;
            }

            RubixInputSplit<Tuple, Object> split =
                    new RubixInputSplit<Tuple, Object>(conf,
                                                       path,
                                                       keyData.getKey(),
                                                       keyData.getOffset(),
                                                       keyData.getLength(),
                                                       keyData.getBlockId(),
                                                       keyData.getNumRecords(),
                                                       rfile.getKeyClass(),
                                                       rfile.getValueClass(),
                                                       rfile.getSchema(),
                                                       rfile.getBlockSerializationType());

            RubixRecordReader<Tuple, Object> recordReader =
                    new RubixRecordReader<Tuple, Object>();
            try
            {
                recordReader.initialize(split, conf);

                // Print at most numRows rows; the limit is checked first so that no record
                // beyond the limit is read.
                int rows = 0;
                while (rows < numRows && recordReader.nextKeyValue())
                {
                    System.out.println("\t" + recordReader.getCurrentValue());
                    rows++;
                }
            }
            finally
            {
                // Release the reader for this block before moving on to the next one.
                recordReader.close();
            }

            cumrows += keyData.getNumRecords();
            System.out.println(String.format("\tRows=%d Cummulative=%d",
                                             keyData.getNumRecords(),
                                             cumrows));
        }
    }

    print.f("Total Blocks: %d", totalBlocks);
}
private static void extract( List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { Configuration conf = new JobConf(); File outFile = new File(output); if (outFile.exists()) { outFile.delete(); } outFile.createNewFile(); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile)); ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream(); DataOutput keySectionOut = new DataOutputStream(keySectionStream); SerializationFactory serializationFactory = new SerializationFactory(conf); RubixFile<Tuple, Object> lastrFile = null; JsonNode json; long totalLength = 0; final int BUF_SIZE = 32 * 1024; long blockIds[] = new long[numBlocks]; int foundBlocks = 0; for (int i = 0; i < numBlocks; i++) blockIds[i] = blockId + i; for (int i = 0; i < numBlocks; i++) { boolean found = false; for (RubixFile<Tuple, Object> rfile : rfiles) { print.f("Checking %s", rfile.path.toString()); List<KeyData<Tuple>> keyDataList = rfile.getKeyData(); for (KeyData<Tuple> keyData : keyDataList) { if (keyData.getBlockId() == blockIds[i]) { long offset = keyData.getOffset(); long length = keyData.getLength(); Tuple key = keyData.getKey(); print.f( "Extracting block %d (off=%d len=%d) from %s", keyData.getBlockId(), offset, length, rfile.path.toString()); // copy the data if (length > 0) { FileSystem fs = FileSystem.get(conf); FSDataInputStream in = fs.open(rfile.path); in.seek(offset); byte[] data = new byte[BUF_SIZE]; long toRead = length; while (toRead > 0) { int thisRead = toRead > BUF_SIZE ? 
BUF_SIZE : (int) toRead; in.readFully(data, 0, thisRead); bos.write(data, 0, thisRead); toRead -= thisRead; System.out.print("."); } System.out.println(); } // copy the key section Serializer<Tuple> keySerializer = serializationFactory.getSerializer(rfile.getKeyClass()); keySerializer.open(keySectionStream); keySerializer.serialize(key); keySectionOut.writeLong(totalLength); // position keySectionOut.writeLong(keyData.getBlockId()); keySectionOut.writeLong(keyData.getNumRecords()); foundBlocks++; totalLength += length; lastrFile = rfile; found = true; break; } } if (found) { break; } } if (!found) System.err.println("Cannot locate block with id " + blockIds[i]); } byte[] trailerBytes = keySectionStream.toByteArray(); json = JsonUtils.cloneNode(lastrFile.metadataJson); ((ObjectNode) json).put("numberOfBlocks", foundBlocks); DataOutput out = new DataOutputStream(bos); out.writeUTF(json.toString()); out.writeInt(trailerBytes.length); out.write(trailerBytes); out.writeLong(totalLength); // trailer start offset bos.close(); }