private static void extract( List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { Configuration conf = new JobConf(); File outFile = new File(output); if (outFile.exists()) { outFile.delete(); } outFile.createNewFile(); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile)); ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream(); DataOutput keySectionOut = new DataOutputStream(keySectionStream); SerializationFactory serializationFactory = new SerializationFactory(conf); RubixFile<Tuple, Object> lastrFile = null; JsonNode json; long totalLength = 0; final int BUF_SIZE = 32 * 1024; long blockIds[] = new long[numBlocks]; int foundBlocks = 0; for (int i = 0; i < numBlocks; i++) blockIds[i] = blockId + i; for (int i = 0; i < numBlocks; i++) { boolean found = false; for (RubixFile<Tuple, Object> rfile : rfiles) { print.f("Checking %s", rfile.path.toString()); List<KeyData<Tuple>> keyDataList = rfile.getKeyData(); for (KeyData<Tuple> keyData : keyDataList) { if (keyData.getBlockId() == blockIds[i]) { long offset = keyData.getOffset(); long length = keyData.getLength(); Tuple key = keyData.getKey(); print.f( "Extracting block %d (off=%d len=%d) from %s", keyData.getBlockId(), offset, length, rfile.path.toString()); // copy the data if (length > 0) { FileSystem fs = FileSystem.get(conf); FSDataInputStream in = fs.open(rfile.path); in.seek(offset); byte[] data = new byte[BUF_SIZE]; long toRead = length; while (toRead > 0) { int thisRead = toRead > BUF_SIZE ? 
BUF_SIZE : (int) toRead; in.readFully(data, 0, thisRead); bos.write(data, 0, thisRead); toRead -= thisRead; System.out.print("."); } System.out.println(); } // copy the key section Serializer<Tuple> keySerializer = serializationFactory.getSerializer(rfile.getKeyClass()); keySerializer.open(keySectionStream); keySerializer.serialize(key); keySectionOut.writeLong(totalLength); // position keySectionOut.writeLong(keyData.getBlockId()); keySectionOut.writeLong(keyData.getNumRecords()); foundBlocks++; totalLength += length; lastrFile = rfile; found = true; break; } } if (found) { break; } } if (!found) System.err.println("Cannot locate block with id " + blockIds[i]); } byte[] trailerBytes = keySectionStream.toByteArray(); json = JsonUtils.cloneNode(lastrFile.metadataJson); ((ObjectNode) json).put("numberOfBlocks", foundBlocks); DataOutput out = new DataOutputStream(bos); out.writeUTF(json.toString()); out.writeInt(trailerBytes.length); out.write(trailerBytes); out.writeLong(totalLength); // trailer start offset bos.close(); }
/**
 * Prints a human-readable summary of each rubix file: path, schema, partition and sort keys,
 * block serialization type, and one line per block. When {@code numRows} is positive, up to
 * {@code numRows} values of every block are printed as well, followed by the block's record
 * count and the running record total for the file.
 *
 * @param rfiles files to describe
 * @param output not used by this method
 * @param numRows maximum number of rows to print per block; no rows (and no row totals) are
 *     printed when this is zero or negative
 */
private static void dumpText(List<RubixFile<Tuple, Object>> rfiles, String output, int numRows)
    throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
        IllegalAccessException {
  Configuration conf = new JobConf();
  int totalBlocks = 0;

  for (RubixFile<Tuple, Object> rfile : rfiles) {
    Path path = rfile.path;
    List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

    print.f("--- %s", path.toString());
    print.f("Schema: %s", rfile.getSchema().toString());
    print.f("PartitionKeys: %s", Arrays.toString(rfile.getPartitionKeys()));
    print.f("SortKeys %s", Arrays.toString(rfile.getSortKeys()));
    print.f("Block Serialization Type: %s", rfile.getBlockSerializationType());
    print.f("Number of blocks: %d", keyDataList.size());
    totalBlocks += keyDataList.size();

    // long, so that the compound assignment below cannot silently narrow the record count
    long cumrows = 0;
    for (KeyData<Tuple> keyData : keyDataList) {
      long id = keyData.getBlockId();
      // The upper 32 bits of the id are reported as "Reducer", the lower 32 bits as "Index".
      print.f(
          "Block %s. BlockId: %d (Reducer: %d Index:%d)",
          keyData, id, (id >> 32), (id & ((1L << 32) - 1)));

      if (numRows <= 0) {
        continue;
      }

      RubixInputSplit<Tuple, Object> split =
          new RubixInputSplit<Tuple, Object>(
              conf,
              path,
              keyData.getKey(),
              keyData.getOffset(),
              keyData.getLength(),
              keyData.getBlockId(),
              keyData.getNumRecords(),
              rfile.getKeyClass(),
              rfile.getValueClass(),
              rfile.getSchema(),
              rfile.getBlockSerializationType());

      RubixRecordReader<Tuple, Object> recordReader = new RubixRecordReader<Tuple, Object>();
      recordReader.initialize(split, conf);
      try {
        // Print exactly numRows rows (or fewer if the block is shorter); the counter is
        // checked first so no record beyond the limit is read.
        int rows = 0;
        while (rows < numRows && recordReader.nextKeyValue()) {
          System.out.println("\t" + recordReader.getCurrentValue());
          rows++;
        }
      } finally {
        // Release the reader after each block instead of leaking it.
        recordReader.close();
      }

      cumrows += keyData.getNumRecords();
      System.out.println(
          String.format("\tRows=%d Cummulative=%d", keyData.getNumRecords(), cumrows));
    }
  }

  print.f("Total Blocks: %d", totalBlocks);
}
@SuppressWarnings("unchecked") public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException { if (keyData != null) return keyData; final FileSystem fs = FileSystem.get(conf); keyData = new ArrayList<KeyData<K>>(); final long filesize = fs.getFileStatus(path).getLen(); FSDataInputStream in = fs.open(path); /* The last long in the file is the start position of the trailer section */ in.seek(filesize - 8); long metaDataStartPos = in.readLong(); in.seek(metaDataStartPos); ObjectMapper mapper = new ObjectMapper(); metadataJson = mapper.readValue(in.readUTF(), JsonNode.class); int keySectionSize = in.readInt(); // load the key section byte[] keySection = new byte[keySectionSize]; in.seek(filesize - keySectionSize - 8); in.read(keySection, 0, keySectionSize); in.close(); ByteArrayInputStream bis = new ByteArrayInputStream(keySection); DataInput dataInput = new DataInputStream(bis); int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue(); // load the key section keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass")); valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass")); SerializationFactory serializationFactory = new SerializationFactory(conf); Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass); deserializer.open(bis); while (bis.available() > 0 && numberOfBlocks > 0) { K key = deserializer.deserialize(null); long offset = dataInput.readLong(); long blockId = dataInput.readLong(); long numRecords = dataInput.readLong(); keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId)); numberOfBlocks--; } // Assign length to each keydata entry int numEntries = keyData.size(); for (int i = 1; i < numEntries; i++) { KeyData<K> prev = keyData.get(i - 1); KeyData<K> current = keyData.get(i); prev.setLength(current.getOffset() - prev.getOffset()); } if (numEntries > 0) { KeyData<K> last = keyData.get(numEntries - 1); last.setLength(metaDataStartPos - 
last.offset); } return keyData; }