    /**
     * Prints a text summary of each rubix file and, when numRows > 0, the first few rows
     * (at most numRows - 1) of every block.
     */
    private static void dumpText(List<RubixFile<Tuple, Object>> rfiles, String output, int numRows)
            throws IOException, InterruptedException, ClassNotFoundException,
            InstantiationException, IllegalAccessException {
        Configuration conf = new JobConf();
        int totalBlocks = 0;

        for (RubixFile<Tuple, Object> rfile : rfiles) {
            Path path = rfile.path;
            List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

            print.f("--- %s", path.toString());
            print.f("Schema: %s", rfile.getSchema().toString());
            print.f("PartitionKeys: %s", Arrays.toString(rfile.getPartitionKeys()));
            print.f("SortKeys: %s", Arrays.toString(rfile.getSortKeys()));
            print.f("Block Serialization Type: %s", rfile.getBlockSerializationType());
            print.f("Number of blocks: %d", keyDataList.size());

            totalBlocks += keyDataList.size();
            int cumrows = 0;

            for (KeyData<Tuple> keyData : keyDataList) {
                print.f("Block %s. BlockId: %d (Reducer: %d Index: %d)",
                        keyData,
                        keyData.blockId,
                        (keyData.getBlockId() >> 32),
                        (keyData.getBlockId() & (((long) 1 << 32) - 1)));

                if (numRows > 0) {
                    RubixInputSplit<Tuple, Object> split =
                            new RubixInputSplit<Tuple, Object>(conf,
                                                               path,
                                                               keyData.getKey(),
                                                               keyData.getOffset(),
                                                               keyData.getLength(),
                                                               keyData.getBlockId(),
                                                               keyData.getNumRecords(),
                                                               rfile.getKeyClass(),
                                                               rfile.getValueClass(),
                                                               rfile.getSchema(),
                                                               rfile.getBlockSerializationType());

                    RubixRecordReader<Tuple, Object> recordReader =
                            new RubixRecordReader<Tuple, Object>();
                    recordReader.initialize(split, conf);
                    int rows = 0;

                    while (recordReader.nextKeyValue()) {
                        rows++;
                        if (rows < numRows) {
                            System.out.println("\t" + recordReader.getCurrentValue());
                        } else {
                            break;
                        }
                    }

                    cumrows += keyData.getNumRecords();
                    System.out.println(String.format("\tRows=%d Cumulative=%d",
                                                     keyData.getNumRecords(),
                                                     cumrows));
                }
            }
        }

        print.f("Total Blocks: %d", totalBlocks);
    }
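    // Block ids are printed by dumpText() above as a reducer id (upper 32 bits) and a block
    // index (lower 32 bits). A small illustrative helper, not part of the original tool,
    // that performs the same decomposition:
    private static String describeBlockId(long blockId) {
        long reducer = blockId >> 32;                    // upper 32 bits: reducer that wrote the block
        long index = blockId & (((long) 1 << 32) - 1);   // lower 32 bits: block index within that reducer
        return String.format("Reducer: %d Index: %d", reducer, index);
    }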
    public static void main(String[] args) throws IOException, ClassNotFoundException,
            InterruptedException, ParseException, InstantiationException, IllegalAccessException {
        final int VERBOSE_NUM_ROWS = 4;

        Options options = new Options();

        options.addOption("h", "help", false, "shows this message");
        options.addOption("v", "verbose", false, "print summary and first few rows of each block");
        options.addOption("m", "metadata", false, "show the metadata");
        options.addOption("d", "dump", false,
                "dump the contents of the rubix file. Use -f for specifying format, and -o for specifying "
                        + "output location");
        options.addOption("f", "format", true,
                "the data format for dumping data (AVRO or TEXT). Default: TEXT");
        options.addOption("e", "extract", true,
                "Extract one or more rubix blocks starting from the given blockId. Use -e blockId,numBlocks "
                        + "for specifying the blocks to be extracted. Use -o for specifying output location");
        options.addOption("o", true, "Store the output at the specified location");

        CommandLineParser parser = new BasicParser();

        // parse the command line arguments
        CommandLine line = parser.parse(options, args);

        // show the help message
        if (line.hasOption("h")) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(
                    "RubixFile <rubix file or dir> [options]\nIf no options are provided, print a summary of the blocks.",
                    options);
            return;
        }

        // validate provided options
        if (line.hasOption("d") && line.hasOption("e")) {
            System.err.println("Cannot dump (-d) and extract (-e) at the same time!");
            return;
        }

        // obtain the list of rubix files
        String[] files = line.getArgs();
        if (files == null || files.length == 0) {
            System.err.println("Rubix file not specified");
            return;
        }

        Configuration conf = new JobConf();
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path(files[0]);
        FileStatus[] allFiles;

        FileStatus status = fs.getFileStatus(path);
        if (status.isDir()) {
            allFiles = fs.listStatus(path, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    return path.toString().contains(RubixConstants.RUBIX_EXTENSION);
                }
            });
        } else {
            allFiles = new FileStatus[] { status };
        }

        // walk over all files and extract the trailer section
        List<RubixFile<Tuple, Object>> rfiles = new ArrayList<RubixFile<Tuple, Object>>();

        for (FileStatus s : allFiles) {
            Path p = s.getPath();
            RubixFile<Tuple, Object> rfile = new RubixFile<Tuple, Object>(conf, p);

            // if printing metadata information, exit after the first file (since all files
            // have the same metadata)
            if (line.hasOption("m")) {
                rfile.getKeyData();
                System.out.println(new ObjectMapper().writer().writeValueAsString(rfile.metadataJson));
                break;
            }

            rfiles.add(rfile);
        }

        // dump the data
        if (line.hasOption("d")) {
            String format = line.getOptionValue("f");
            if (format == null)
                format = "TEXT";

            format = format.trim().toUpperCase();

            if (format.equals("AVRO")) {
                // dumpAvro(rfiles, line.getOptionValue("o"));
                throw new UnsupportedOperationException(
                        "Dumping to avro is not currently supported. Please write a Cubert (map-only) script to store data in avro format");
            } else if (format.equals("TEXT")) {
                if (line.hasOption("o")) {
                    System.err.println("Dumping TEXT format data *into a file* is not currently supported");
                    return;
                }
                dumpText(rfiles, line.getOptionValue("o"), Integer.MAX_VALUE);
            } else {
                System.err.println("Invalid format [" + format + "] for dumping. Please use AVRO or TEXT");
                return;
            }
        }
        // extract arguments: -e blockId,numBlocks(contiguous) -o outputLocation
        else if (line.hasOption("e")) {
            String extractArguments = line.getOptionValue("e");
            String outputLocation;

            if (line.hasOption("o")) {
                outputLocation = line.getOptionValue("o");
            } else {
                System.err.println("Need to specify the location to store the output");
                return;
            }

            long blockId;
            int numBlocks = 1;

            if (extractArguments.contains(",")) {
                String[] splitExtractArgs = extractArguments.split(",");
                blockId = Long.parseLong(splitExtractArgs[0]);
                numBlocks = Integer.parseInt(splitExtractArgs[1]);
            } else {
                blockId = Long.parseLong(extractArguments);
            }

            extract(rfiles, blockId, numBlocks, outputLocation);
        } else {
            // no options given: print summary
            dumpText(rfiles, null, line.hasOption("v") ? VERBOSE_NUM_ROWS : 0);
        }
    }
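    // A minimal sketch of driving this tool programmatically, following the usage string
    // printed by -h ("RubixFile <rubix file or dir> [options]"). The paths and blockId below
    // are hypothetical; this method is illustrative and not part of the original tool.
    private static void exampleUsage() throws Exception {
        // Print a summary of the blocks plus the first few rows of each block (-v).
        main(new String[] { "-v", "/data/job-output" });

        // Extract two contiguous blocks, starting at blockId 42, into a local file (-e, -o).
        main(new String[] { "-e", "42,2", "-o", "/tmp/extracted-blocks", "/data/job-output" });
    }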
    private static void extract(List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks,
            String output) throws IOException, ClassNotFoundException, InstantiationException,
            IllegalAccessException {
        Configuration conf = new JobConf();
        File outFile = new File(output);
        if (outFile.exists()) {
            outFile.delete();
        }
        outFile.createNewFile();

        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
        ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
        DataOutput keySectionOut = new DataOutputStream(keySectionStream);
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        RubixFile<Tuple, Object> lastrFile = null;
        JsonNode json;
        long totalLength = 0;

        final int BUF_SIZE = 32 * 1024;
        long[] blockIds = new long[numBlocks];
        int foundBlocks = 0;

        for (int i = 0; i < numBlocks; i++)
            blockIds[i] = blockId + i;

        for (int i = 0; i < numBlocks; i++) {
            boolean found = false;
            for (RubixFile<Tuple, Object> rfile : rfiles) {
                print.f("Checking %s", rfile.path.toString());
                List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

                for (KeyData<Tuple> keyData : keyDataList) {
                    if (keyData.getBlockId() == blockIds[i]) {
                        long offset = keyData.getOffset();
                        long length = keyData.getLength();
                        Tuple key = keyData.getKey();

                        print.f("Extracting block %d (off=%d len=%d) from %s",
                                keyData.getBlockId(), offset, length, rfile.path.toString());

                        // copy the data
                        if (length > 0) {
                            FileSystem fs = FileSystem.get(conf);
                            FSDataInputStream in = fs.open(rfile.path);
                            in.seek(offset);

                            byte[] data = new byte[BUF_SIZE];
                            long toRead = length;
                            while (toRead > 0) {
                                int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
                                in.readFully(data, 0, thisRead);
                                bos.write(data, 0, thisRead);
                                toRead -= thisRead;
                                System.out.print(".");
                            }
                            System.out.println();
                        }

                        // copy the key section
                        Serializer<Tuple> keySerializer =
                                serializationFactory.getSerializer(rfile.getKeyClass());
                        keySerializer.open(keySectionStream);
                        keySerializer.serialize(key);

                        keySectionOut.writeLong(totalLength); // position
                        keySectionOut.writeLong(keyData.getBlockId());
                        keySectionOut.writeLong(keyData.getNumRecords());

                        foundBlocks++;
                        totalLength += length;
                        lastrFile = rfile;

                        found = true;
                        break;
                    }
                }

                if (found) {
                    break;
                }
            }

            if (!found)
                System.err.println("Cannot locate block with id " + blockIds[i]);
        }

        byte[] trailerBytes = keySectionStream.toByteArray();

        json = JsonUtils.cloneNode(lastrFile.metadataJson);
        ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

        DataOutput out = new DataOutputStream(bos);
        out.writeUTF(json.toString());
        out.writeInt(trailerBytes.length);
        out.write(trailerBytes);
        out.writeLong(totalLength); // trailer start offset

        bos.close();
    }
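    // A minimal sketch of reading back the file that extract() writes above. Per the writes
    // in extract(), the layout is: the raw block data first, then the metadata JSON
    // (writeUTF), the key-section length (writeInt), the serialized key section, and finally
    // the trailer start offset (writeLong) as the last eight bytes of the file. Illustrative
    // only; RandomAccessFile is referenced by its fully-qualified name to avoid assuming an
    // extra import.
    private static void readExtractedLayout(File extracted) throws IOException {
        java.io.RandomAccessFile raf = new java.io.RandomAccessFile(extracted, "r");
        try {
            // The last long in the file records where the trailer (metadata JSON) begins.
            raf.seek(raf.length() - 8);
            long trailerStart = raf.readLong();

            raf.seek(trailerStart);
            String metadataJson = raf.readUTF();    // metadata copied from the source rubix file
            int keySectionLength = raf.readInt();   // length of the serialized key section
            byte[] keySection = new byte[keySectionLength];
            raf.readFully(keySection);              // per block: key, position, blockId, numRecords

            System.out.println("metadata: " + metadataJson);
            System.out.println("key section: " + keySectionLength + " bytes");
        } finally {
            raf.close();
        }
    }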