コード例 #1
0
  private static void dumpText(List<RubixFile<Tuple, Object>> rfiles, String output, int numRows)
      throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
          IllegalAccessException {
    Configuration conf = new JobConf();
    int totalBlocks = 0;

    for (RubixFile<Tuple, Object> rfile : rfiles) {
      Path path = rfile.path;
      List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

      print.f("--- %s", path.toString());
      print.f("Schema: %s", rfile.getSchema().toString());
      print.f("PartitionKeys: %s", Arrays.toString(rfile.getPartitionKeys()));
      print.f("SortKeys %s", Arrays.toString(rfile.getSortKeys()));
      print.f("Block Serialization Type: %s", rfile.getBlockSerializationType());
      print.f("Number of blocks: %d", keyDataList.size());

      totalBlocks += keyDataList.size();

      int cumrows = 0;

      for (KeyData<Tuple> keyData : keyDataList) {
        print.f(
            "Block %s. BlockId: %d (Reducer: %d Index:%d)",
            keyData,
            keyData.blockId,
            (keyData.getBlockId() >> 32),
            (keyData.getBlockId() & (((long) 1 << 32) - 1)));

        if (numRows > 0) {
          RubixInputSplit<Tuple, Object> split =
              new RubixInputSplit<Tuple, Object>(
                  conf,
                  path,
                  keyData.getKey(),
                  keyData.getOffset(),
                  keyData.getLength(),
                  keyData.getBlockId(),
                  keyData.getNumRecords(),
                  rfile.getKeyClass(),
                  rfile.getValueClass(),
                  rfile.getSchema(),
                  rfile.getBlockSerializationType());

          RubixRecordReader<Tuple, Object> recordReader = new RubixRecordReader<Tuple, Object>();
          recordReader.initialize(split, conf);
          int rows = 0;

          while (recordReader.nextKeyValue()) {
            rows++;
            if (rows < numRows) {
              System.out.println("\t" + recordReader.getCurrentValue());
            } else {
              break;
            }
          }

          cumrows += keyData.getNumRecords();
          System.out.println(
              String.format("\tRows=%d Cummulative=%d", keyData.getNumRecords(), cumrows));
        }
      }
    }

    print.f("Total Blocks: %d", totalBlocks);
  }
コード例 #2
0
  private static void extract(
      List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output)
      throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    Configuration conf = new JobConf();
    File outFile = new File(output);
    if (outFile.exists()) {
      outFile.delete();
    }
    outFile.createNewFile();
    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
    ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
    DataOutput keySectionOut = new DataOutputStream(keySectionStream);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    RubixFile<Tuple, Object> lastrFile = null;
    JsonNode json;
    long totalLength = 0;

    final int BUF_SIZE = 32 * 1024;
    long blockIds[] = new long[numBlocks];
    int foundBlocks = 0;

    for (int i = 0; i < numBlocks; i++) blockIds[i] = blockId + i;

    for (int i = 0; i < numBlocks; i++) {
      boolean found = false;
      for (RubixFile<Tuple, Object> rfile : rfiles) {
        print.f("Checking %s", rfile.path.toString());
        List<KeyData<Tuple>> keyDataList = rfile.getKeyData();
        for (KeyData<Tuple> keyData : keyDataList) {
          if (keyData.getBlockId() == blockIds[i]) {
            long offset = keyData.getOffset();
            long length = keyData.getLength();
            Tuple key = keyData.getKey();
            print.f(
                "Extracting block %d (off=%d len=%d) from %s",
                keyData.getBlockId(), offset, length, rfile.path.toString());

            // copy the data
            if (length > 0) {
              FileSystem fs = FileSystem.get(conf);
              FSDataInputStream in = fs.open(rfile.path);
              in.seek(offset);

              byte[] data = new byte[BUF_SIZE];
              long toRead = length;
              while (toRead > 0) {
                int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
                in.readFully(data, 0, thisRead);
                bos.write(data, 0, thisRead);
                toRead -= thisRead;
                System.out.print(".");
              }
              System.out.println();
            }
            // copy the key section
            Serializer<Tuple> keySerializer =
                serializationFactory.getSerializer(rfile.getKeyClass());
            keySerializer.open(keySectionStream);

            keySerializer.serialize(key);
            keySectionOut.writeLong(totalLength); // position
            keySectionOut.writeLong(keyData.getBlockId());
            keySectionOut.writeLong(keyData.getNumRecords());
            foundBlocks++;
            totalLength += length;
            lastrFile = rfile;

            found = true;
            break;
          }
        }
        if (found) {
          break;
        }
      }
      if (!found) System.err.println("Cannot locate block with id " + blockIds[i]);
    }
    byte[] trailerBytes = keySectionStream.toByteArray();

    json = JsonUtils.cloneNode(lastrFile.metadataJson);
    ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

    DataOutput out = new DataOutputStream(bos);
    out.writeUTF(json.toString());
    out.writeInt(trailerBytes.length);
    out.write(trailerBytes);
    out.writeLong(totalLength); // trailer start offset
    bos.close();
  }