  /**
   * Derives the output {@link PostCondition} of this operator from the post condition of its
   * input block.
   *
   * <p>Every column whose name appears in the JSON "columns" array, or as a field name of the
   * inline JSON "dictionary" object, gets the type {@code DataType.STRING} in the output schema.
   * All other columns keep the column type they have in the input schema. The partition keys and
   * sort keys of the input condition are passed through unchanged.
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces
   * and the body of the loop over the dictionary values), so it does not compile as shown.
   *
   * @param preConditions post conditions of the upstream blocks, looked up by the block name
   *     stored under the JSON key "input"
   * @param json operator configuration; read for "input" and either "columns" or "dictionary"
   * @return a post condition with the transformed schema and the input's partition/sort keys
   * @throws PreconditionException declared by the signature; no throw site is visible here
   */
  public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json)
      throws PreconditionException {
    String inputBlockName = JsonUtils.getText(json, "input");
    // NOTE(review): Map.get returns null for an unknown block name, in which case the
    // getSchema() call on the next line fails with a NullPointerException.
    PostCondition inputCondition = preConditions.get(inputBlockName);
    BlockSchema inputSchema = inputCondition.getSchema();

    // Collect the names of the columns that get decoded. Further down only containsKey() is
    // called on this map, so the dictionary contents do not influence the resulting schema.
    Map<String, CodeDictionary> dictionaryMap = new HashMap<String, CodeDictionary>();
    if (json.has("columns")) {
      String[] columns = JsonUtils.asArray(json, "columns");
      for (String column : columns) dictionaryMap.put(column, new CodeDictionary());
    } else {
      // NOTE(review): presumably null when the "dictionary" key is absent as well -- confirm.
      JsonNode dictionary = json.get("dictionary");
      // this is inline dictionary
      Iterator<String> nameIterator = dictionary.getFieldNames();
      while (nameIterator.hasNext()) {
        String name = nameIterator.next();
        ArrayNode values = (ArrayNode) dictionary.get(name);
        CodeDictionary codeDictionary = new CodeDictionary();
        // NOTE(review): the body of this loop is missing from the listing.
        for (JsonNode value : values) {
        dictionaryMap.put(name, codeDictionary);

    int numColumns = inputSchema.getNumColumns();

    ColumnType[] columnTypes = new ColumnType[numColumns];

    // Build the output schema column by column, in input-schema order.
    for (int i = 0; i < columnTypes.length; i++) {
      ColumnType type;
      final String name = inputSchema.getName(i);

      if (dictionaryMap.containsKey(name)) {
        // this column is decoded. Transform schema
        type = new ColumnType(name, DataType.STRING);
      } else {
        // this column is not decoded. Reuse schema
        type = inputSchema.getColumnType(i);

      columnTypes[i] = type;

    BlockSchema schema = new BlockSchema(columnTypes);

    return new PostCondition(
        schema, inputCondition.getPartitionKeys(), inputCondition.getSortKeys());
Esempio n. 2
  /**
   * Returns the block serialization type recorded under the "serializationType" key of the file
   * metadata, or {@code BlockSerializationType.DEFAULT} when the metadata has no such key.
   *
   * <p>Calls {@code getKeyData()} first when {@code keyData} has not been loaded yet.
   *
   * @return the serialization type named in the metadata, or the default
   * @throws IOException declared by the signature; presumably propagated from getKeyData()
   * @throws ClassNotFoundException declared by the signature; presumably propagated from
   *     getKeyData()
   */
  public BlockSerializationType getBlockSerializationType()
      throws IOException, ClassNotFoundException {
    // presumably getKeyData() also populates metadataJson -- verify against its definition
    if (keyData == null) getKeyData();

    if (!metadataJson.has("serializationType")) return BlockSerializationType.DEFAULT;

    // NOTE(review): if BlockSerializationType is an enum, valueOf throws
    // IllegalArgumentException for a name that matches no constant.
    return BlockSerializationType.valueOf(JsonUtils.getText(metadataJson, "serializationType"));
  /**
   * Builds the JSON array describing the output tuple of the generate step: one projection
   * expression per column of the input block's schema, followed by an entry for the generated
   * identifier column.
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces
   * and the opening of the call that adds the generated column), so it does not compile as shown.
   *
   * @param vectorIdentifier identifier value emitted as a constant; treated as a string constant
   *     when it is a {@code String}, otherwise cast to {@code Integer}
   * @return the array node holding the output tuple specification
   */
  private ArrayNode createJsonForGenerate(Object vectorIdentifier) {
    ArrayNode outputTupleJson = JsonUtils.createArrayNode();

    // + First duplicate existing schema
    // Each existing column is projected under its own name (same value for both arguments).
    for (String s : inputBlock.getProperties().getSchema().getColumnNames()) {
      outputTupleJson.add(RewriteUtils.createProjectionExpressionNode(s, s));

    // + Add the new generated column
    JsonNode constNode;
    if (vectorIdentifier instanceof String)
      constNode = RewriteUtils.createStringConstant((String) vectorIdentifier);
    // Any non-String value is cast to Integer; other types fail with ClassCastException.
    else constNode = RewriteUtils.createIntegerConstant((Integer) vectorIdentifier);

    // Output column name is "<metaRelationName>___<identifierColumnName>".
    String outColName = metaRelationName + "___" + identifierColumnName;
    // NOTE(review): the start of this statement is missing from the listing; presumably the
    // object node below is added to outputTupleJson -- confirm against the original file.
        JsonUtils.createObjectNode("col_name", outColName, "expression", constNode));
    return outputTupleJson;
  /**
   * Builds the block for one vector described by a tuple of the meta-data relation.
   *
   * <p>For every coordinate column index a coordinate block is created; the non-null ones are
   * combined into a single block, which is then wrapped in a new {@code GenerateOperator} whose
   * output tuple is produced by {@code createJsonForGenerate}. When no coordinate block exists,
   * or fewer exist than there are coordinate columns, the method returns {@code this.next()}
   * instead.
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces,
   * the head of a logging call, and block-comment delimiters), so it does not compile as shown.
   *
   * @param metaDataTuple tuple holding the coordinate values and the vector identifier
   * @return a {@code TupleOperatorBlock} wrapping the generate operator, or the result of
   *     {@code this.next()} when this vector is skipped
   * @throws ExecException declared by the signature; presumably from tuple access
   * @throws IOException declared by the signature
   * @throws InterruptedException declared by the signature
   * @throws RuntimeException if the identifier value is neither an Integer nor a String
   */
  private Block generateVectorBlock(Tuple metaDataTuple)
      throws ExecException, IOException, InterruptedException {
    // Coordinate blocks keyed by the string form of their coordinate value.
    Map<String, Block> inputBlocksToCombiner = new HashMap<String, Block>();
    for (int i : coordinateColumnIndexes) {
      Object coordinate = metaDataTuple.get(i);
      Block coordBlock = createCoordinateBlock(coordinate);
      // A null block means there is nothing to contribute for this coordinate.
      if (coordBlock == null) continue;
      // NOTE(review): equal coordinate values in different columns map to the same key, so a
      // later block replaces an earlier one and the size check below would then fail.
      inputBlocksToCombiner.put(coordinate.toString(), coordBlock);

    // No data for this vector -- proceed to next one.
    if (inputBlocksToCombiner.size() == 0) return this.next();

    // Partially populated vectors are skipped as well.
    if (inputBlocksToCombiner.size() != coordinateColumnIndexes.length) {
      // NOTE(review): the call that emits this message is missing from the listing.
          "CollateVectorBlock: Found fewer input blocks than number of co-ordinates ");
      return this.next();

    // Combine individual blocks
    Object vectorIdentifier = metaDataTuple.get(identifierColumnIndex);
    if (!(vectorIdentifier instanceof Integer || vectorIdentifier instanceof String))
      throw new RuntimeException("Unexpected data-type for identifier column");
    Block combinedBlock = createCombinedBlock(inputBlocksToCombiner);

    // NOTE(review): the three lines below look like the interior of a block comment whose
    // opening and closing delimiters were lost in extraction (disabled sort-operator setup).
     * // Prepare input args for sort operator inputSorter.clear();
     * inputSorter.put("combined_block", combinedBlock);
     * // Setup sort operator object sortOp.setInput(inputSorter, jsonForSort);

    // Prepare input arguments for generator operator
    ArrayNode outputTupleJson = createJsonForGenerate(vectorIdentifier);

    // Work on a copy so the shared jsonForGenerate template is not modified.
    JsonNode thisGenJson = JsonUtils.cloneNode(jsonForGenerate);
    ((ObjectNode) thisGenJson).put("outputTuple", outputTupleJson);

    inputGenerator.put("combined_block", combinedBlock);

    // Setup generate operator object.
    genOp = new GenerateOperator();
    genOp.setInput(inputGenerator, thisGenJson, null);

    // Return tuple operator block that contains this generate op.
    generatedBlock = new TupleOperatorBlock(genOp, null);
    // TODO: generatedBlock.setProperty("identifierColumn", vectorIdentifier);

    // System.out.println("CollateVectorBlock: finished setInput");
    return generatedBlock;
  /**
   * Configures the operator: obtains the code dictionaries, binds the input block and prepares
   * the per-column dictionary lookup array.
   *
   * <p>The dictionaries come from a file when the JSON has a "path" key, otherwise from the
   * inline JSON "dictionary" object. The first block in {@code input} becomes the data block.
   * For each column of its schema, {@code dictionaries[i]} is set to the dictionary registered
   * under that column's name, or to {@code null} when there is none. An optional
   * "replaceUnknownCodes" value is read from the JSON.
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces
   * and the body of the loop over the dictionary values), so it does not compile as shown.
   *
   * @param input input blocks; only the first value in iteration order is used
   * @param json operator configuration ("path" or "dictionary", optional "replaceUnknownCodes")
   * @param props not referenced in the visible part of this method
   * @throws IOException declared by the signature; presumably from loading the dictionary file
   * @throws InterruptedException declared by the signature
   */
  public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
      throws IOException, InterruptedException {
    // Get the dictionary
    Map<String, CodeDictionary> dictionaryMap = null;
    if (json.has("path")) {
      // load the dictionary from file
      String dictionaryName = json.get("path").getTextValue();
      // Resolve the configured name through FileCache, then address a fixed file beneath it.
      String dictionaryPath = FileCache.get(dictionaryName);
      dictionaryPath = dictionaryPath + "/part-r-00000.avro";
      dictionaryMap = GenerateDictionary.loadDictionary(dictionaryPath, false, null);
    } else {
      // this is inline dictionary
      JsonNode dictionary = json.get("dictionary");

      Iterator<String> nameIterator = dictionary.getFieldNames();
      dictionaryMap = new HashMap<String, CodeDictionary>();
      while (nameIterator.hasNext()) {
        String name = nameIterator.next();
        ArrayNode values = (ArrayNode) dictionary.get(name);
        CodeDictionary codeDictionary = new CodeDictionary();
        // NOTE(review): the body of this loop is missing from the listing.
        for (JsonNode value : values) {
        dictionaryMap.put(name, codeDictionary);

    // Use the first input block; iterator().next() throws NoSuchElementException on an empty map.
    dataBlock = input.values().iterator().next();
    BlockSchema inputSchema = dataBlock.getProperties().getSchema();
    numColumns = inputSchema.getNumColumns();

    // Tuple sized to hold one field per input column.
    decodedTuple = TupleFactory.getInstance().newTuple(numColumns);

    // create dictionary array
    dictionaries = new CodeDictionary[numColumns];

    // Slot i holds the dictionary for column i, or null when the column has none.
    for (int i = 0; i < numColumns; i++) {
      String colName = inputSchema.getName(i);

      if (dictionaryMap.containsKey(colName)) {
        dictionaries[i] = dictionaryMap.get(colName);
      } else {
        dictionaries[i] = null;

    if (json.has("replaceUnknownCodes")) {
      replaceUnknownCodes = JsonUtils.getText(json, "replaceUnknownCodes");
  /**
   * Configures the operator from its JSON description: binds the in-memory input block and the
   * meta-data block, resolves the coordinate and identifier column indexes in the meta block's
   * schema, and prepares the JSON used by the combine and generate steps.
   *
   * <p>JSON keys read here: "inputBlock", "lookupColumn", "metaRelationName",
   * "coordinateColumns", "identifierColumn" and "combineColumns".
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (block-comment
   * delimiters and the end of the method), so it does not compile as shown.
   *
   * @param conf not referenced in the visible part of this method
   * @param input blocks by name; must contain the entries named by "inputBlock" and
   *     "metaRelationName"
   * @param json operator configuration
   * @throws IOException declared by the signature
   * @throws InterruptedException declared by the signature
   */
  public void setInput(Configuration conf, Map<String, Block> input, JsonNode json)
      throws IOException, InterruptedException {
    // #1. input block
    // The cast fails with ClassCastException if the named block is not a RubixMemoryBlock.
    inputBlock = (RubixMemoryBlock) input.get(JsonUtils.getText(json, "inputBlock"));

    // #2. lookup column
    String lookupColumn = json.get("lookupColumn").getTextValue();
    BlockSchema inputSchema = inputBlock.getProperties().getSchema();

    coord2offsets = BlockUtils.generateColumnIndex(inputBlock, lookupColumn);

    // #3. meta data relation name
    // NOTE(review): wrapping the result in new String(...) looks redundant -- confirm and drop.
    metaRelationName = new String(JsonUtils.getText(json, "metaRelationName"));
    matchingMetaBlock = (Block) input.get(metaRelationName);
    BlockSchema metaBlockSchema = matchingMetaBlock.getProperties().getSchema();

    // #4. find indexes for coordinate column names in meta relation's schema
    String[] coordinateColumns = JsonUtils.asArray(json.get("coordinateColumns"));
    coordinateColumnIndexes = new int[coordinateColumns.length];
    int idx = 0;
    // NOTE(review): the array is parsed from JSON a second time here; the coordinateColumns
    // local above could be iterated instead.
    for (String s : JsonUtils.asArray(json.get("coordinateColumns")))
      coordinateColumnIndexes[idx++] = metaBlockSchema.getIndex(s);

    // #5. find index of identifier column in meta relation's schema
    identifierColumnName = new String(JsonUtils.getText(json, "identifierColumn"));
    identifierColumnIndex = metaBlockSchema.getIndex(identifierColumnName);

    // #6. combine columns
    ArrayNode combineColumns = (ArrayNode) json.get("combineColumns");

    // setup info for sort operator
    // NOTE(review): the three lines below look like the interior of a block comment whose
    // opening and closing delimiters were lost in extraction (disabled sort-operator setup).
     * jsonForSort = JsonUtils.cloneNode(json); ((ObjectNode)
     * jsonForSort).put("sortBy", combineColumns); sortedBlock = new
     * TupleOperatorBlock(sortOp);

    // setup info for combiner operator
    // The combiner pivots on the combine columns and uses the input block's schema.
    jsonForCombine = JsonUtils.createObjectNode();
    ((ObjectNode) jsonForCombine).put("pivotBy", combineColumns);
    ((ObjectNode) jsonForCombine).put("schema", inputSchema.toJson());
    combinedBlock = new TupleOperatorBlock(combineOp, null);

    // setup info for generate operator
    jsonForGenerate = JsonUtils.createObjectNode();
Esempio n. 7
  /**
   * Copies the data of {@code numBlocks} consecutive blocks, starting at {@code blockId}, out of
   * the given rubix files into the file named by {@code output}, then starts writing a trailer
   * for the extracted file.
   *
   * <p>For each wanted block id the key data of every file is scanned for a matching entry; the
   * block's bytes are streamed into the output and an entry for it is appended to an in-memory
   * key section. The metadata of the last file that supplied a block is cloned and its
   * "numberOfBlocks" is set to {@code foundBlocks}.
   *
   * <p>NOTE(review): this listing has lost many lines during extraction (closing braces, the
   * bodies of several {@code if} statements, the head of a print call, the right-hand side of
   * the key serializer assignment, and the end of the method), so it does not compile as shown
   * and the comments below describe only what is visible.
   *
   * @param rfiles rubix files to search
   * @param blockId id of the first block to extract
   * @param numBlocks number of consecutive block ids to extract
   * @param output path of the local output file
   * @throws IOException declared by the signature; presumably from file-system access
   * @throws ClassNotFoundException declared by the signature; presumably from getKeyData()
   * @throws InstantiationException declared by the signature; no source is visible here
   * @throws IllegalAccessException declared by the signature; no source is visible here
   */
  private static void extract(
      List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output)
      throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    Configuration conf = new JobConf();
    File outFile = new File(output);
    // NOTE(review): the body of this branch is missing from the listing.
    if (outFile.exists()) {
    // NOTE(review): no close() of this stream is visible in the listing -- confirm the original
    // closes it (ideally via try/finally).
    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
    // The key section is accumulated in memory and turned into the trailer bytes further down.
    ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
    DataOutput keySectionOut = new DataOutputStream(keySectionStream);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    RubixFile<Tuple, Object> lastrFile = null;
    JsonNode json;
    // Running byte count of block data written so far; also each block's position in the output.
    long totalLength = 0;

    final int BUF_SIZE = 32 * 1024;
    long blockIds[] = new long[numBlocks];
    int foundBlocks = 0;

    // Wanted ids are blockId, blockId + 1, ..., blockId + numBlocks - 1.
    for (int i = 0; i < numBlocks; i++) blockIds[i] = blockId + i;

    for (int i = 0; i < numBlocks; i++) {
      boolean found = false;
      for (RubixFile<Tuple, Object> rfile : rfiles) {
        print.f("Checking %s", rfile.path.toString());
        List<KeyData<Tuple>> keyDataList = rfile.getKeyData();
        for (KeyData<Tuple> keyData : keyDataList) {
          if (keyData.getBlockId() == blockIds[i]) {
            long offset = keyData.getOffset();
            long length = keyData.getLength();
            Tuple key = keyData.getKey();
            // NOTE(review): the head of the call that prints this message is missing.
                "Extracting block %d (off=%d len=%d) from %s",
                keyData.getBlockId(), offset, length, rfile.path.toString());

            // copy the data
            if (length > 0) {
              FileSystem fs = FileSystem.get(conf);
              // NOTE(review): no seek to `offset` and no close() of this stream are visible in
              // the listing -- confirm against the original file.
              FSDataInputStream in = fs.open(rfile.path);

              // Stream the block through a fixed-size buffer, BUF_SIZE bytes at a time.
              byte[] data = new byte[BUF_SIZE];
              long toRead = length;
              while (toRead > 0) {
                int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
                in.readFully(data, 0, thisRead);
                bos.write(data, 0, thisRead);
                toRead -= thisRead;
            // copy the key section
            // NOTE(review): the right-hand side of this assignment is missing from the listing.
            Serializer<Tuple> keySerializer =

            keySectionOut.writeLong(totalLength); // position
            totalLength += length;
            lastrFile = rfile;

            found = true;
        // NOTE(review): the body of this branch is missing; presumably it counts the block in
        // foundBlocks and leaves the search loop -- confirm.
        if (found) {
      if (!found) System.err.println("Cannot locate block with id " + blockIds[i]);
    byte[] trailerBytes = keySectionStream.toByteArray();

    // NOTE(review): lastrFile is only assigned when a block matched; if none did, the next
    // line dereferences null.
    json = JsonUtils.cloneNode(lastrFile.metadataJson);
    ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

    DataOutput out = new DataOutputStream(bos);
    out.writeLong(totalLength); // trailer start offset
Esempio n. 8
  /**
   * Loads, caches and returns the key data entries stored in the trailer of this file.
   *
   * <p>On the first call the file is opened and the trailer start position is read from the last
   * eight bytes. The metadata JSON and the key section size are then read, and the key section
   * is decoded into {@code KeyData} entries (key, offset, block id, number of records). Entry
   * lengths are then derived: each entry's length is the distance to the next entry's offset,
   * and the last entry's length is the distance to the trailer start. Later calls return the
   * cached list.
   *
   * <p>As a side effect this method assigns {@code metadataJson}, {@code keyClass} and
   * {@code valueClass}.
   *
   * <p>NOTE(review): this listing appears to have lost lines during extraction (closing braces
   * and the statements on the blank lines after the trailer position is read and after the
   * deserializer is obtained), so it does not compile as shown.
   *
   * @return the list of key data entries, in the order they appear in the key section
   * @throws IOException declared by the signature; presumably from file-system access
   * @throws ClassNotFoundException declared by the signature; presumably from resolving the key
   *     and value classes
   */
  public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException {
    // Already loaded: return the cached list.
    if (keyData != null) return keyData;

    final FileSystem fs = FileSystem.get(conf);
    // NOTE(review): the field is assigned before loading completes, so an exception further
    // down leaves a partially filled list cached for later calls.
    keyData = new ArrayList<KeyData<K>>();

    final long filesize = fs.getFileStatus(path).getLen();
    // NOTE(review): no close() of this stream is visible in the listing.
    FSDataInputStream in = fs.open(path);

    /* The last long in the file is the start position of the trailer section */
    in.seek(filesize - 8);
    long metaDataStartPos = in.readLong();

    // NOTE(review): no seek to metaDataStartPos is visible before the reads below, although
    // the stream is at end-of-file after the readLong above; presumably lost in extraction.

    ObjectMapper mapper = new ObjectMapper();
    metadataJson = mapper.readValue(in.readUTF(), JsonNode.class);

    int keySectionSize = in.readInt();

    // load the key section
    byte[] keySection = new byte[keySectionSize];

    // The key section occupies the keySectionSize bytes just before the final 8-byte long.
    in.seek(filesize - keySectionSize - 8);
    // NOTE(review): read() may return fewer bytes than requested and its result is ignored;
    // readFully() would guarantee the whole section is loaded.
    in.read(keySection, 0, keySectionSize);

    ByteArrayInputStream bis = new ByteArrayInputStream(keySection);
    DataInput dataInput = new DataInputStream(bis);

    int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue();

    // load the key section
    // Unchecked casts: the class names come from the file's metadata.
    keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass"));
    valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass"));

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass);

    // NOTE(review): no deserializer.open(...) call is visible; presumably lost in extraction.

    // NOTE(review): numberOfBlocks is not decremented in the visible loop body, so as shown
    // only bis.available() ends the loop.
    while (bis.available() > 0 && numberOfBlocks > 0) {
      K key = deserializer.deserialize(null);

      long offset = dataInput.readLong();
      long blockId = dataInput.readLong();
      long numRecords = dataInput.readLong();

      // The third constructor argument is 0 here; setLength() is called on each entry below.
      keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId));

    // Assign length to each keydata entry
    int numEntries = keyData.size();
    // Each entry ends where the next one starts.
    for (int i = 1; i < numEntries; i++) {
      KeyData<K> prev = keyData.get(i - 1);
      KeyData<K> current = keyData.get(i);

      prev.setLength(current.getOffset() - prev.getOffset());

    // The last entry ends where the trailer section starts.
    if (numEntries > 0) {
      KeyData<K> last = keyData.get(numEntries - 1);
      last.setLength(metaDataStartPos - last.offset);

    return keyData;
Esempio n. 9
  /**
   * Returns the value stored under the "BlockgenId" key of the file metadata, or {@code null}
   * when the metadata has no such key.
   *
   * <p>Calls {@code getKeyData()} first when {@code keyData} has not been loaded yet.
   *
   * @return the blockgen id text, or {@code null} if absent
   * @throws IOException declared by the signature; presumably propagated from getKeyData()
   * @throws ClassNotFoundException declared by the signature; presumably propagated from
   *     getKeyData()
   */
  public String getBlockgenId() throws IOException, ClassNotFoundException {
    // presumably getKeyData() also populates metadataJson -- verify against its definition
    if (keyData == null) getKeyData();

    if (!metadataJson.has("BlockgenId")) return null;
    return JsonUtils.getText(metadataJson, "BlockgenId");
Esempio n. 10
  /**
   * Returns the "sortKeys" entry of the file metadata as a string array.
   *
   * <p>Calls {@code getKeyData()} first when {@code keyData} has not been loaded yet.
   *
   * @return the sort keys as produced by {@code JsonUtils.asArray}
   * @throws IOException declared by the signature; presumably propagated from getKeyData()
   * @throws ClassNotFoundException declared by the signature; presumably propagated from
   *     getKeyData()
   */
  public String[] getSortKeys() throws IOException, ClassNotFoundException {
    // presumably getKeyData() also populates metadataJson -- verify against its definition
    if (keyData == null) getKeyData();

    // NOTE(review): unlike getBlockgenId(), there is no has("sortKeys") check here; how
    // JsonUtils.asArray treats a missing (null) node is not visible -- confirm.
    return JsonUtils.asArray(metadataJson.get("sortKeys"));