@Override
public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json)
        throws PreconditionException
{
    String inputBlockName = JsonUtils.getText(json, "input");
    PostCondition inputCondition = preConditions.get(inputBlockName);
    BlockSchema inputSchema = inputCondition.getSchema();

    Map<String, CodeDictionary> dictionaryMap = new HashMap<String, CodeDictionary>();
    if (json.has("columns"))
    {
        String[] columns = JsonUtils.asArray(json, "columns");
        for (String column : columns)
            dictionaryMap.put(column, new CodeDictionary());
    }
    else
    {
        // this is an inline dictionary
        JsonNode dictionary = json.get("dictionary");
        Iterator<String> nameIterator = dictionary.getFieldNames();
        while (nameIterator.hasNext())
        {
            String name = nameIterator.next();
            ArrayNode values = (ArrayNode) dictionary.get(name);
            CodeDictionary codeDictionary = new CodeDictionary();
            for (JsonNode value : values)
                codeDictionary.addKey(value.getTextValue());
            dictionaryMap.put(name, codeDictionary);
        }
    }

    int numColumns = inputSchema.getNumColumns();
    ColumnType[] columnTypes = new ColumnType[numColumns];
    for (int i = 0; i < columnTypes.length; i++)
    {
        final String name = inputSchema.getName(i);
        ColumnType type;
        if (dictionaryMap.containsKey(name))
        {
            // this column is decoded: transform the schema type to STRING
            type = new ColumnType(name, DataType.STRING);
        }
        else
        {
            // this column is not decoded: reuse the input schema type
            type = inputSchema.getColumnType(i);
        }
        columnTypes[i] = type;
    }

    BlockSchema schema = new BlockSchema(columnTypes);
    return new PostCondition(schema,
                             inputCondition.getPartitionKeys(),
                             inputCondition.getSortKeys());
}
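For reference, a minimal sketch of the two configuration shapes this method distinguishes, built with the same Jackson 1.x (org.codehaus.jackson) API used above; the relation and column names are hypothetical:

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

public class DictionaryConditionSketch
{
    public static void main(String[] args) throws Exception
    {
        ObjectMapper mapper = new ObjectMapper();

        // Variant 1: "columns" lists the encoded columns; empty dictionaries are
        // created for them, so each listed column is retyped to STRING.
        JsonNode columnsVariant = mapper.readTree(
                "{\"input\": \"data\", \"columns\": [\"country\", \"browser\"]}");

        // Variant 2: an inline "dictionary" maps each encoded column to the
        // string keys it encodes; only these columns are retyped.
        JsonNode inlineVariant = mapper.readTree(
                "{\"input\": \"data\", \"dictionary\": {\"country\": [\"US\", \"GB\", \"IN\"]}}");

        System.out.println(columnsVariant);
        System.out.println(inlineVariant);
    }
}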
public BlockSerializationType getBlockSerializationType() throws IOException,
        ClassNotFoundException
{
    if (keyData == null)
        getKeyData();

    if (!metadataJson.has("serializationType"))
        return BlockSerializationType.DEFAULT;

    return BlockSerializationType.valueOf(JsonUtils.getText(metadataJson, "serializationType"));
}
private ArrayNode createJsonForGenerate(Object vectorIdentifier)
{
    ArrayNode outputTupleJson = JsonUtils.createArrayNode();

    // + First duplicate the existing schema
    for (String s : inputBlock.getProperties().getSchema().getColumnNames())
        outputTupleJson.add(RewriteUtils.createProjectionExpressionNode(s, s));

    // + Add the new generated column
    JsonNode constNode;
    if (vectorIdentifier instanceof String)
        constNode = RewriteUtils.createStringConstant((String) vectorIdentifier);
    else
        constNode = RewriteUtils.createIntegerConstant((Integer) vectorIdentifier);

    String outColName = metaRelationName + "___" + identifierColumnName;
    outputTupleJson.add(JsonUtils.createObjectNode("col_name", outColName,
                                                   "expression", constNode));

    return outputTupleJson;
}
private Block generateVectorBlock(Tuple metaDataTuple) throws ExecException,
        IOException,
        InterruptedException
{
    Map<String, Block> inputBlocksToCombiner = new HashMap<String, Block>();
    for (int i : coordinateColumnIndexes)
    {
        Object coordinate = metaDataTuple.get(i);
        Block coordBlock = createCoordinateBlock(coordinate);
        if (coordBlock == null)
            continue;
        inputBlocksToCombiner.put(coordinate.toString(), coordBlock);
    }

    // No data for this vector -- proceed to the next one.
    if (inputBlocksToCombiner.size() == 0)
        return this.next();

    if (inputBlocksToCombiner.size() != coordinateColumnIndexes.length)
    {
        System.out.println("CollateVectorBlock: found fewer input blocks than number of coordinates");
        return this.next();
    }

    // Combine the individual blocks
    Object vectorIdentifier = metaDataTuple.get(identifierColumnIndex);
    if (!(vectorIdentifier instanceof Integer || vectorIdentifier instanceof String))
        throw new RuntimeException("Unexpected data-type for identifier column");

    Block combinedBlock = createCombinedBlock(inputBlocksToCombiner);

    /*
     * // Prepare input args for sort operator
     * inputSorter.clear();
     * inputSorter.put("combined_block", combinedBlock);
     *
     * // Setup sort operator object
     * sortOp.setInput(inputSorter, jsonForSort);
     */

    // Prepare input arguments for the generate operator
    ArrayNode outputTupleJson = createJsonForGenerate(vectorIdentifier);
    JsonNode thisGenJson = JsonUtils.cloneNode(jsonForGenerate);
    ((ObjectNode) thisGenJson).put("outputTuple", outputTupleJson);

    inputGenerator.clear();
    inputGenerator.put("combined_block", combinedBlock);

    // Setup the generate operator object
    genOp = new GenerateOperator();
    genOp.setInput(inputGenerator, thisGenJson, null);

    // Return a tuple operator block that wraps this generate op
    generatedBlock = new TupleOperatorBlock(genOp, null);

    // TODO: generatedBlock.setProperty("identifierColumn", vectorIdentifier);
    // System.out.println("CollateVectorBlock: finished setInput");
    return generatedBlock;
}
@Override
public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props)
        throws IOException, InterruptedException
{
    // Get the dictionary
    Map<String, CodeDictionary> dictionaryMap = null;
    if (json.has("path"))
    {
        // load the dictionary from file
        String dictionaryName = json.get("path").getTextValue();
        String dictionaryPath = FileCache.get(dictionaryName);
        dictionaryPath = dictionaryPath + "/part-r-00000.avro";
        dictionaryMap = GenerateDictionary.loadDictionary(dictionaryPath, false, null);
    }
    else
    {
        // this is an inline dictionary
        JsonNode dictionary = json.get("dictionary");
        Iterator<String> nameIterator = dictionary.getFieldNames();
        dictionaryMap = new HashMap<String, CodeDictionary>();
        while (nameIterator.hasNext())
        {
            String name = nameIterator.next();
            ArrayNode values = (ArrayNode) dictionary.get(name);
            CodeDictionary codeDictionary = new CodeDictionary();
            for (JsonNode value : values)
                codeDictionary.addKey(value.getTextValue());
            dictionaryMap.put(name, codeDictionary);
        }
    }

    dataBlock = input.values().iterator().next();
    BlockSchema inputSchema = dataBlock.getProperties().getSchema();
    numColumns = inputSchema.getNumColumns();
    decodedTuple = TupleFactory.getInstance().newTuple(numColumns);

    // create the dictionary array (null for columns that are not decoded)
    dictionaries = new CodeDictionary[numColumns];
    for (int i = 0; i < numColumns; i++)
    {
        String colName = inputSchema.getName(i);
        if (dictionaryMap.containsKey(colName))
            dictionaries[i] = dictionaryMap.get(colName);
        else
            dictionaries[i] = null;
    }

    if (json.has("replaceUnknownCodes"))
        replaceUnknownCodes = JsonUtils.getText(json, "replaceUnknownCodes");
}
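At runtime the dictionary may instead be file-backed; a sketch of that variant with a hypothetical dictionary name (FileCache.get resolves the name, and the operator appends /part-r-00000.avro to the resolved path):

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

public class FileBackedDictionarySketch
{
    public static void main(String[] args) throws Exception
    {
        ObjectMapper mapper = new ObjectMapper();

        // "path" names a dictionary produced by GenerateDictionary; the operator
        // loads <FileCache-resolved path>/part-r-00000.avro.
        JsonNode fileBacked = mapper.readTree("{\"path\": \"member_dictionary\"}");
        System.out.println(fileBacked);
    }
}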
@Override
public void setInput(Configuration conf, Map<String, Block> input, JsonNode json)
        throws IOException, InterruptedException
{
    // #1. input block
    inputBlock = (RubixMemoryBlock) input.get(JsonUtils.getText(json, "inputBlock"));

    // #2. lookup column
    String lookupColumn = json.get("lookupColumn").getTextValue();
    BlockSchema inputSchema = inputBlock.getProperties().getSchema();
    coord2offsets = BlockUtils.generateColumnIndex(inputBlock, lookupColumn);

    // #3. meta data relation name
    metaRelationName = JsonUtils.getText(json, "metaRelationName");
    matchingMetaBlock = input.get(metaRelationName);
    BlockSchema metaBlockSchema = matchingMetaBlock.getProperties().getSchema();

    // #4. find indexes of the coordinate column names in the meta relation's schema
    String[] coordinateColumns = JsonUtils.asArray(json.get("coordinateColumns"));
    coordinateColumnIndexes = new int[coordinateColumns.length];
    int idx = 0;
    for (String s : coordinateColumns)
        coordinateColumnIndexes[idx++] = metaBlockSchema.getIndex(s);

    // #5. find the index of the identifier column in the meta relation's schema
    identifierColumnName = JsonUtils.getText(json, "identifierColumn");
    identifierColumnIndex = metaBlockSchema.getIndex(identifierColumnName);

    // #6. combine columns
    ArrayNode combineColumns = (ArrayNode) json.get("combineColumns");

    // setup info for sort operator
    /*
     * jsonForSort = JsonUtils.cloneNode(json);
     * ((ObjectNode) jsonForSort).put("sortBy", combineColumns);
     * sortedBlock = new TupleOperatorBlock(sortOp);
     */

    // setup info for the combiner operator
    jsonForCombine = JsonUtils.createObjectNode();
    ((ObjectNode) jsonForCombine).put("pivotBy", combineColumns);
    ((ObjectNode) jsonForCombine).put("schema", inputSchema.toJson());
    combinedBlock = new TupleOperatorBlock(combineOp, null);

    // setup info for the generate operator
    jsonForGenerate = JsonUtils.createObjectNode();
}
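A sketch of the full operator configuration this setInput expects, with hypothetical relation and column names; the six keys mirror the numbered steps above:

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

public class CollateVectorConfigSketch
{
    public static void main(String[] args) throws Exception
    {
        ObjectMapper mapper = new ObjectMapper();
        JsonNode json = mapper.readTree(
                "{"
              + "\"inputBlock\": \"features\","          // #1 RubixMemoryBlock holding the data
              + "\"lookupColumn\": \"memberId\","        // #2 column used to build coord2offsets
              + "\"metaRelationName\": \"vectors\","     // #3 block listing one vector per tuple
              + "\"coordinateColumns\": [\"x\", \"y\"]," // #4 coordinates resolved per vector
              + "\"identifierColumn\": \"vectorId\","    // #5 identifier appended by generate
              + "\"combineColumns\": [\"memberId\"]"     // #6 pivot columns for the combiner
              + "}");
        System.out.println(json);
    }
}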
private static void extract(List<RubixFile<Tuple, Object>> rfiles,
                            long blockId,
                            int numBlocks,
                            String output) throws IOException,
        ClassNotFoundException,
        InstantiationException,
        IllegalAccessException
{
    Configuration conf = new JobConf();
    File outFile = new File(output);
    if (outFile.exists())
        outFile.delete();
    outFile.createNewFile();

    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
    ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
    DataOutput keySectionOut = new DataOutputStream(keySectionStream);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    RubixFile<Tuple, Object> lastrFile = null;
    JsonNode json;
    long totalLength = 0;

    final int BUF_SIZE = 32 * 1024;
    long[] blockIds = new long[numBlocks];
    int foundBlocks = 0;

    for (int i = 0; i < numBlocks; i++)
        blockIds[i] = blockId + i;

    for (int i = 0; i < numBlocks; i++)
    {
        boolean found = false;
        for (RubixFile<Tuple, Object> rfile : rfiles)
        {
            print.f("Checking %s", rfile.path.toString());
            List<KeyData<Tuple>> keyDataList = rfile.getKeyData();

            for (KeyData<Tuple> keyData : keyDataList)
            {
                if (keyData.getBlockId() == blockIds[i])
                {
                    long offset = keyData.getOffset();
                    long length = keyData.getLength();
                    Tuple key = keyData.getKey();
                    print.f("Extracting block %d (off=%d len=%d) from %s",
                            keyData.getBlockId(),
                            offset,
                            length,
                            rfile.path.toString());

                    // copy the data
                    if (length > 0)
                    {
                        FileSystem fs = FileSystem.get(conf);
                        FSDataInputStream in = fs.open(rfile.path);
                        in.seek(offset);

                        byte[] data = new byte[BUF_SIZE];
                        long toRead = length;
                        while (toRead > 0)
                        {
                            int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
                            in.readFully(data, 0, thisRead);
                            bos.write(data, 0, thisRead);
                            toRead -= thisRead;
                            System.out.print(".");
                        }
                        System.out.println();
                    }

                    // copy the key section
                    Serializer<Tuple> keySerializer =
                            serializationFactory.getSerializer(rfile.getKeyClass());
                    keySerializer.open(keySectionStream);
                    keySerializer.serialize(key);

                    keySectionOut.writeLong(totalLength); // position
                    keySectionOut.writeLong(keyData.getBlockId());
                    keySectionOut.writeLong(keyData.getNumRecords());

                    foundBlocks++;
                    totalLength += length;
                    lastrFile = rfile;

                    found = true;
                    break;
                }
            }

            if (found)
                break;
        }

        if (!found)
            System.err.println("Cannot locate block with id " + blockIds[i]);
    }

    // guard against writing a trailer when no block was found at all
    if (lastrFile == null)
    {
        bos.close();
        throw new IOException("None of the requested blocks were found");
    }

    // write the trailer: metadata JSON, key section, and the trailer start offset
    byte[] trailerBytes = keySectionStream.toByteArray();

    json = JsonUtils.cloneNode(lastrFile.metadataJson);
    ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

    DataOutput out = new DataOutputStream(bos);
    out.writeUTF(json.toString());
    out.writeInt(trailerBytes.length);
    out.write(trailerBytes);
    out.writeLong(totalLength); // trailer start offset
    bos.close();
}
@SuppressWarnings("unchecked") public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException { if (keyData != null) return keyData; final FileSystem fs = FileSystem.get(conf); keyData = new ArrayList<KeyData<K>>(); final long filesize = fs.getFileStatus(path).getLen(); FSDataInputStream in = fs.open(path); /* The last long in the file is the start position of the trailer section */ in.seek(filesize - 8); long metaDataStartPos = in.readLong(); in.seek(metaDataStartPos); ObjectMapper mapper = new ObjectMapper(); metadataJson = mapper.readValue(in.readUTF(), JsonNode.class); int keySectionSize = in.readInt(); // load the key section byte[] keySection = new byte[keySectionSize]; in.seek(filesize - keySectionSize - 8); in.read(keySection, 0, keySectionSize); in.close(); ByteArrayInputStream bis = new ByteArrayInputStream(keySection); DataInput dataInput = new DataInputStream(bis); int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue(); // load the key section keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass")); valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass")); SerializationFactory serializationFactory = new SerializationFactory(conf); Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass); deserializer.open(bis); while (bis.available() > 0 && numberOfBlocks > 0) { K key = deserializer.deserialize(null); long offset = dataInput.readLong(); long blockId = dataInput.readLong(); long numRecords = dataInput.readLong(); keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId)); numberOfBlocks--; } // Assign length to each keydata entry int numEntries = keyData.size(); for (int i = 1; i < numEntries; i++) { KeyData<K> prev = keyData.get(i - 1); KeyData<K> current = keyData.get(i); prev.setLength(current.getOffset() - prev.getOffset()); } if (numEntries > 0) { KeyData<K> last = keyData.get(numEntries - 1); last.setLength(metaDataStartPos - last.offset); } return keyData; }
public String getBlockgenId() throws IOException, ClassNotFoundException
{
    if (keyData == null)
        getKeyData();

    if (!metadataJson.has("BlockgenId"))
        return null;

    return JsonUtils.getText(metadataJson, "BlockgenId");
}
public String[] getSortKeys() throws IOException, ClassNotFoundException
{
    if (keyData == null)
        getKeyData();

    return JsonUtils.asArray(metadataJson.get("sortKeys"));
}
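A hypothetical usage sketch tying the metadata accessors together. The RubixFile(Configuration, Path) constructor is an assumption inferred from the conf and path fields used above; adjust to the actual API:

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.pig.data.Tuple;

// Cubert imports (RubixFile, KeyData) omitted; their package layout is assumed.
public class RubixMetadataSketch
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        RubixFile<Tuple, Object> rfile =
                new RubixFile<Tuple, Object>(conf, new Path("/data/example.rubix")); // hypothetical path

        // One KeyData entry per block: id, byte range, and record count.
        for (KeyData<Tuple> kd : rfile.getKeyData())
            System.out.printf("block %d: offset=%d length=%d records=%d%n",
                              kd.getBlockId(), kd.getOffset(), kd.getLength(), kd.getNumRecords());

        System.out.println("blockgen id:        " + rfile.getBlockgenId());
        System.out.println("sort keys:          " + Arrays.toString(rfile.getSortKeys()));
        System.out.println("serialization type: " + rfile.getBlockSerializationType());
    }
}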