private void write(Object part, int id, Vector vector) throws IOException {
    SequenceFile.Writer writer = writers.get(part);
    if (writer == null) {
        // Lazily open one SequenceFile writer per part, under <storePath>/<part>/part-<partID>.
        Configuration conf = UDFContext.getUDFContext().getJobConf();
        Path file = PathUtils.enter(getStorePath(), String.valueOf(part), "part-" + Env.getPartID());
        writer = IOUtils.forSequenceWrite(conf, file, IntWritable.class, VectorWritable.class);
        writers.put(part, writer);
    }
    // Append the (id, vector) pair to the part's file, reusing the writable instances.
    keyWritable.set(id);
    valueWritable.set(vector);
    writer.append(keyWritable, valueWritable);
}
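For reference, each part file produced by write() is a plain Hadoop SequenceFile keyed by IntWritable with VectorWritable values, so it can be read back with the standard SequenceFile.Reader API. The small dump tool below is only an illustrative sketch; the class name and the command-line path argument are assumptions, not part of the store itself.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

// Sketch: prints the (id, vector) pairs stored in one part file.
public class PartFileDump {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path part = new Path(args[0]); // e.g. <storePath>/<part>/part-<partID> (hypothetical path)
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), part, conf);
        try {
            IntWritable id = new IntWritable();
            VectorWritable vec = new VectorWritable();
            while (reader.next(id, vec)) {
                Vector v = vec.get();
                System.out.println(id.get() + "\t" + v);
            }
        } finally {
            reader.close();
        }
    }
}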
public GroupedVectorStore() throws IOException {
    super(new NullOutputFormat());
    Env.inBackground(new Env.BackgroundProcedure() {
        @Override
        public void execute(Configuration conf) throws IOException {
            // Initialize per-task state on the backend: the vector generator configured via
            // checkSchema(), the per-part writer cache, and the reusable writables.
            tvgen = VectorUtils.createVectorGenerator(
                Env.getProperty(GroupedVectorStore.class, "vector.type"));
            writers = new HashMap<Object, SequenceFile.Writer>();
            keyWritable = new IntWritable();
            valueWritable = new VectorWritable();
        }
    });
}
@Override
public void checkSchema(ResourceSchema schema) throws IOException {
    SchemaUtils.claim(schema, 0, DataType.INTEGER, DataType.CHARARRAY,
        DataType.UNKNOWN); // the part name (usually an int, but any type is accepted)
    ResourceSchema.ResourceFieldSchema bag =
        SchemaUtils.claim(schema, 1, DataType.BAG); // the bag
    ResourceSchema.ResourceFieldSchema tuple =
        SchemaUtils.claim(bag, 0, DataType.TUPLE); // the tuple of (id, vector)
    SchemaUtils.claim(tuple, 0, DataType.INTEGER); // the id
    tuple = SchemaUtils.claim(tuple, 1, DataType.TUPLE); // the vector
    Env.setProperty(GroupedVectorStore.class, "vector.type",
        VectorUtils.typeOfVector(tuple.getSchema()));
}
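checkSchema() pins down the expected input layout: field 0 is the part key, field 1 is a bag of (id, vector) tuples, and the detected vector type is stashed in the job properties for the backend. The store's actual putNext() is not shown here; the sketch below only illustrates how such a method could walk that layout and hand each pair to write(). The tvgen.convert(...) call is a placeholder assumption standing in for whatever conversion the project's vector generator really exposes.

// Illustrative sketch only -- not the actual putNext() of GroupedVectorStore.
@Override
public void putNext(Tuple input) throws IOException {
    Object part = input.get(0);            // field 0: the part name / group key
    DataBag bag = (DataBag) input.get(1);  // field 1: bag of (id, vector) tuples
    for (Tuple entry : bag) {
        int id = (Integer) entry.get(0);   // inner field 0: the vector id
        // Inner field 1 holds the vector tuple; converting it to a Mahout Vector
        // via tvgen.convert(...) is an ASSUMED API, not the project's real call.
        Vector vector = tvgen.convert((Tuple) entry.get(1));
        write(part, id, vector);
    }
}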