protected static void writeLabelBindings(Writer writer, ARFFModel arffModel, String delimiter) throws IOException { Map<String, Integer> labels = arffModel.getLabelBindings(); writer.write("Label bindings for Relation " + arffModel.getRelation() + '\n'); for (Map.Entry<String, Integer> entry : labels.entrySet()) { writer.write(entry.getKey()); writer.write(delimiter); writer.write(String.valueOf(entry.getValue())); writer.write('\n'); } writer.write('\n'); writer.write("Values for nominal attributes\n"); // emit allowed values for NOMINAL/categorical/enumerated attributes Map<String, Map<String, Integer>> nominalMap = arffModel.getNominalMap(); // how many nominal attributes writer.write(String.valueOf(nominalMap.size()) + "\n"); for (Entry<String, Map<String, Integer>> entry : nominalMap.entrySet()) { // the label of this attribute writer.write(entry.getKey() + "\n"); Set<Entry<String, Integer>> attributeValues = entry.getValue().entrySet(); // how many values does this attribute have writer.write(attributeValues.size() + "\n"); for (Map.Entry<String, Integer> value : attributeValues) { // the value and the value index writer.write( String.format("%s%s%s\n", value.getKey(), delimiter, value.getValue().toString())); } } }
/**
 * Converts a single input file to vectors and writes the accompanying dictionary.
 *
 * <p>A fresh {@link MapBackedARFFModel} is built from the words, word count (plus one) and
 * nominal map of {@code arffModel}; the file is then read through an
 * {@link ARFFVectorIterable} and the resulting vectors are written to
 * {@code outDir + '/' + file.getName() + ".mvc"} using the writer returned by
 * {@code getSeqFileWriter}. After the vectors are written, the label bindings of the fresh
 * model are written via the {@code writeLabelBindings} overload that takes the dictionary
 * file. The vector writer is always closed, and a failure to close it is propagated.
 *
 * @param outDir        directory (as a path string) that receives the {@code .mvc} output
 * @param file          input file to convert
 * @param maxDocs       limit passed through to {@code VectorWriter.write}
 * @param arffModel     model whose words, word count and nominal map seed the per-file model
 * @param dictOut       file that receives the label bindings
 * @param delimiter     delimiter forwarded to the label-bindings writer
 * @param jsonDictonary flag forwarded to the label-bindings writer
 *                      (presumably selects JSON output; confirm against that overload)
 * @throws IOException if reading the input or writing any output fails
 */
protected static void writeFile(
    String outDir,
    File file,
    long maxDocs,
    ARFFModel arffModel,
    File dictOut,
    String delimiter,
    boolean jsonDictonary)
    throws IOException {
  log.info("Converting File: {}", file);

  ARFFModel perFileModel =
      new MapBackedARFFModel(
          arffModel.getWords(), arffModel.getWordCount() + 1, arffModel.getNominalMap());
  Iterable<Vector> vectors = new ARFFVectorIterable(file, perFileModel);

  VectorWriter sequenceWriter = getSeqFileWriter(outDir + '/' + file.getName() + ".mvc");
  try {
    long written = sequenceWriter.write(vectors, maxDocs);
    // The dictionary is emitted only after the vectors have been written.
    writeLabelBindings(dictOut, perFileModel, delimiter, jsonDictonary);
    log.info("Wrote: {} vectors", written);
  } finally {
    // 'false': do not swallow an IOException raised while closing.
    Closeables.close(sequenceWriter, false);
  }
}
protected static void writeLabelBindingsJSON(Writer writer, ARFFModel arffModel) throws IOException { // Turn the map of labels into a list order by order of appearance List<Entry<String, Integer>> attributes = Lists.newArrayList(); attributes.addAll(arffModel.getLabelBindings().entrySet()); Collections.sort( attributes, new Comparator<Map.Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> t, Entry<String, Integer> t1) { return t.getValue().compareTo(t1.getValue()); } }); // write a map for each object List<Map<String, Object>> jsonObjects = Lists.newLinkedList(); for (int i = 0; i < attributes.size(); i++) { Entry<String, Integer> modelRepresentation = attributes.get(i); Map<String, Object> jsonRepresentation = Maps.newHashMap(); jsonObjects.add(jsonRepresentation); // the last one is the class label jsonRepresentation.put( "label", i < (attributes.size() - 1) ? String.valueOf(false) : String.valueOf(true)); String attribute = modelRepresentation.getKey(); jsonRepresentation.put("attribute", attribute); Map<String, Integer> nominalValues = arffModel.getNominalMap().get(attribute); if (nominalValues != null) { String[] values = nominalValues.keySet().toArray(new String[1]); jsonRepresentation.put("values", values); jsonRepresentation.put("type", "categorical"); } else { jsonRepresentation.put("type", "numerical"); } } writer.write(OBJECT_MAPPER.writeValueAsString(jsonObjects)); }