/**
 * Pairs the key text with the value text in a single {@link StringTuple} and writes that
 * tuple to the context together with the {@code ONE} constant.
 *
 * @param key     text placed in the first tuple slot
 * @param value   text placed in the second tuple slot
 * @param context Hadoop context that receives the (tuple, ONE) record
 */
@Override
protected void map(Text key, Text value, Context context)
  throws IOException, InterruptedException {
  String first = key.toString();
  String second = value.toString();

  StringTuple pair = new StringTuple(first);
  pair.add(second);

  context.write(pair, ONE);
}
/**
 * Merges the entries of every incoming tuple into one duplicate-free collection and writes
 * it out under the unchanged key.
 *
 * @param key     key shared by all incoming tuples; emitted as-is
 * @param values  tuples whose entries are merged
 * @param context Hadoop context that receives the (key, merged tuple) record
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
  throws IOException, InterruptedException {
  // A HashSet drops duplicate entries; it gives no ordering guarantee.
  Set<String> distinctEntries = new HashSet<String>();
  for (StringTuple tuple : values) {
    for (String entry : tuple.getEntries()) {
      distinctEntries.add(entry);
    }
  }

  StringTuple merged = new StringTuple(distinctEntries);
  context.write(key, merged);
}
/**
 * Classifies one document and emits a (correct label, classified label) record.
 *
 * <p>The value text is turned into n-grams of size {@code gramSize}, the n-grams are handed to
 * the classifier, and a tuple of {@code BayesConstants.CLASSIFIER_TUPLE}, the correct label and
 * the classified label is collected together with the {@code ONE} constant.
 *
 * @param key      the correct label of the document
 * @param value    the document text from which the n-grams are generated
 * @param output   the OutputCollector that receives the (tuple, ONE) record
 * @param reporter Hadoop status reporter; not used by this method
 * @throws IOException if collecting fails, or wrapping an {@code InvalidDatastoreException}
 *                     raised inside the classification block
 */
@Override
public void map(Text key,
                Text value,
                OutputCollector<StringTuple, DoubleWritable> output,
                Reporter reporter) throws IOException {
  List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
  String[] document = ngrams.toArray(new String[ngrams.size()]);
  String correctLabel = key.toString();

  try {
    ClassifierResult result = classifier.classifyDocument(document, defaultCategory);

    StringTuple labels = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
    labels.add(correctLabel);
    labels.add(result.getLabel());

    output.collect(labels, ONE);
  } catch (InvalidDatastoreException e) {
    // Surface datastore problems as an IOException, keeping the original cause.
    throw new IOException(e);
  }
}
/**
 * Runs {@code VectorDistanceMapper} with a Euclidean measure over the vector (2, 2) and two
 * named seeds, and verifies that one record per seed is written: the tuple
 * (seed name, "123") paired with the distance between the seed and the input vector.
 */
@Test
public void testVectorDistanceMapper() throws Exception {
  Mapper<WritableComparable<?>, VectorWritable, StringTuple, DoubleWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);

  // Expected output for seed "foo" = (1, 1): distance to (2, 2) is sqrt(2).
  StringTuple expectedFoo = new StringTuple();
  expectedFoo.add("foo");
  expectedFoo.add("123");
  context.write(expectedFoo, new DoubleWritable(Math.sqrt(2.0)));

  // Expected output for seed "foo2" = (2, 1): distance to (2, 2) is 1.
  StringTuple expectedFoo2 = new StringTuple();
  expectedFoo2.add("foo2");
  expectedFoo2.add("123");
  context.write(expectedFoo2, new DoubleWritable(1));

  EasyMock.replay(context);

  // Input vector handed to the mapper.
  Vector input = new RandomAccessSparseVector(2);
  input.set(0, 2);
  input.set(1, 2);

  // Seed vectors the input is measured against.
  Vector fooSeed = new RandomAccessSparseVector(2);
  fooSeed.set(0, 1);
  fooSeed.set(1, 1);
  Vector foo2Seed = new RandomAccessSparseVector(2);
  foo2Seed.set(0, 2);
  foo2Seed.set(1, 1);

  List<NamedVector> seeds = new ArrayList<NamedVector>();
  seeds.add(new NamedVector(fooSeed, "foo"));
  seeds.add(new NamedVector(foo2Seed, "foo2"));

  // The mapper's fields are set directly through the setField helper.
  VectorDistanceMapper mapper = new VectorDistanceMapper();
  setField(mapper, "measure", new EuclideanDistanceMeasure());
  setField(mapper, "seedVectors", seeds);

  mapper.map(new IntWritable(123), new VectorWritable(input), context);

  EasyMock.verify(context);
}
/**
 * Reads classification results from the sequence files matching {@code pathPattern} and
 * assembles them into a {@link ConfusionMatrix}.
 *
 * <p>Each record's key is read as a tuple whose entry 1 is the correct label and entry 2 is the
 * classified label; the record's value, truncated to an {@code int}, is stored as the count for
 * that (correct, classified) pair. If the same pair is read more than once, the last count
 * read replaces the earlier one.
 *
 * @param fs          file system used to resolve the pattern and open the files
 * @param pathPattern pattern selecting the result files to read
 * @param conf        Hadoop configuration passed to the sequence-file reader
 * @param params      parameters; {@code "defaultCat"} supplies the matrix's default label
 * @return a confusion matrix over every correct label found in the files
 * @throws IOException if a file cannot be opened, read or closed
 */
public static ConfusionMatrix readResult(FileSystem fs,
                                         Path pathPattern,
                                         Configuration conf,
                                         Parameters params) throws IOException {
  StringTuple key = new StringTuple();
  DoubleWritable value = new DoubleWritable();
  String defaultLabel = params.get("defaultCat");

  // NOTE(review): Hadoop documents that globStatus may return null for a non-glob path that
  // does not exist; this method assumes at least a (possibly empty) array - confirm callers.
  FileStatus[] outputFiles = fs.globStatus(pathPattern);

  // correct label -> (classified label -> count)
  Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

  for (FileStatus fileStatus : outputFiles) {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
    try {
      while (reader.next(key, value)) {
        String correctLabel = key.stringAt(1);
        String classifiedLabel = key.stringAt(2);

        Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
        if (rowMatrix == null) {
          rowMatrix = new HashMap<String, Integer>();
          confusionMatrix.put(correctLabel, rowMatrix);
        }
        // The narrowing cast truncates exactly as Double.intValue() does.
        rowMatrix.put(classifiedLabel, (int) value.get());
      }
    } finally {
      // Release the file handle even when reading fails part-way through.
      reader.close();
    }
  }

  ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
  for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
    String correctLabel = correctLabelSet.getKey();
    for (Map.Entry<String, Integer> classifiedLabelSet : correctLabelSet.getValue().entrySet()) {
      String classifiedLabel = classifiedLabelSet.getKey();
      matrix.addInstance(correctLabel, classifiedLabel);
      matrix.putCount(correctLabel, classifiedLabel, classifiedLabelSet.getValue());
    }
  }
  return matrix;
}