/**
 * Emits one record per input pair: a tuple holding the key text followed by the value text,
 * written together with the shared {@code ONE} constant.
 *
 * @param key text that becomes the first tuple entry
 * @param value text that becomes the second tuple entry
 * @param context job context that receives the emitted record
 */
 @Override
 protected void map(Text key, Text value, Context context)
     throws IOException, InterruptedException {
   String first = key.toString();
   String second = value.toString();

   StringTuple pair = new StringTuple(first);
   pair.add(second);

   context.write(pair, ONE);
 }
 /**
  * Merges the entries of every tuple received for {@code key} into a single duplicate-free
  * tuple and writes it out under the same key.
  *
  * @param key key shared by all incoming tuples; reused unchanged as the output key
  * @param values tuples whose entries are unioned
  * @param context job context that receives the merged record
  */
 @Override
 protected void reduce(Text key, Iterable<StringTuple> values, Context context)
     throws IOException, InterruptedException {
   // A hash set drops duplicate entries; it does not keep any particular entry order.
   Set<String> distinctEntries = new HashSet<String>();
   for (StringTuple tuple : values) {
     distinctEntries.addAll(tuple.getEntries());
   }

   StringTuple merged = new StringTuple(distinctEntries);
   context.write(key, merged);
 }
  /**
   * Classifies one document and emits a (correct label, classified label) observation.
   *
   * <p>The value text is broken into n-grams of size {@code gramSize}, the n-grams are handed to
   * {@code classifier} together with {@code defaultCategory}, and a tuple made of
   * {@code BayesConstants.CLASSIFIER_TUPLE}, the key's label and the classifier's label is
   * collected with the shared {@code ONE} constant.
   *
   * @param key the label the document is expected to have
   * @param value the document text to classify
   * @param output collector that receives the emitted tuple
   * @param reporter not used by this method
   * @throws IOException if collecting fails, or wrapping an {@link InvalidDatastoreException}
   *     raised while classifying
   */
  @Override
  public void map(
      Text key, Text value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
      throws IOException {
    NGrams generator = new NGrams(value.toString(), gramSize);
    List<String> ngrams = generator.generateNGramsWithoutLabel();
    String[] document = ngrams.toArray(new String[ngrams.size()]);

    try {
      ClassifierResult result = classifier.classifyDocument(document, defaultCategory);

      // Tuple layout: marker, correct label, classified label.
      StringTuple observation = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
      observation.add(key.toString());
      observation.add(result.getLabel());

      output.collect(observation, ONE);
    } catch (InvalidDatastoreException datastoreFailure) {
      throw new IOException(datastoreFailure);
    }
  }
  /**
   * Runs {@code VectorDistanceMapper} on the vector (2, 2) with key 123 against two named seed
   * vectors and expects one write per seed: ("foo", "123") with distance sqrt(2) for seed (1, 1)
   * and ("foo2", "123") with distance 1 for seed (2, 1), using the Euclidean measure.
   */
  @Test
  public void testVectorDistanceMapper() throws Exception {
    Mapper<WritableComparable<?>, VectorWritable, StringTuple, DoubleWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);

    // Record phase: the two writes the mapper is expected to perform.
    StringTuple expectedFooKey = new StringTuple();
    expectedFooKey.add("foo");
    expectedFooKey.add("123");
    context.write(expectedFooKey, new DoubleWritable(Math.sqrt(2.0)));

    StringTuple expectedFoo2Key = new StringTuple();
    expectedFoo2Key.add("foo2");
    expectedFoo2Key.add("123");
    context.write(expectedFoo2Key, new DoubleWritable(1));

    EasyMock.replay(context);

    // Input vector (2, 2).
    Vector input = new RandomAccessSparseVector(2);
    input.set(0, 2);
    input.set(1, 2);

    // Seed "foo" = (1, 1), seed "foo2" = (2, 1).
    Vector fooSeed = new RandomAccessSparseVector(2);
    fooSeed.set(0, 1);
    fooSeed.set(1, 1);
    Vector foo2Seed = new RandomAccessSparseVector(2);
    foo2Seed.set(0, 2);
    foo2Seed.set(1, 1);

    List<NamedVector> seeds = new ArrayList<NamedVector>();
    seeds.add(new NamedVector(fooSeed, "foo"));
    seeds.add(new NamedVector(foo2Seed, "foo2"));

    // Inject the mapper's collaborators directly instead of going through job setup.
    VectorDistanceMapper mapper = new VectorDistanceMapper();
    setField(mapper, "measure", new EuclideanDistanceMeasure());
    setField(mapper, "seedVectors", seeds);

    mapper.map(new IntWritable(123), new VectorWritable(input), context);

    EasyMock.verify(context);
  }
Пример #5
0
  /**
   * Reads classifier output sequence files and assembles them into a {@link ConfusionMatrix}.
   *
   * <p>Every file matching {@code pathPattern} is read as (StringTuple, DoubleWritable) records.
   * Entry 1 of each key is taken as the correct label and entry 2 as the classified label; the
   * value, truncated to an int, is the count for that pair. If the same pair appears more than
   * once, the last count read wins.
   *
   * @param fs file system holding the output files
   * @param pathPattern glob pattern selecting the files to read
   * @param conf configuration used to open each sequence file
   * @param params job parameters; the {@code "defaultCat"} entry supplies the default label
   * @return a confusion matrix whose labels are the correct labels found in the files
   * @throws IOException if a file cannot be opened, read or closed
   */
  public static ConfusionMatrix readResult(
      FileSystem fs, Path pathPattern, Configuration conf, Parameters params) throws IOException {

    StringTuple key = new StringTuple();
    DoubleWritable value = new DoubleWritable();
    String defaultLabel = params.get("defaultCat");
    // NOTE(review): globStatus is documented to return null for a non-glob path that does not
    // exist, which would fail in the loop below - confirm callers always pass a valid pattern.
    FileStatus[] outputFiles = fs.globStatus(pathPattern);
    Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>();

    for (FileStatus fileStatus : outputFiles) {
      Path path = fileStatus.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      try {
        while (reader.next(key, value)) {
          String correctLabel = key.stringAt(1);
          String classifiedLabel = key.stringAt(2);
          Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel);
          if (rowMatrix == null) {
            rowMatrix = new HashMap<String, Integer>();
            confusionMatrix.put(correctLabel, rowMatrix);
          }
          // Counts are stored as doubles in the file; truncate to the integer count.
          int count = (int) value.get();
          rowMatrix.put(classifiedLabel, count);
        }
      } finally {
        // Release the underlying stream even when reading fails part-way through a file.
        reader.close();
      }
    }

    ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel);
    for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) {
      Map<String, Integer> rowMatrix = correctLabelSet.getValue();
      for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) {
        matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey());
        matrix.putCount(
            correctLabelSet.getKey(), classifiedLabelSet.getKey(), classifiedLabelSet.getValue());
      }
    }
    return matrix;
  }