/**
   * Updates the per-cluster "most distant point" table with the given point.
   *
   * <p>The point's score is the sum of its distances to every representative point registered for
   * the cluster. The point replaces the stored entry when the cluster has no entry yet or when the
   * stored entry's weight is strictly smaller than this score. The stored entry carries the score
   * as its weight and a clone of the point's vector.
   *
   * @param clusterId id of the cluster the point belongs to
   * @param point the candidate point
   * @param measure distance measure applied between each representative point and the candidate
   * @param representativePoints representative points keyed by cluster id; a missing entry for
   *     {@code clusterId} results in a {@link NullPointerException}
   * @param mostDistantPoints table updated in place, keyed by cluster id
   */
public static void mapPoint(
      IntWritable clusterId,
      WeightedVectorWritable point,
      DistanceMeasure measure,
      Map<Integer, List<VectorWritable>> representativePoints,
      Map<Integer, WeightedVectorWritable> mostDistantPoints) {
    final int clusterKey = clusterId.get();
    final WeightedVectorWritable farthestSoFar = mostDistantPoints.get(clusterKey);
    final List<VectorWritable> representatives = representativePoints.get(clusterKey);

    // Score the candidate: summed distance to all representative points of its cluster.
    double distanceSum = 0.0;
    for (VectorWritable representative : representatives) {
      distanceSum += measure.distance(representative.get(), point.getVector());
    }

    // Keep the comparison in its original "<" form so NaN scores behave exactly as before.
    final boolean replacesCurrent =
        farthestSoFar == null || farthestSoFar.getWeight() < distanceSum;
    if (!replacesCurrent) {
      return;
    }

    // Store a clone so the table does not alias the caller's vector.
    final WeightedVectorWritable replacement =
        new WeightedVectorWritable(distanceSum, point.getVector().clone());
    mostDistantPoints.put(clusterKey, replacement);
  }
 /**
  * Emits, for one key, the vector of the value with the largest weight.
  *
  * <p>Ties keep the first value seen, because a later value only wins when its weight is strictly
  * greater. The output key is a new {@code IntWritable} holding the same int as {@code key}.
  *
  * @param key the key shared by all {@code values}
  * @param values weighted vectors to scan; if empty, a {@link NullPointerException} is thrown
  * @param context sink for the single output record
  * @throws IOException propagated from {@code context.write}
  * @throws InterruptedException propagated from {@code context.write}
  */
 @Override
 protected void reduce(IntWritable key, Iterable<WeightedVectorWritable> values, Context context)
     throws IOException, InterruptedException {
   WeightedVectorWritable farthest = null;
   for (WeightedVectorWritable candidate : values) {
     boolean beatsCurrent = farthest == null || farthest.getWeight() < candidate.getWeight();
     if (beatsCurrent) {
       // Copy into a fresh holder instead of retaining the iterated element itself
       // (presumably because the framework may reuse value instances; confirm).
       farthest = new WeightedVectorWritable(candidate.getWeight(), candidate.getVector());
     }
   }

   IntWritable outKey = new IntWritable(key.get());
   VectorWritable outValue = new VectorWritable(farthest.getVector());
   context.write(outKey, outValue);
 }
  /**
   * Writes one JSON text record per input value.
   *
   * <p>The record has two fields: {@code "a"} holds the int value of {@code key}, and {@code "fP"}
   * holds the vector's name when the vector is a {@code NamedVector}, otherwise the empty string.
   * If building the JSON fails, the error is logged and no record is written for this input.
   *
   * @param key supplies the value of the {@code "a"} field
   * @param value supplies the vector whose name (if any) goes into the {@code "fP"} field
   * @param context sink for the output record, written with a {@code NullWritable} key
   * @throws IOException propagated from {@code context.write}
   * @throws InterruptedException propagated from {@code context.write}
   */
  @Override
  public void map(IntWritable key, WeightedVectorWritable value, Context context)
      throws IOException, InterruptedException {
    Vector vector = value.getVector();
    String pointName = vector instanceof NamedVector ? ((NamedVector) vector).getName() : "";

    try {
      JSONObject json = new JSONObject();
      json.put("a", key.get());
      json.put("fP", pointName);
      context.write(NullWritable.get(), new Text(json.toString()));
    } catch (JSONException e) {
      // Best effort: a record that cannot be serialized is logged and skipped.
      LOG.error("Error while creating JSON record.", e);
    }
  }
Example #4
0
  /**
   * Writes a textual dump of every cluster stored in the {@code part-*} sequence files under
   * {@code seqFileDir}.
   *
   * <p>Output goes to {@code outputFile} when that field is set and to {@code System.out}
   * otherwise. For each cluster the dump contains its formatted description (JSON when
   * {@code useJSON} is set; prefixed with {@code ':'} and truncated to {@code subString}
   * characters when {@code subString} is positive and shorter than the description), then the top
   * terms when a dictionary is available, then the points recorded for the cluster id in
   * {@code clusterIdToPoints}, if any.
   *
   * <p>When the dump goes to {@code System.out} the stream is flushed but left open, so the
   * process can keep printing afterwards.
   *
   * @param dictionary term dictionary used when formatting vectors; may be {@code null}. It is
   *     replaced by the dictionary loaded from {@code termDictionary} when that field is set.
   * @throws IOException if reading the sequence files or writing the dump fails
   * @throws InstantiationException if a key or value class cannot be instantiated
   * @throws IllegalAccessException if a key or value class constructor is not accessible
   * @throws IllegalArgumentException if {@code termDictionary} is set and {@code dictionaryFormat}
   *     is neither {@code "text"} nor {@code "sequencefile"}
   */
  public void printClusters(String[] dictionary)
      throws IOException, InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
      if ("text".equals(dictionaryFormat)) {
        dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
      } else if ("sequencefile".equals(dictionaryFormat)) {
        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
        dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
      } else {
        throw new IllegalArgumentException("Invalid dictionary format");
      }
    }

    // Only a writer this method opened may be closed. Closing a writer that wraps System.out
    // would close standard output for the rest of the process.
    boolean ownsWriter = this.outputFile != null;
    Writer writer =
        ownsWriter ? new FileWriter(this.outputFile) : new OutputStreamWriter(System.out);
    try {
      FileSystem fs = seqFileDir.getFileSystem(conf);
      for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
          // The reader fills these two instances in place on every call to next().
          Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
          Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
          while (reader.next(key, value)) {
            Cluster cluster = (Cluster) value;
            String fmtStr = useJSON ? cluster.asJsonString() : cluster.asFormatString(dictionary);
            if (subString > 0 && fmtStr.length() > subString) {
              // Truncated descriptions are marked with a leading ':'.
              writer.write(':');
              writer.write(fmtStr, 0, subString);
            } else {
              writer.write(fmtStr);
            }

            writer.write('\n');

            if (dictionary != null) {
              String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures);
              writer.write("\tTop Terms: ");
              writer.write(topTerms);
              writer.write('\n');
            }

            List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
            if (points != null) {
              writer.write("\tWeight:  Point:\n\t");
              Iterator<WeightedVectorWritable> iterator = points.iterator();
              while (iterator.hasNext()) {
                WeightedVectorWritable point = iterator.next();
                writer.write(String.valueOf(point.getWeight()));
                writer.write(": ");
                writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
                // Separator goes between points only, never after the last one.
                if (iterator.hasNext()) {
                  writer.write("\n\t");
                }
              }
              writer.write('\n');
            }
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (ownsWriter) {
        writer.close();
      } else {
        // Push buffered characters to stdout but leave the stream open.
        writer.flush();
      }
    }
  }