/**
 * Updates the per-cluster "most distant point" bookkeeping with one clustered point.
 *
 * <p>The point's score is the sum of its distances (per {@code measure}) to every
 * representative point registered for the cluster. If no most distant point is recorded
 * for the cluster yet, or the recorded one has a strictly smaller weight than this score,
 * the entry is replaced by a new {@link WeightedVectorWritable} holding the score as its
 * weight and a clone of the point's vector.
 *
 * @param clusterId            id of the cluster the point belongs to; used as the map key
 * @param point                the point being examined
 * @param measure              distance measure used to compare the point with each
 *                             representative point
 * @param representativePoints representative points keyed by cluster id; a cluster without
 *                             an entry is treated as having no representative points
 * @param mostDistantPoints    map updated in place with the best candidate seen so far for
 *                             each cluster id
 */
public static void mapPoint(
    IntWritable clusterId,
    WeightedVectorWritable point,
    DistanceMeasure measure,
    Map<Integer, List<VectorWritable>> representativePoints,
    Map<Integer, WeightedVectorWritable> mostDistantPoints) {
  int key = clusterId.get();
  WeightedVectorWritable currentMDP = mostDistantPoints.get(key);

  List<VectorWritable> repPoints = representativePoints.get(key);
  double totalDistance = 0.0;
  // The lookup yields null for a cluster id that has no representative points registered;
  // iterating it directly would throw a NullPointerException, so score such points as 0.0.
  if (repPoints != null) {
    for (VectorWritable refPoint : repPoints) {
      totalDistance += measure.distance(refPoint.get(), point.getVector());
    }
  }

  // Strict '<' keeps the earlier candidate when two points tie on total distance.
  if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
    // Clone the vector so the stored entry does not alias the caller's point.
    mostDistantPoints.put(
        key, new WeightedVectorWritable(totalDistance, point.getVector().clone()));
  }
}
@Override protected void reduce(IntWritable key, Iterable<WeightedVectorWritable> values, Context context) throws IOException, InterruptedException { // find the most distant point WeightedVectorWritable mdp = null; for (WeightedVectorWritable dpw : values) { if (mdp == null || mdp.getWeight() < dpw.getWeight()) { mdp = new WeightedVectorWritable(dpw.getWeight(), dpw.getVector()); } } context.write(new IntWritable(key.get()), new VectorWritable(mdp.getVector())); }
/**
 * Emits one JSON text record per input point.
 *
 * <p>The record has two fields: {@code "a"}, set to the integer value of {@code key}, and
 * {@code "fP"}, set to the vector's name when the vector is a {@link NamedVector} and to the
 * empty string otherwise. The record is written with a {@link NullWritable} key. If building
 * the JSON object fails, the error is logged and nothing is written for this point.
 *
 * @param key     integer key of the input record
 * @param value   the weighted point whose vector name is reported
 * @param context job context that receives the JSON record
 * @throws IOException          if writing the output record fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
public void map(IntWritable key, WeightedVectorWritable value, Context context)
    throws IOException, InterruptedException {
  Vector vector = value.getVector();
  // Only named vectors carry a name; everything else is reported with an empty one.
  String pointName = vector instanceof NamedVector ? ((NamedVector) vector).getName() : "";

  JSONObject record = new JSONObject();
  try {
    record.put("a", key.get());
    record.put("fP", pointName);
    Text line = new Text(record.toString());
    context.write(NullWritable.get(), line);
  } catch (JSONException e) {
    LOG.error("Error while creating JSON record.", e);
  }
}
public void printClusters(String[] dictionary) throws IOException, InstantiationException, IllegalAccessException { Configuration conf = new Configuration(); if (this.termDictionary != null) { if ("text".equals(dictionaryFormat)) { dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary)); } else if ("sequencefile".equals(dictionaryFormat)) { FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf); dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary); } else { throw new IllegalArgumentException("Invalid dictionary format"); } } Writer writer = this.outputFile == null ? new OutputStreamWriter(System.out) : new FileWriter(this.outputFile); try { FileSystem fs = seqFileDir.getFileSystem(conf); for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) { Path path = seqFile.getPath(); // System.out.println("Input Path: " + path); doesn't this interfere with output? SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); try { Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance(); Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance(); while (reader.next(key, value)) { Cluster cluster = (Cluster) value; String fmtStr = useJSON ? 
cluster.asJsonString() : cluster.asFormatString(dictionary); if (subString > 0 && fmtStr.length() > subString) { writer.write(':'); writer.write(fmtStr, 0, Math.min(subString, fmtStr.length())); } else { writer.write(fmtStr); } writer.write('\n'); if (dictionary != null) { String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures); writer.write("\tTop Terms: "); writer.write(topTerms); writer.write('\n'); } List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId()); if (points != null) { writer.write("\tWeight: Point:\n\t"); for (Iterator<WeightedVectorWritable> iterator = points.iterator(); iterator.hasNext(); ) { WeightedVectorWritable point = iterator.next(); writer.write(String.valueOf(point.getWeight())); writer.write(": "); writer.write(AbstractCluster.formatVector(point.getVector(), dictionary)); if (iterator.hasNext()) { writer.write("\n\t"); } } writer.write('\n'); } } } finally { reader.close(); } } } finally { writer.close(); } }