Пример #1
0
 /**
  * Verifies the formatted classification vector produced by the classifier from
  * {@code newGaussianClassifier()} for the two probe points labelled "[0,0]" and "[2,2]".
  */
 @Test
 public void testGaussianClusterClassification() {
   ClusterClassifier gaussian = newGaussianClassifier();

   // Probe labelled "[0,0]": a DenseVector(2) with no values assigned.
   String atOrigin = AbstractCluster.formatVector(gaussian.classify(new DenseVector(2)), null);
   assertEquals("[0,0]", "[0.212, 0.576, 0.212]", atOrigin);

   // Probe labelled "[2,2]": a DenseVector(2) with every element assigned 2.
   String atTwos =
       AbstractCluster.formatVector(gaussian.classify(new DenseVector(2).assign(2)), null);
   assertEquals("[2,2]", "[0.952, 0.047, 0.000]", atTwos);
 }
Пример #2
0
 /**
  * Verifies the formatted classification vector produced by the classifier from
  * {@code newClusterClassifier()} for the two probe points labelled "[0,0]" and "[2,2]".
  */
 @Test
 public void testClusterClassification() {
   ClusterClassifier model = newClusterClassifier();

   // Probe labelled "[0,0]": a DenseVector(2) with no values assigned.
   String atOrigin = AbstractCluster.formatVector(model.classify(new DenseVector(2)), null);
   assertEquals("[0,0]", "[0.107, 0.787, 0.107]", atOrigin);

   // Probe labelled "[2,2]": a DenseVector(2) with every element assigned 2.
   String atTwos =
       AbstractCluster.formatVector(model.classify(new DenseVector(2).assign(2)), null);
   assertEquals("[2,2]", "[0.867, 0.117, 0.016]", atTwos);
 }
Пример #3
0
 /**
  * Verifies the formatted classification vector produced by the classifier from
  * {@code newSoftClusterClassifier()} for the two probe points labelled "[0,0]" and "[2,2]".
  */
 @Test
 public void testSoftClusterClassification() {
   ClusterClassifier soft = newSoftClusterClassifier();

   // Probe labelled "[0,0]": a DenseVector(2) with no values assigned.
   String atOrigin = AbstractCluster.formatVector(soft.classify(new DenseVector(2)), null);
   assertEquals("[0,0]", "[0.000, 1.000, 0.000]", atOrigin);

   // Probe labelled "[2,2]": a DenseVector(2) with every element assigned 2.
   String atTwos =
       AbstractCluster.formatVector(soft.classify(new DenseVector(2).assign(2)), null);
   assertEquals("[2,2]", "[0.735, 0.184, 0.082]", atTwos);
 }
Пример #4
0
 /**
  * Builds a classifier over three {@code Canopy} models that share one Manhattan distance
  * measure, then verifies the formatted classification vector for the probe points labelled
  * "[0,0]" and "[2,2]".
  */
 @Test
 public void testCanopyClassification() {
   DistanceMeasure manhattan = new ManhattanDistanceMeasure();

   // Three canopies: vectors filled with 1, left unassigned, and filled with -1, passed
   // together with the integers 0, 1 and 2 respectively.
   List<Cluster> canopies = Lists.newArrayList();
   canopies.add(new Canopy(new DenseVector(2).assign(1), 0, manhattan));
   canopies.add(new Canopy(new DenseVector(2), 1, manhattan));
   canopies.add(new Canopy(new DenseVector(2).assign(-1), 2, manhattan));

   ClusterClassifier canopyClassifier = new ClusterClassifier(canopies);

   String atOrigin =
       AbstractCluster.formatVector(canopyClassifier.classify(new DenseVector(2)), null);
   assertEquals("[0,0]", "[0.107, 0.787, 0.107]", atOrigin);

   String atTwos =
       AbstractCluster.formatVector(
           canopyClassifier.classify(new DenseVector(2).assign(2)), null);
   assertEquals("[2,2]", "[0.867, 0.117, 0.016]", atTwos);
 }
Пример #5
0
  /**
   * Dumps every cluster stored in the {@code part-*} sequence files under {@code seqFileDir}.
   *
   * <p>Output goes to {@code outputFile} when that field is set, otherwise to {@code System.out}.
   * For each cluster the method writes its formatted description (JSON when {@code useJSON} is
   * set; truncated to {@code subString} characters and prefixed with ':' when it is longer than a
   * positive {@code subString}), then, if a dictionary is available, a "Top Terms" line, and
   * finally the weighted points registered for the cluster id in {@code clusterIdToPoints}.
   *
   * @param dictionary dictionary handed to the formatters; may be {@code null}. It is replaced by
   *     the dictionary loaded from {@code termDictionary} whenever that field is non-null.
   * @throws IOException if reading the dictionary or sequence files, or writing the output, fails
   * @throws InstantiationException if a key or value instance cannot be created for a sequence
   *     file
   * @throws IllegalAccessException if the key or value class cannot be accessed for instantiation
   * @throws IllegalArgumentException if {@code termDictionary} is set and
   *     {@code dictionaryFormat} is neither "text" nor "sequencefile"
   */
  public void printClusters(String[] dictionary)
      throws IOException, InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
      if ("text".equals(dictionaryFormat)) {
        dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
      } else if ("sequencefile".equals(dictionaryFormat)) {
        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
        dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
      } else {
        throw new IllegalArgumentException("Invalid dictionary format");
      }
    }

    // Only a writer opened on a file by this method may be closed. System.out is shared by the
    // whole process; closing it here would silently discard all console output written later.
    boolean ownsWriter = this.outputFile != null;
    Writer writer =
        ownsWriter ? new FileWriter(this.outputFile) : new OutputStreamWriter(System.out);
    try {
      FileSystem fs = seqFileDir.getFileSystem(conf);
      for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
          // One key/value instance per file, refilled by every reader.next() call.
          Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
          Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
          while (reader.next(key, value)) {
            Cluster cluster = (Cluster) value;
            String fmtStr = useJSON ? cluster.asJsonString() : cluster.asFormatString(dictionary);
            if (subString > 0 && fmtStr.length() > subString) {
              // Truncated descriptions are marked with a leading ':'. The guard above already
              // guarantees subString < fmtStr.length(), so no further clamping is needed.
              writer.write(':');
              writer.write(fmtStr, 0, subString);
            } else {
              writer.write(fmtStr);
            }

            writer.write('\n');

            if (dictionary != null) {
              String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures);
              writer.write("\tTop Terms: ");
              writer.write(topTerms);
              writer.write('\n');
            }

            List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
            if (points != null) {
              writer.write("\tWeight:  Point:\n\t");
              // Explicit iterator so the "\n\t" separator is written between points only,
              // never after the last one.
              for (Iterator<WeightedVectorWritable> iterator = points.iterator();
                  iterator.hasNext(); ) {
                WeightedVectorWritable point = iterator.next();
                writer.write(String.valueOf(point.getWeight()));
                writer.write(": ");
                writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
                if (iterator.hasNext()) {
                  writer.write("\n\t");
                }
              }
              writer.write('\n');
            }
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (ownsWriter) {
        writer.close();
      } else {
        // Push buffered characters through to System.out but leave the stream itself open.
        writer.flush();
      }
    }
  }