Code example #1
0
 public Weight(String feature, Vector weights, int n) {
   this.feature = feature;
   // Retain only the top-n categories. A PriorityQueue keeps its smallest
   // element at the head, so evicting the head whenever the size exceeds n
   // leaves exactly the n largest entries behind.
   Queue<Category> topN = new PriorityQueue<Category>(n + 1, Ordering.natural());
   for (Vector.Element element : weights.all()) {
     topN.add(new Category(element.index(), element.get()));
     if (topN.size() > n) {
       topN.poll(); // evict the current minimum; only one element was added
     }
   }
   // Materialize in descending order; the first entry is the dominant category.
   categories = Lists.newArrayList(topN);
   Collections.sort(categories, Ordering.natural().reverse());
   Category dominant = categories.get(0);
   value = dominant.weight;
   maxIndex = dominant.index;
 }
Code example #2
0
  /**
   * End-to-end check of the affinity-matrix input pipeline: raw text points are
   * mapped to {@code MatrixEntryWritable}s and then reduced into exactly one
   * row vector per row index.
   *
   * <p>NOTE(review): relies on fixture members {@code RAW}, {@code RAW_DIMENSIONS}
   * and {@code getConfiguration()} declared elsewhere in this class.
   */
  @Test
  public void testAffinitymatrixInputReducer() throws Exception {
    AffinityMatrixInputMapper mapper = new AffinityMatrixInputMapper();
    Configuration conf = getConfiguration();
    conf.setInt(Keys.AFFINITY_DIMENSIONS, RAW_DIMENSIONS);

    // set up the dummy writer and the M/R context
    DummyRecordWriter<IntWritable, MatrixEntryWritable> mapWriter = new DummyRecordWriter<>();
    Mapper<LongWritable, Text, IntWritable, MatrixEntryWritable>.Context mapContext =
        DummyRecordWriter.build(mapper, conf, mapWriter);

    // loop through all the points and test each one is converted
    // successfully to a DistributedRowMatrix.MatrixEntry
    for (String s : RAW) {
      mapper.map(new LongWritable(), new Text(s), mapContext);
    }
    // keep the mapper output around to cross-check the reducer's results later
    Map<IntWritable, List<MatrixEntryWritable>> map = mapWriter.getData();

    // now reduce the data
    AffinityMatrixInputReducer reducer = new AffinityMatrixInputReducer();
    DummyRecordWriter<IntWritable, VectorWritable> redWriter = new DummyRecordWriter<>();
    Reducer<IntWritable, MatrixEntryWritable, IntWritable, VectorWritable>.Context redContext =
        DummyRecordWriter.build(
            reducer, conf, redWriter, IntWritable.class, MatrixEntryWritable.class);
    for (IntWritable key : mapWriter.getKeys()) {
      reducer.reduce(key, mapWriter.getValue(key), redContext);
    }

    // one reduce output row per dimension of the affinity matrix
    assertEquals("Number of reduce results", RAW_DIMENSIONS, redWriter.getData().size());
    for (IntWritable row : redWriter.getKeys()) {
      List<VectorWritable> list = redWriter.getValue(row);
      assertEquals("Should only be one vector", 1, list.size());
      // check that the elements in the vector match the mapper's output for this row
      Vector v = list.get(0).get();
      for (Vector.Element e : v.all()) {
        // rebuild the entry and look it up in the original mapper output.
        // NOTE(review): row is set to -1 here, which suggests that either
        // MatrixEntryWritable.equals() ignores the row or the mapper emits
        // entries with row -1 — confirm against MatrixEntryWritable's impl
        MatrixEntryWritable toCompare = new MatrixEntryWritable();
        toCompare.setRow(-1);
        toCompare.setCol(e.index());
        toCompare.setVal(e.get());
        assertTrue("This entry was correctly placed in its row", map.get(row).contains(toCompare));
      }
    }
  }
Code example #3
0
File: MathHelper.java  Project: ChineseDr/mahout
  /**
   * Renders a vector as a human-readable, tab-separated string of two-decimal
   * values enclosed in brackets. NaN entries are printed as {@code "  -  "} and
   * non-negative values get a leading space so columns line up with negatives.
   */
  public static String nice(Vector v) {
    if (!v.isSequentialAccess()) {
      v = new DenseVector(v);
    }

    DecimalFormat df = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.ENGLISH));

    StringBuilder result = new StringBuilder("[");
    boolean first = true;
    for (Vector.Element e : v.all()) {
      if (!first) {
        result.append('\t');
      }
      first = false;
      double value = e.get();
      if (Double.isNaN(value)) {
        result.append("  -  ");
      } else {
        if (value >= 0) {
          result.append(' ');
        }
        result.append(df.format(value));
      }
    }
    result.append(" ]");
    return result.toString();
  }
Code example #4
0
File: Classifier.java  Project: CS560/Project
  /**
   * Classifies customer complaint descriptions with a trained naive Bayes model.
   *
   * <p>For every tab-separated input line, the complaint description (column 20) is
   * tokenized, turned into a TF-IDF vector, and scored against every label; the
   * per-label scores and the winning label are printed to stdout.
   *
   * @param args model path, label index path, dictionary path, document frequency
   *     path, and the TSV file of customer descriptions, in that order
   * @throws Exception on any I/O or model-loading failure
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 5) {
      System.out.println(
          "Arguments: [model] [label index] [dictionary] [document frequency] [Customer description]");
      return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String carsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map classId => label name
    Map<Integer, String> labels =
        BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency =
        readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from the complaint description
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    // by convention the total training-document count is stored under key -1
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    // try-with-resources guarantees the reader is closed even if classification fails
    try (BufferedReader reader = new BufferedReader(new FileReader(carsPath))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] tokens = line.split("\t", 47);
        if (tokens.length < 20) {
          // malformed row: the original code threw ArrayIndexOutOfBoundsException here
          continue;
        }
        String cmplid = tokens[0];
        String cdescr = tokens[19];

        System.out.println("Complaint id: " + cmplid + "\t" + cdescr);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the complaint description; the TokenStream contract
        // requires reset() before, and end() + close() after, consuming tokens
        int wordCount = 0;
        try (TokenStream ts = analyzer.tokenStream("text", new StringReader(cdescr))) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
              // reuse the registered attribute instead of re-fetching it each token
              String word = termAtt.toString();
              // if the word is not in the dictionary, skip it
              if (dictionary.containsKey(word)) {
                words.add(word);
                wordCount++;
              }
            }
          }
          ts.end();
        }

        // create vector wordId => weight using tfidf
        // NOTE(review): cardinality 1000 is hard-coded; if the dictionary assigns ids
        // >= 1000, setQuick below writes out of range — confirm against the model's
        // feature count before changing
        Vector vector = new RandomAccessSparseVector(1000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
          String word = entry.getElement();
          int count = entry.getCount();
          Integer wordId = dictionary.get(word);
          // presumably every dictionary word has a document frequency — TODO confirm
          Long freq = documentFrequency.get(wordId);
          double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
          vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the most likely one.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
          int categoryId = element.index();
          double score = element.get();
          if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
          }
          System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
      }
    } finally {
      // the analyzer is created outside the try-with-resources, so close it here
      analyzer.close();
    }
  }
Code example #5
0
  /**
   * Fairly straightforward: the task here is to reassemble the rows of the affinity matrix. The
   * tricky part is that any specific element in the list of elements which does NOT lay on the
   * diagonal will be so because it did not drop below the sensitivity threshold, hence it was not
   * "cut".
   *
   * <p>On the flip side, there will be many entries whose coordinate is now set to the diagonal,
   * indicating they were previously affinity entries whose sensitivities were below the threshold,
   * and hence were "cut" - set to 0 at their original coordinates, and had their values added to
   * the diagonal entry (hence the numerous entries with the coordinate of the diagonal).
   *
   * <p>NOTE(review): relies on fixture members {@code affinity}, {@code sensitivity},
   * {@code buildMapData}, {@code sumOfRowCuts} and {@code EPSILON} declared elsewhere
   * in this class.
   *
   * @throws Exception if the combiner or reducer stage fails
   */
  @Test
  public void testEigencutsAffinityCutsReducer() throws Exception {
    Configuration conf = new Configuration();
    Path affinity = new Path("affinity");
    Path sensitivity = new Path("sensitivity");
    conf.set(EigencutsKeys.AFFINITY_PATH, affinity.getName());
    conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, this.affinity.length);

    // since we need the working paths to distinguish the vertex types,
    // we can't use the mapper (since we have no way of manually setting
    // the Context.workingPath() )
    Map<Text, List<VertexWritable>> data = buildMapData(affinity, sensitivity, this.sensitivity);

    // now, set up the combiner
    EigencutsAffinityCutsCombiner combiner = new EigencutsAffinityCutsCombiner();
    DummyRecordWriter<Text, VertexWritable> comWriter =
        new DummyRecordWriter<Text, VertexWritable>();
    Reducer<Text, VertexWritable, Text, VertexWritable>.Context comContext =
        DummyRecordWriter.build(combiner, conf, comWriter, Text.class, VertexWritable.class);

    // perform the combining
    for (Map.Entry<Text, List<VertexWritable>> entry : data.entrySet()) {
      combiner.reduce(entry.getKey(), entry.getValue(), comContext);
    }

    // finally, set up the reduction writers; the reducer consumes the combiner's output
    EigencutsAffinityCutsReducer reducer = new EigencutsAffinityCutsReducer();
    DummyRecordWriter<IntWritable, VectorWritable> redWriter =
        new DummyRecordWriter<IntWritable, VectorWritable>();
    Reducer<Text, VertexWritable, IntWritable, VectorWritable>.Context redContext =
        DummyRecordWriter.build(reducer, conf, redWriter, Text.class, VertexWritable.class);

    // perform the reduction
    for (Text key : comWriter.getKeys()) {
      reducer.reduce(key, comWriter.getValue(key), redContext);
    }

    // now, check that the affinity matrix is correctly formed
    for (IntWritable row : redWriter.getKeys()) {
      List<VectorWritable> results = redWriter.getValue(row);
      // there should only be 1 vector per row index
      assertEquals("Only one vector with a given row number", 1, results.size());
      Vector therow = results.get(0).get();
      for (Vector.Element e : therow.all()) {
        // check the diagonal: it accumulates the values of every cut element in its row
        if (row.get() == e.index()) {
          assertEquals(
              "Correct diagonal sum of cuts",
              sumOfRowCuts(row.get(), this.sensitivity),
              e.get(),
              EPSILON);
        } else {
          // not on the diagonal...if it was an element labeled to be cut,
          // it should have a value of 0. Otherwise, it should have kept its
          // previous value
          if (this.sensitivity[row.get()][e.index()] == 0.0) {
            // should be what it was originally
            assertEquals(
                "Preserved element", this.affinity[row.get()][e.index()], e.get(), EPSILON);
          } else {
            // should be 0
            assertEquals("Cut element", 0.0, e.get(), EPSILON);
          }
        }
      }
    }
  }