public Weight(String feature, Vector weights, int n) {
  this.feature = feature;
  // pick out the weight with the largest abs value, but don't forget the sign
  Queue<Category> biggest = new PriorityQueue<Category>(n + 1, Ordering.natural());
  for (Vector.Element element : weights.all()) {
    biggest.add(new Category(element.index(), element.get()));
    while (biggest.size() > n) {
      biggest.poll();
    }
  }
  categories = Lists.newArrayList(biggest);
  Collections.sort(categories, Ordering.natural().reverse());
  value = categories.get(0).weight;
  maxIndex = categories.get(0).index;
}
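// The constructor above relies on a Category type whose natural order ranks entries by the
// absolute value of their weight, so the bounded priority queue evicts the smallest-magnitude
// entries while the sign is preserved in the stored value. A minimal sketch of such a
// comparable, assuming only what the constructor accesses (the index and weight fields) and
// the abs-value ordering implied by the comment; not necessarily the actual implementation:
public static class Category implements Comparable<Category> {
  final int index;
  final double weight;

  Category(int index, double weight) {
    this.index = index;
    this.weight = weight;
  }

  @Override
  public int compareTo(Category other) {
    // order by |weight| first, so the head of the priority queue is the least significant entry
    int r = Double.compare(Math.abs(weight), Math.abs(other.weight));
    return r != 0 ? r : Integer.compare(index, other.index);
  }
}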
@Test
public void testAffinityMatrixInputReducer() throws Exception {
  AffinityMatrixInputMapper mapper = new AffinityMatrixInputMapper();
  Configuration conf = getConfiguration();
  conf.setInt(Keys.AFFINITY_DIMENSIONS, RAW_DIMENSIONS);

  // set up the dummy writer and the M/R context
  DummyRecordWriter<IntWritable, MatrixEntryWritable> mapWriter = new DummyRecordWriter<>();
  Mapper<LongWritable, Text, IntWritable, MatrixEntryWritable>.Context mapContext =
      DummyRecordWriter.build(mapper, conf, mapWriter);

  // loop through all the points and check that each one is converted
  // successfully to a DistributedRowMatrix.MatrixEntry
  for (String s : RAW) {
    mapper.map(new LongWritable(), new Text(s), mapContext);
  }
  // store the data for checking later
  Map<IntWritable, List<MatrixEntryWritable>> map = mapWriter.getData();

  // now reduce the data
  AffinityMatrixInputReducer reducer = new AffinityMatrixInputReducer();
  DummyRecordWriter<IntWritable, VectorWritable> redWriter = new DummyRecordWriter<>();
  Reducer<IntWritable, MatrixEntryWritable, IntWritable, VectorWritable>.Context redContext =
      DummyRecordWriter.build(reducer, conf, redWriter, IntWritable.class, MatrixEntryWritable.class);
  for (IntWritable key : mapWriter.getKeys()) {
    reducer.reduce(key, mapWriter.getValue(key), redContext);
  }

  // check that the right number of rows was produced
  assertEquals("Number of reduce results", RAW_DIMENSIONS, redWriter.getData().size());
  for (IntWritable row : redWriter.getKeys()) {
    List<VectorWritable> list = redWriter.getValue(row);
    assertEquals("Should only be one vector", 1, list.size());

    // check that every element of the vector was correctly placed in its row
    Vector v = list.get(0).get();
    for (Vector.Element e : v.all()) {
      // find this value in the original map
      MatrixEntryWritable toCompare = new MatrixEntryWritable();
      toCompare.setRow(-1);
      toCompare.setCol(e.index());
      toCompare.setVal(e.get());
      assertTrue("This entry was correctly placed in its row", map.get(row).contains(toCompare));
    }
  }
}
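// The RAW fixture and RAW_DIMENSIONS constant used above are defined elsewhere in the test
// class. A hypothetical shape for them, assuming the mapper parses comma-separated
// "row,col,value" triples of a small symmetric affinity matrix (the real fixture and its
// delimiter may differ):
private static final int RAW_DIMENSIONS = 3;
private static final String[] RAW = {
    "0,0,0",   "0,1,0.5", "0,2,0.2",
    "1,0,0.5", "1,1,0",   "1,2,0.9",
    "2,0,0.2", "2,1,0.9", "2,2,0"
};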
public static String nice(Vector v) {
  if (!v.isSequentialAccess()) {
    v = new DenseVector(v);
  }
  DecimalFormat df = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.ENGLISH));

  StringBuilder buffer = new StringBuilder("[");
  String separator = "";
  for (Vector.Element e : v.all()) {
    buffer.append(separator);
    if (Double.isNaN(e.get())) {
      buffer.append(" - ");
    } else {
      if (e.get() >= 0) {
        buffer.append(' ');
      }
      buffer.append(df.format(e.get()));
    }
    separator = "\t";
  }
  buffer.append(" ]");
  return buffer.toString();
}
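// Hypothetical usage of nice(): format a small vector for logging. DenseVector comes from
// Mahout; the surrounding class and call site are not shown above.
Vector v = new DenseVector(new double[] {1.0, -0.25, Double.NaN});
System.out.println(nice(v));
// prints the tab-separated string "[ 1.00\t-0.25\t - ]"
// (positive values get a leading space, NaN is rendered as " - ")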
public static void main(String[] args) throws Exception {
  if (args.length < 5) {
    System.out.println(
        "Arguments: [model] [label index] [dictionary] [document frequency] [customer complaints file]");
    return;
  }
  String modelPath = args[0];
  String labelIndexPath = args[1];
  String dictionaryPath = args[2];
  String documentFrequencyPath = args[3];
  String carsPath = args[4];

  Configuration configuration = new Configuration();

  // model is a matrix (wordId, labelId) => probability score
  NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
  StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

  // labels is a map labelId => label name
  Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
  Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
  Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

  // analyzer used to extract words from the complaint description
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

  int labelCount = labels.size();
  // the entry with key -1 holds the total number of documents in the training set
  int documentCount = documentFrequency.get(-1).intValue();

  System.out.println("Number of labels: " + labelCount);
  System.out.println("Number of documents in training set: " + documentCount);

  BufferedReader reader = new BufferedReader(new FileReader(carsPath));
  while (true) {
    String line = reader.readLine();
    if (line == null) {
      break;
    }

    String[] tokens = line.split("\t", 47);
    String cmplid = tokens[0];
    String cdescr = tokens[19];

    System.out.println("Complaint id: " + cmplid + "\t" + cdescr);

    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from the complaint description
    TokenStream ts = analyzer.tokenStream("text", new StringReader(cdescr));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
      if (termAtt.length() > 0) {
        String word = ts.getAttribute(CharTermAttribute.class).toString();
        Integer wordId = dictionary.get(word);
        // if the word is not in the dictionary, skip it
        if (wordId != null) {
          words.add(word);
          wordCount++;
        }
      }
    }
    // release the token stream so the analyzer can be reused for the next line
    ts.end();
    ts.close();

    // create vector wordId => weight using tfidf
    // (the cardinality should be at least as large as the dictionary)
    Vector vector = new RandomAccessSparseVector(1000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
      String word = entry.getElement();
      int count = entry.getCount();
      Integer wordId = dictionary.get(word);
      Long freq = documentFrequency.get(wordId);
      double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
      vector.setQuick(wordId, tfIdfValue);
    }

    // With the classifier, we get one score for each label.
    // The label with the highest score is the one the complaint is most likely
    // to be associated with.
    Vector resultVector = classifier.classifyFull(vector);
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    for (Element element : resultVector.all()) {
      int categoryId = element.index();
      double score = element.get();
      if (score > bestScore) {
        bestScore = score;
        bestCategoryId = categoryId;
      }
      System.out.print(" " + labels.get(categoryId) + ": " + score);
    }
    System.out.println(" => " + labels.get(bestCategoryId));
  }
  analyzer.close();
  reader.close();
}
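// The readDictionnary and readDocumentFrequency helpers called in main() are not shown above.
// A plausible sketch, assuming they read the dictionary.file-* (Text/IntWritable) and
// df-count (IntWritable/LongWritable) sequence files produced by Mahout's seq2sparse; the
// key/value types and use of SequenceFileIterable are assumptions, and the method names
// simply follow the calls above (needs org.apache.mahout.common.Pair,
// org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable,
// org.apache.hadoop.io.*, and java.util.HashMap):
public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionaryPath) {
  Map<String, Integer> dictionary = new HashMap<String, Integer>();
  for (Pair<Text, IntWritable> pair :
      new SequenceFileIterable<Text, IntWritable>(dictionaryPath, true, conf)) {
    // word => wordId
    dictionary.put(pair.getFirst().toString(), pair.getSecond().get());
  }
  return dictionary;
}

public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
  Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
  for (Pair<IntWritable, LongWritable> pair :
      new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) {
    // wordId => number of documents containing the word (key -1 holds the total document count)
    documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
  }
  return documentFrequency;
}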
/**
 * Fairly straightforward: the task here is to reassemble the rows of the affinity matrix. The
 * tricky part is that any element in the list which does NOT lie on the diagonal is there
 * because it did not drop below the sensitivity threshold, hence it was not "cut".
 *
 * <p>On the flip side, there will be many entries whose coordinate is now set to the diagonal,
 * indicating they were previously affinity entries whose sensitivities were below the threshold,
 * and hence were "cut" - set to 0 at their original coordinates, and had their values added to
 * the diagonal entry (hence the numerous entries with the coordinate of the diagonal).
 *
 * @throws Exception
 */
@Test
public void testEigencutsAffinityCutsReducer() throws Exception {
  Configuration conf = new Configuration();
  Path affinity = new Path("affinity");
  Path sensitivity = new Path("sensitivity");
  conf.set(EigencutsKeys.AFFINITY_PATH, affinity.getName());
  conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, this.affinity.length);

  // since we need the working paths to distinguish the vertex types,
  // we can't use the mapper (since we have no way of manually setting
  // the Context.workingPath())
  Map<Text, List<VertexWritable>> data = buildMapData(affinity, sensitivity, this.sensitivity);

  // now, set up the combiner
  EigencutsAffinityCutsCombiner combiner = new EigencutsAffinityCutsCombiner();
  DummyRecordWriter<Text, VertexWritable> comWriter = new DummyRecordWriter<Text, VertexWritable>();
  Reducer<Text, VertexWritable, Text, VertexWritable>.Context comContext =
      DummyRecordWriter.build(combiner, conf, comWriter, Text.class, VertexWritable.class);

  // perform the combining
  for (Map.Entry<Text, List<VertexWritable>> entry : data.entrySet()) {
    combiner.reduce(entry.getKey(), entry.getValue(), comContext);
  }

  // finally, set up the reducer and its writer
  EigencutsAffinityCutsReducer reducer = new EigencutsAffinityCutsReducer();
  DummyRecordWriter<IntWritable, VectorWritable> redWriter = new DummyRecordWriter<IntWritable, VectorWritable>();
  Reducer<Text, VertexWritable, IntWritable, VectorWritable>.Context redContext =
      DummyRecordWriter.build(reducer, conf, redWriter, Text.class, VertexWritable.class);

  // perform the reduction
  for (Text key : comWriter.getKeys()) {
    reducer.reduce(key, comWriter.getValue(key), redContext);
  }

  // now, check that the affinity matrix is correctly formed
  for (IntWritable row : redWriter.getKeys()) {
    List<VectorWritable> results = redWriter.getValue(row);
    // there should only be 1 vector
    assertEquals("Only one vector with a given row number", 1, results.size());
    Vector therow = results.get(0).get();
    for (Vector.Element e : therow.all()) {
      // check the diagonal
      if (row.get() == e.index()) {
        assertEquals("Correct diagonal sum of cuts",
            sumOfRowCuts(row.get(), this.sensitivity), e.get(), EPSILON);
      } else {
        // not on the diagonal... if it was an element labeled to be cut,
        // it should have a value of 0; otherwise, it should have kept its
        // previous value
        if (this.sensitivity[row.get()][e.index()] == 0.0) {
          // should be what it was originally
          assertEquals("Preserved element",
              this.affinity[row.get()][e.index()], e.get(), EPSILON);
        } else {
          // should be 0
          assertEquals("Cut element", 0.0, e.get(), EPSILON);
        }
      }
    }
  }
}