/**
 * Returns the cached diagonal matrix, allocating it on first use and attempting to populate it
 * from the "norms" and "projections" vectors stored under {@code baseDir} while the (0, 1) entry
 * is not positive.
 *
 * <p>Projections are written onto the main diagonal and norms onto the two adjacent
 * off-diagonals. An {@link IOException} while fetching is logged and the matrix is returned in
 * whatever state it is in.
 */
@Override
public Matrix getDiagonalMatrix() {
  if (diagonalMatrix == null) {
    diagonalMatrix = new DenseMatrix(desiredRank, desiredRank);
  }
  // Anything other than "entry (0, 1) <= 0" means the matrix is treated as already loaded.
  if (!(diagonalMatrix.get(0, 1) <= 0)) {
    return diagonalMatrix;
  }
  try {
    Vector norms = fetchVector(new Path(baseDir, "norms"), 0);
    Vector projections = fetchVector(new Path(baseDir, "projections"), 0);
    if (norms == null || projections == null) {
      return diagonalMatrix;
    }
    int row;
    for (row = 0; row < projections.size() - 1; row++) {
      double norm = norms.get(row);
      diagonalMatrix.set(row, row, projections.get(row));
      diagonalMatrix.set(row, row + 1, norm);
      diagonalMatrix.set(row + 1, row, norm);
    }
    // The last row has a diagonal entry only.
    diagonalMatrix.set(row, row, projections.get(row));
  } catch (IOException e) {
    log.error("Could not load diagonal matrix of norms and projections: ", e);
  }
  return diagonalMatrix;
}
@Override public double pdf(VectorWritable v) { Vector x = v.get(); // return the product of the component pdfs // TODO: is this reasonable? correct? double pdf = pdf(x, stdDev.get(0)); for (int i = 1; i < x.size(); i++) { pdf *= pdf(x, stdDev.get(i)); } return pdf; }
/**
 * Trains the model by repeatedly sweeping over every column of the dataset and calling
 * {@code update} whenever the model's classification disagrees with the sign of the label. The
 * sweeps stop once a full pass produces no update.
 *
 * <p>Can be called multiple times.
 *
 * @param labelset the set of labels, one for each data point. A label greater than zero is
 *     treated as the positive class, a label less than or equal to zero as the negative class.
 * @param dataset the dataset to train on. Each column is treated as a point.
 * @throws CardinalityException if the number of labels differs from the number of columns
 * @throws TrainingException if a sweep is about to start after the iteration counter has
 *     exceeded 1000 without convergence
 */
public void train(Vector labelset, Matrix dataset) throws TrainingException {
  if (labelset.size() != dataset.columnSize()) {
    throw new CardinalityException(labelset.size(), dataset.columnSize());
  }

  boolean converged = false;
  for (int iteration = 0; !converged; iteration++) {
    if (iteration > 1000) {
      throw new TrainingException("Too many iterations needed to find hyperplane.");
    }

    // Assume this sweep is clean until a misclassification proves otherwise.
    converged = true;
    int columnCount = dataset.columnSize();
    for (int column = 0; column < columnCount; column++) {
      Vector dataPoint = dataset.viewColumn(column);
      log.debug("Training point: {}", dataPoint);

      synchronized (this.model) {
        boolean prediction = model.classify(dataPoint);
        double label = labelset.get(column);
        boolean falsePositive = label <= 0 && prediction;
        boolean falseNegative = label > 0 && !prediction;
        if (falsePositive || falseNegative) {
          log.debug("updating");
          converged = false;
          update(label, dataPoint, this.model);
        }
      }
    }
  }
}
/**
 * Scores every record of the input file with the best learner of a previously trained adaptive
 * logistic regression model and writes the results as CSV lines of the form
 * {@code id,target,score} to the output file.
 *
 * <p>The first input line is handed to the record factory as the header; every following line
 * is classified. When {@code maxScoreOnly} is set only the highest scoring target is written
 * for each record, otherwise one line per target is written. Progress is reported on
 * {@code output} every 100 records.
 *
 * <p>Both the input reader and the output writer are closed when this method finishes, whether
 * it completes normally or with an exception.
 *
 * @param args command line arguments, interpreted by {@code parseArgs}; if parsing fails the
 *     method returns without doing anything
 * @param output destination for progress and status messages
 * @throws Exception if the model cannot be loaded or reading/writing fails
 */
static void mainToOutput(String[] args, PrintWriter output) throws Exception {
  if (!parseArgs(args)) {
    return;
  }

  AdaptiveLogisticModelParameters lmp =
      AdaptiveLogisticModelParameters.loadFromFile(new File(modelFile));
  CsvRecordFactory csv = lmp.getCsvRecordFactory();
  csv.setIdName(idColumn);

  AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
  State<Wrapper, CrossFoldLearner> best = lr.getBest();
  if (best == null) {
    output.println("AdaptiveLogisticRegression has not be trained probably.");
    return;
  }
  CrossFoldLearner learner = best.getPayload().getLearner();

  int k = 0;
  // try-with-resources guarantees that the reader and the writer are released even when
  // classification or I/O fails part-way through the file.
  try (BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
      BufferedWriter out =
          new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(outputFile), Charsets.UTF_8))) {
    out.write(idColumn + ",target,score");
    out.newLine();

    // The first line is the header describing the columns.
    String line = in.readLine();
    csv.firstLine(line);
    line = in.readLine();

    Map<String, Double> results = new HashMap<String, Double>();
    while (line != null) {
      Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
      csv.processLine(line, v, false);
      Vector scores = learner.classifyFull(v);

      results.clear();
      if (maxScoreOnly) {
        results.put(csv.getTargetLabel(scores.maxValueIndex()), scores.maxValue());
      } else {
        for (int i = 0; i < scores.size(); i++) {
          results.put(csv.getTargetLabel(i), scores.get(i));
        }
      }

      for (Map.Entry<String, Double> entry : results.entrySet()) {
        out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
        out.newLine();
      }

      k++;
      if (k % 100 == 0) {
        output.println(k + " records processed");
      }
      line = in.readLine();
    }
    out.flush();
  }
  output.println(k + " records processed totally.");
}
/**
 * Classifies each vector with {@code learningAlgorithm}, optionally training on it, and
 * optionally appends one record per vector to the given sequence file writer.
 *
 * <p>In training mode the list is shuffled in place first. For each vector the log-likelihood
 * is evaluated before classification, and the running averages of log-likelihood and
 * correctness are updated. The learner is then trained on the vector and closed. In both modes
 * the per-label counters and the processed-record counter are advanced, and a progress line is
 * printed to standard output at intervals derived from {@code BUMPS}, {@code step} and
 * {@code MAX_STEP}.
 *
 * @param vectors pairs of (actual label, named vector); the label is only read when training
 * @param train whether to train on the vectors in addition to classifying them
 * @param writer destination for "name, estimated label, score" records, or {@code null} to
 *     skip writing
 * @throws IOException if appending to the writer fails
 */
public void processVectors(
    List<Pair<Integer, NamedVector>> vectors, boolean train, SequenceFile.Writer writer)
    throws IOException {
  BytesWritable recordKey = new BytesWritable("".getBytes());
  Text record = new Text();

  if (train) {
    Collections.shuffle(vectors);
  }

  double window = -1.0;
  double logLikelihood = -1.0;
  int actual = -1;

  for (Pair<Integer, NamedVector> entry : vectors) {
    NamedVector vector = entry.getValue();

    if (train) {
      actual = entry.getKey();
      // Averaging window grows with the number of records seen, capped at 200.
      window = Math.min(this.k + 1, 200);
      logLikelihood = learningAlgorithm.logLikelihood(actual, vector);
      this.averageLL = this.averageLL + (logLikelihood - this.averageLL) / window;
    }

    Vector scores = new DenseVector(LABELS);
    learningAlgorithm.classifyFull(scores, vector);
    int estimated = scores.maxValueIndex();
    this.counts[estimated]++;

    if (writer != null) {
      String formatted =
          String.format(
              "%s%c%d%c01%f",
              vector.getName(),
              SEQUENCE_FILE_FIELD_SEPARATOR,
              estimated,
              SEQUENCE_FILE_FIELD_SEPARATOR,
              scores.get(estimated));
      record.set(formatted);
      writer.append(recordKey, record);
    }

    if (train) {
      int correct = (estimated == actual ? 1 : 0);
      this.averageCorrect = this.averageCorrect + (correct - this.averageCorrect) / window;
      learningAlgorithm.train(actual, vector);
      learningAlgorithm.close();
    }

    this.k++;

    // Decide whether this record falls on a progress-reporting boundary.
    int bump = this.BUMPS[(int) Math.floor(this.step) % this.BUMPS.length];
    int scale = (int) Math.pow(10, Math.floor(this.step / this.BUMPS.length));
    if (this.k % Math.min(MAX_STEP, bump * scale) == 0) {
      this.step += 0.25;
      if (train) {
        System.out.printf(
            "%10d %10.3f %10.3f %10.2f %d\n",
            this.k, logLikelihood, this.averageLL, this.averageCorrect * 100, estimated);
      } else {
        System.out.printf("%c%10d, per label: %s\n", CR, this.k, Arrays.toString(this.counts));
      }
    }
  }
}
/**
 * Renders every element of the wrapped vector as a fixed-width decimal (width 8, four fraction
 * digits) and concatenates the results without separators.
 *
 * @param vw the writable whose vector is formatted
 * @return the concatenated, formatted elements; empty if the vector has no elements
 */
private static String getFormatedOutput(VectorWritable vw) {
  int formatWidth = 8;
  // Build the per-element format once instead of on every iteration.
  String elementFormat = "%" + Integer.toString(formatWidth) + ".4f";
  Vector vector = vw.get();
  // StringBuilder avoids re-copying the accumulated text for each appended element.
  StringBuilder formatted = new StringBuilder();
  for (int i = 0; i < vector.size(); ++i) {
    formatted.append(String.format(elementFormat, vector.get(i)));
  }
  return formatted.toString();
}
/**
 * Asks the similarity measure whether the pair of occurrences should be considered, passing it
 * the non-zero entry counts and maximum values recorded for both indices together with the
 * threshold.
 */
private boolean consider(Vector.Element occurrenceA, Vector.Element occurrenceB) {
  int indexA = occurrenceA.index();
  int indexB = occurrenceB.index();

  int nonZerosA = numNonZeroEntries.get(indexA);
  int nonZerosB = numNonZeroEntries.get(indexB);
  double maxA = maxValues.get(indexA);
  double maxB = maxValues.get(indexB);

  return similarity.consider(nonZerosA, nonZerosB, maxA, maxB, threshold);
}
/**
 * Checks whether the {@link Vector} is equivalent to the set of {@link Vector.Element}s: the
 * number of given elements must equal the count reported by
 * {@code numberOfNoNZeroNonNaNElements}, and each element's value must match the vector's value
 * at the same index within {@code MahoutTestCase.EPSILON}.
 */
public static boolean consistsOf(Vector vector, Vector.Element... elements) {
  boolean sameCount = elements.length == numberOfNoNZeroNonNaNElements(vector);
  if (!sameCount) {
    return false;
  }
  for (int n = 0; n < elements.length; n++) {
    Vector.Element expected = elements[n];
    double difference = Math.abs(expected.get() - vector.get(expected.index()));
    if (difference > MahoutTestCase.EPSILON) {
      return false;
    }
  }
  return true;
}
/** * Return a human-readable formatted string representation of the vector, not intended to be * complete nor usable as an input/output representation */ public static String formatVector(Vector v, String[] bindings) { StringBuilder buf = new StringBuilder(); if (v instanceof NamedVector) { buf.append(((NamedVector) v).getName()).append(" = "); } int nzero = 0; Iterator<Vector.Element> iterateNonZero = v.iterateNonZero(); while (iterateNonZero.hasNext()) { iterateNonZero.next(); nzero++; } // if vector is sparse or if we have bindings, use sparse notation if (nzero < v.size() || bindings != null) { buf.append('['); for (int i = 0; i < v.size(); i++) { double elem = v.get(i); if (elem == 0.0) { continue; } String label; if (bindings != null && (label = bindings[i]) != null) { buf.append(label).append(':'); } else { buf.append(i).append(':'); } buf.append(String.format(Locale.ENGLISH, "%.3f", elem)).append(", "); } } else { buf.append('['); for (int i = 0; i < v.size(); i++) { double elem = v.get(i); buf.append(String.format(Locale.ENGLISH, "%.3f", elem)).append(", "); } } if (buf.length() > 1) { buf.setLength(buf.length() - 2); } buf.append(']'); return buf.toString(); }
/**
 * Returns the cached scale factor. While the cached value is not positive, the first element of
 * the "scaleFactor" vector under {@code baseDir} is fetched and cached; an {@link IOException}
 * during the fetch is logged and the current value is returned unchanged.
 */
@Override
public double getScaleFactor() {
  // Anything other than "scaleFactor <= 0" counts as already loaded.
  if (!(scaleFactor <= 0)) {
    return scaleFactor;
  }
  try {
    Vector stored = fetchVector(new Path(baseDir, "scaleFactor"), 0);
    boolean hasValue = stored != null && stored.size() > 0;
    if (hasValue) {
      scaleFactor = stored.get(0);
    }
  } catch (IOException e) {
    log.error("could not load scaleFactor:", e);
  }
  return scaleFactor;
}
/**
 * Builds a document/term count matrix by sampling from the given matrix with an
 * {@code LDASampler}.
 *
 * <p>A single topic-count vector is drawn first by picking {@code numTopicsPerDoc} random row
 * indices of {@code matrix}; that same vector is then used for every document. For each
 * document, {@code numSamples} column indices are sampled and their counts incremented in the
 * document's row.
 *
 * @param matrix the matrix handed to the sampler; its rows size the topic vector and its
 *     columns size the corpus
 * @param random source of randomness for both topic selection and sampling
 * @param numDocs number of rows (documents) in the result
 * @param numSamples number of samples drawn per document
 * @param numTopicsPerDoc number of random topic picks accumulated into the topic vector
 * @return a {@code numDocs} x {@code matrix.numCols()} matrix of sample counts
 */
public static Matrix sampledCorpus(
    Matrix matrix, Random random, int numDocs, int numSamples, int numTopicsPerDoc) {
  Matrix counts = new SparseRowMatrix(numDocs, matrix.numCols());
  LDASampler sampler = new LDASampler(matrix, random);

  Vector topicCounts = new DenseVector(matrix.numRows());
  int picks = 0;
  while (picks < numTopicsPerDoc) {
    int chosenTopic = random.nextInt(topicCounts.size());
    topicCounts.set(chosenTopic, topicCounts.get(chosenTopic) + 1);
    picks++;
  }

  for (int doc = 0; doc < numDocs; doc++) {
    int[] samples = sampler.sample(topicCounts, numSamples);
    for (int term : samples) {
      double previous = counts.get(doc, term);
      counts.set(doc, term, previous + 1);
    }
  }
  return counts;
}
/**
 * Exercises {@code ContinuousValueEncoder.addToVector} with negative and positive values, with
 * two probes, with accumulation into an existing vector, and with a non-numeric input.
 */
@Test
public void testAddToVector() {
  FeatureVectorEncoder enc = new ContinuousValueEncoder("foo");

  // A negative value with the encoder's initial probe setting.
  Vector negative = new DenseVector(20);
  enc.addToVector("-123", negative);
  Assert.assertEquals(-123, negative.minValue(), 0);
  Assert.assertEquals(0, negative.maxValue(), 0);
  Assert.assertEquals(123, negative.norm(1), 0);

  // The same magnitude, positive.
  Vector singleProbe = new DenseVector(20);
  enc.addToVector("123", singleProbe);
  Assert.assertEquals(123, singleProbe.maxValue(), 0);
  Assert.assertEquals(0, singleProbe.minValue(), 0);
  Assert.assertEquals(123, singleProbe.norm(1), 0);

  // With two probes the value is expected in two locations.
  Vector doubleProbe = new DenseVector(20);
  enc.setProbes(2);
  enc.addToVector("123", doubleProbe);
  Assert.assertEquals(123, doubleProbe.maxValue(), 0);
  Assert.assertEquals(2 * 123, doubleProbe.norm(1), 0);

  // Subtracting the single-probe encoding is expected to leave exactly one location.
  Vector probeDifference = doubleProbe.minus(singleProbe);
  Assert.assertEquals(123, probeDifference.maxValue(), 0);
  Assert.assertEquals(123, probeDifference.norm(1), 0);

  Vector hundred = new DenseVector(20);
  enc.setProbes(2);
  enc.addToVector("100", hundred);
  Vector remainder = doubleProbe.minus(hundred);
  Assert.assertEquals(23, remainder.maxValue(), 0);
  Assert.assertEquals(2 * 23, remainder.norm(1), 0);

  // Adding to a non-empty vector accumulates into the existing entries.
  enc.addToVector("7", remainder);
  Assert.assertEquals(30, remainder.maxValue(), 0);
  Assert.assertEquals(2 * 30, remainder.norm(1), 0);
  Assert.assertEquals(30, remainder.get(10), 0);
  Assert.assertEquals(30, remainder.get(18), 0);

  try {
    enc.addToVector("foobar", remainder);
    Assert.fail("Should have noticed bad numeric format");
  } catch (NumberFormatException e) {
    Assert.assertEquals("For input string: \"foobar\"", e.getMessage());
  }
}
@Test public void testMatrixDiagonalizeReducer() throws Exception { MatrixDiagonalizeMapper mapper = new MatrixDiagonalizeMapper(); Configuration conf = getConfiguration(); conf.setInt(Keys.AFFINITY_DIMENSIONS, RAW_DIMENSIONS); // set up the dummy writers DummyRecordWriter<NullWritable, IntDoublePairWritable> mapWriter = new DummyRecordWriter<>(); Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable>.Context mapContext = DummyRecordWriter.build(mapper, conf, mapWriter); // perform the mapping for (int i = 0; i < RAW_DIMENSIONS; i++) { RandomAccessSparseVector toAdd = new RandomAccessSparseVector(RAW_DIMENSIONS); toAdd.assign(RAW[i]); mapper.map(new IntWritable(i), new VectorWritable(toAdd), mapContext); } // now perform the reduction MatrixDiagonalizeReducer reducer = new MatrixDiagonalizeReducer(); DummyRecordWriter<NullWritable, VectorWritable> redWriter = new DummyRecordWriter<>(); Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable>.Context redContext = DummyRecordWriter.build( reducer, conf, redWriter, NullWritable.class, IntDoublePairWritable.class); // only need one reduction reducer.reduce(NullWritable.get(), mapWriter.getValue(NullWritable.get()), redContext); // first, make sure there's only one result List<VectorWritable> list = redWriter.getValue(NullWritable.get()); assertEquals("Only a single resulting vector", 1, list.size()); Vector v = list.get(0).get(); for (int i = 0; i < v.size(); i++) { assertEquals("Element sum is correct", rowSum(RAW[i]), v.get(i), 0.01); } }
/**
 * Looks up the value stored in {@code perlabelThetaNormalizer} for the given label.
 *
 * @param label index of the label
 * @return the normalizer stored at that index
 */
public double thetaNormalizer(int label) {
  double normalizer = perlabelThetaNormalizer.get(label);
  return normalizer;
}