@Test public void testLuceneEncoding() throws Exception { LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text"); enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_43)); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); // should be the same as text test above, since we are splitting on whitespace // should set 6 distinct locations to 1 assertEquals(6.0, v1.norm(1), 0); assertEquals(1.0, v1.maxValue(), 0); v1 = new DenseVector(200); enc.addToVector("", v1); enc.flush(1, v1); assertEquals(0.0, v1.norm(1), 0); assertEquals(0.0, v1.maxValue(), 0); v1 = new DenseVector(200); StringBuilder builder = new StringBuilder(5000); for (int i = 0; i < 1000; i++) { // lucene's internal buffer length request is 4096, so let's make sure we can handle // larger size builder.append("token_").append(i).append(' '); } enc.addToVector(builder.toString(), v1); enc.flush(1, v1); // System.out.println(v1); assertEquals(2000.0, v1.norm(1), 0); assertEquals(19.0, v1.maxValue(), 0); }
/**
 * Asserts that {@code actual} is non-null, has as many columns as {@code expected}, and that
 * each of its columns either has a {@code norm(1)} of exactly zero or matches the
 * {@code norm(1)} of the corresponding expected column to within 1e-10.
 *
 * @param expected matrix supplying the reference column norms
 * @param actual matrix under test
 */
public void assertColumnNormsEqualOrZero(Matrix expected, Matrix actual) {
  assertNotNull(actual);
  final int columnCount = expected.columnSize();
  assertEquals(columnCount, actual.columnSize());

  for (int col = 0; col < columnCount; col++) {
    Vector wanted = expected.viewColumn(col);
    Vector got = actual.viewColumn(col);
    assertNotNull(got);

    double wantedNorm = wanted.norm(1);
    double gotNorm = got.norm(1);
    // A column whose norm is exactly zero is accepted without comparison.
    if (gotNorm != 0) {
      assertEquals(wantedNorm, gotNorm, 1e-10);
    }
  }
}
@Test public void testAddToVector() { TextValueEncoder enc = new TextValueEncoder("text"); Vector v1 = new DenseVector(200); enc.addToVector("test1 and more", v1); enc.flush(1, v1); // should set 6 distinct locations to 1 assertEquals(6.0, v1.norm(1), 0); assertEquals(1.0, v1.maxValue(), 0); // now some fancy weighting StaticWordValueEncoder w = new StaticWordValueEncoder("text"); w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5)); enc.setWordEncoder(w); // should set 6 locations to something Vector v2 = new DenseVector(200); enc.addToVector("test1 and more", v2); enc.flush(1, v2); // this should set the same 6 locations to the same values Vector v3 = new DenseVector(200); w.addToVector("test1", v3); w.addToVector("and", v3); w.addToVector("more", v3); assertEquals(0, v3.minus(v2).norm(1), 0); // moreover, the locations set in the unweighted case should be the same as in the weighted case assertEquals(v3.zSum(), v3.dot(v1), 0); }
/**
 * Asserts that the rows of {@code currentEigens} are orthonormal to within
 * {@code errorMargin}.
 *
 * <p>For every pair of rows (i, j) with j &lt;= i, the dot product must be within
 * {@code errorMargin} of 1 when i == j and within {@code errorMargin} of 0 otherwise. Any pair
 * in which either row has a {@code norm(2)} of exactly zero is skipped.
 *
 * @param currentEigens matrix whose rows are checked
 * @param errorMargin strict upper bound on the allowed deviation of each dot product
 */
public static void assertOrthonormal(Matrix currentEigens, double errorMargin) {
  int rowCount = currentEigens.numRows();
  // zeroRow[k] is filled in when k is the outer index. The inner index never exceeds the outer
  // one, so every entry read below has already been computed and each row's norm is evaluated
  // only once instead of once per pair.
  boolean[] zeroRow = new boolean[rowCount];
  for (int i = 0; i < rowCount; i++) {
    Vector ei = currentEigens.getRow(i);
    zeroRow[i] = ei.norm(2) == 0;
    if (zeroRow[i]) {
      // Every pair involving a zero row is skipped, so there is nothing to check for this row.
      continue;
    }
    for (int j = 0; j <= i; j++) {
      if (zeroRow[j]) {
        continue;
      }
      Vector ej = currentEigens.getRow(j);
      double dot = ei.dot(ej);
      if (i == j) {
        assertTrue(
            "not norm 1 : " + dot + " (eigen #" + i + ')',
            Math.abs(1 - dot) < errorMargin);
      } else {
        assertTrue(
            "not orthogonal : " + dot + " (eigens " + i + ", " + j + ')',
            Math.abs(dot) < errorMargin);
      }
    }
  }
}
@Test public void testInitialization() { // start with super clusterable data List<? extends WeightedVector> data = cubishTestData(0.01); // just do initialization of ball k-means. This should drop a point into each of the clusters BallKMeans r = new BallKMeans(new BruteSearch(new EuclideanDistanceMeasure()), 6, 20); r.cluster(data); // put the centroids into a matrix Matrix x = new DenseMatrix(6, 5); int row = 0; for (Centroid c : r) { x.viewRow(row).assign(c.viewPart(0, 5)); row++; } // verify that each column looks right. Should contain zeros except for a single 6. final Vector columnNorms = x.aggregateColumns( new VectorFunction() { @Override public double apply(Vector f) { // return the sum of three discrepancy measures return Math.abs(f.minValue()) + Math.abs(f.maxValue() - 6) + Math.abs(f.norm(1) - 6); } }); // verify all errors are nearly zero assertEquals(0, columnNorms.norm(1) / columnNorms.size(), 0.1); // verify that the centroids are a permutation of the original ones SingularValueDecomposition svd = new SingularValueDecomposition(x); Vector s = svd.getS().viewDiagonal().assign(Functions.div(6)); assertEquals(5, s.getLengthSquared(), 0.05); assertEquals(5, s.norm(1), 0.05); }
/**
 * Recomputes {@code totalCorpusWeight} as the sum of {@code norm(1)} over the first
 * {@code numDocuments} rows of {@code corpusWeights}, counts the non-default entries of the
 * rows that contribute, and logs a one-line summary at info level.
 *
 * <p>Rows that are {@code null} or whose norm is exactly zero are ignored.
 */
private void postInitCorpus() {
  totalCorpusWeight = 0;
  int nonZeroEntries = 0;
  for (int doc = 0; doc < numDocuments; doc++) {
    Vector docWeights = corpusWeights.viewRow(doc);
    if (docWeights == null) {
      continue;
    }
    double docNorm = docWeights.norm(1);
    if (docNorm == 0) {
      continue;
    }
    nonZeroEntries += docWeights.getNumNondefaultElements();
    totalCorpusWeight += docNorm;
  }
  String summary = String.format(
      "Initializing corpus with %d docs, %d terms, %d nonzero entries, total termWeight %f",
      numDocuments, numTerms, nonZeroEntries, totalCorpusWeight);
  log.info(summary);
}
/**
 * Checks {@code ContinuousValueEncoder}: negative and positive values with the default probe
 * setting, the effect of using two probes, accumulation into an existing vector, and rejection
 * of non-numeric input.
 */
@Test
public void testAddToVector() {
  FeatureVectorEncoder encoder = new ContinuousValueEncoder("foo");

  // A negative value shows up as the minimum and contributes its magnitude to the norm.
  Vector negative = new DenseVector(20);
  encoder.addToVector("-123", negative);
  Assert.assertEquals(-123, negative.minValue(), 0);
  Assert.assertEquals(0, negative.maxValue(), 0);
  Assert.assertEquals(123, negative.norm(1), 0);

  // The matching positive value.
  Vector singleProbe = new DenseVector(20);
  encoder.addToVector("123", singleProbe);
  Assert.assertEquals(123, singleProbe.maxValue(), 0);
  Assert.assertEquals(0, singleProbe.minValue(), 0);
  Assert.assertEquals(123, singleProbe.norm(1), 0);

  // With two probes the same value should be written twice.
  Vector doubleProbe = new DenseVector(20);
  encoder.setProbes(2);
  encoder.addToVector("123", doubleProbe);
  Assert.assertEquals(123, doubleProbe.maxValue(), 0);
  Assert.assertEquals(2 * 123, doubleProbe.norm(1), 0);

  // Removing the single-probe encoding should leave exactly one copy of the value.
  Vector probeDifference = doubleProbe.minus(singleProbe);
  Assert.assertEquals(123, probeDifference.maxValue(), 0);
  Assert.assertEquals(123, probeDifference.norm(1), 0);

  // Encoding a different value with two probes and subtracting leaves the value difference at
  // both probe locations.
  Vector hundred = new DenseVector(20);
  encoder.setProbes(2);
  encoder.addToVector("100", hundred);
  Vector accumulator = doubleProbe.minus(hundred);
  Assert.assertEquals(23, accumulator.maxValue(), 0);
  Assert.assertEquals(2 * 23, accumulator.norm(1), 0);

  // Adding to a vector that already holds values accumulates at the same locations.
  encoder.addToVector("7", accumulator);
  Assert.assertEquals(30, accumulator.maxValue(), 0);
  Assert.assertEquals(2 * 30, accumulator.norm(1), 0);
  Assert.assertEquals(30, accumulator.get(10), 0);
  Assert.assertEquals(30, accumulator.get(18), 0);

  // Non-numeric input must be rejected with a NumberFormatException.
  try {
    encoder.addToVector("foobar", accumulator);
    Assert.fail("Should have noticed bad numeric format");
  } catch (NumberFormatException expected) {
    Assert.assertEquals("For input string: \"foobar\"", expected.getMessage());
  }
}