@Test
  public void testLuceneEncoding() throws Exception {
    // Encoder under test, tokenizing purely on whitespace.
    LuceneTextValueEncoder encoder = new LuceneTextValueEncoder("text");
    encoder.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_43));

    // Case 1: a short three-word text. The expectation is 6 distinct locations, each set to 1.
    Vector shortText = new DenseVector(200);
    encoder.addToVector("test1 and more", shortText);
    encoder.flush(1, shortText);
    assertEquals(6.0, shortText.norm(1), 0);
    assertEquals(1.0, shortText.maxValue(), 0);

    // Case 2: an empty text must leave the vector untouched.
    Vector emptyText = new DenseVector(200);
    encoder.addToVector("", emptyText);
    encoder.flush(1, emptyText);
    assertEquals(0.0, emptyText.norm(1), 0);
    assertEquals(0.0, emptyText.maxValue(), 0);

    // Case 3: a long text. Lucene's internal buffer length request is 4096, so build an input
    // that is larger than that to make sure it is handled.
    Vector longText = new DenseVector(200);
    StringBuilder tokens = new StringBuilder(5000);
    for (int n = 0; n < 1000; n++) {
      tokens.append("token_").append(n).append(' ');
    }
    encoder.addToVector(tokens.toString(), longText);
    encoder.flush(1, longText);
    assertEquals(2000.0, longText.norm(1), 0);
    assertEquals(19.0, longText.maxValue(), 0);
  }
Ejemplo n.º 2
0
 /**
  * Asserts that both matrices have the same number of columns and that every column of
  * {@code actual} either has an L1 norm of exactly zero or an L1 norm matching the corresponding
  * column of {@code expected} to within 1e-10.
  *
  * @param expected matrix providing the reference column norms
  * @param actual matrix being checked; must not be null
  */
 public void assertColumnNormsEqualOrZero(Matrix expected, Matrix actual) {
   assertNotNull(actual);
   assertEquals(expected.columnSize(), actual.columnSize());
   for (int col = 0; col < expected.columnSize(); col++) {
     Vector referenceColumn = expected.viewColumn(col);
     Vector checkedColumn = actual.viewColumn(col);
     assertNotNull(checkedColumn);
     double referenceNorm = referenceColumn.norm(1);
     double checkedNorm = checkedColumn.norm(1);
     // An all-zero column in the actual matrix is accepted without comparison.
     if (checkedNorm != 0) {
       assertEquals(referenceNorm, checkedNorm, 1e-10);
     }
   }
 }
  @Test
  public void testAddToVector() {
    TextValueEncoder textEncoder = new TextValueEncoder("text");

    // Unweighted encoding: the expectation is 6 distinct locations, each set to 1.
    Vector unweighted = new DenseVector(200);
    textEncoder.addToVector("test1 and more", unweighted);
    textEncoder.flush(1, unweighted);
    assertEquals(6.0, unweighted.norm(1), 0);
    assertEquals(1.0, unweighted.maxValue(), 0);

    // Switch the text encoder over to a dictionary-backed word encoder.
    StaticWordValueEncoder wordEncoder = new StaticWordValueEncoder("text");
    wordEncoder.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
    textEncoder.setWordEncoder(wordEncoder);

    // Encode the same text again, now going through the word encoder.
    Vector weighted = new DenseVector(200);
    textEncoder.addToVector("test1 and more", weighted);
    textEncoder.flush(1, weighted);

    // Feeding the words one by one to the word encoder must produce an identical vector.
    Vector wordByWord = new DenseVector(200);
    wordEncoder.addToVector("test1", wordByWord);
    wordEncoder.addToVector("and", wordByWord);
    wordEncoder.addToVector("more", wordByWord);
    assertEquals(0, wordByWord.minus(weighted).norm(1), 0);

    // The weighted encoding must touch the same locations as the unweighted one: since the
    // unweighted vector holds 1 at its set locations, the dot product equals the plain sum.
    assertEquals(wordByWord.zSum(), wordByWord.dot(unweighted), 0);
  }
 /**
  * Asserts that the non-zero rows of {@code currentEigens} are mutually orthonormal to within
  * {@code errorMargin}.
  *
  * <p>For every pair of rows (i, j) with j &lt;= i, rows whose 2-norm is exactly zero are skipped.
  * For the remaining pairs the dot product must differ from 1 by less than {@code errorMargin}
  * when i == j, and must have absolute value below {@code errorMargin} otherwise.
  *
  * @param currentEigens matrix whose rows are the vectors to check
  * @param errorMargin strict upper bound on the allowed deviation
  */
 public static void assertOrthonormal(Matrix currentEigens, double errorMargin) {
   for (int i = 0; i < currentEigens.numRows(); i++) {
     Vector ei = currentEigens.getRow(i);
     // Whether row i is zero does not depend on j, so test it once per row rather than once
     // per pair; a zero row contributes no assertions at all.
     if (ei.norm(2) == 0) {
       continue;
     }
     for (int j = 0; j <= i; j++) {
       Vector ej = currentEigens.getRow(j);
       if (ej.norm(2) == 0) {
         continue;
       }
       double dot = ei.dot(ej);
       if (i == j) {
         // Dot product of a row with itself: must be close to 1.
         assertTrue(
             "not norm 1 : " + dot + " (eigen #" + i + ')', (Math.abs(1 - dot) < errorMargin));
       } else {
         // Dot product of two different rows: must be close to 0.
         assertTrue(
             "not orthogonal : " + dot + " (eigens " + i + ", " + j + ')',
             Math.abs(dot) < errorMargin);
       }
     }
   }
 }
Ejemplo n.º 5
0
  @Test
  public void testInitialization() {
    // Begin with data that is extremely easy to cluster.
    List<? extends WeightedVector> points = cubishTestData(0.01);

    // Run ball k-means; the expectation is that one point lands in each of the clusters.
    BallKMeans clusterer = new BallKMeans(new BruteSearch(new EuclideanDistanceMeasure()), 6, 20);
    clusterer.cluster(points);

    // Copy the first five coordinates of every centroid into the rows of a matrix.
    Matrix centroids = new DenseMatrix(6, 5);
    int nextRow = 0;
    for (Centroid centroid : clusterer) {
      centroids.viewRow(nextRow++).assign(centroid.viewPart(0, 5));
    }

    // Per-column error: each column is expected to hold zeros except for a single 6, so the
    // minimum should be 0, the maximum 6 and the L1 norm 6. The three deviations are summed.
    VectorFunction columnError =
        new VectorFunction() {
          @Override
          public double apply(Vector column) {
            return Math.abs(column.minValue())
                + Math.abs(column.maxValue() - 6)
                + Math.abs(column.norm(1) - 6);
          }
        };
    final Vector columnErrors = centroids.aggregateColumns(columnError);
    // The mean column error must be close to zero.
    assertEquals(0, columnErrors.norm(1) / columnErrors.size(), 0.1);

    // Check that the centroids are a permutation of the original ones: after dividing by 6,
    // the singular values should have squared length 5 and L1 norm 5.
    SingularValueDecomposition decomposition = new SingularValueDecomposition(centroids);
    Vector scaledSingularValues = decomposition.getS().viewDiagonal().assign(Functions.div(6));
    assertEquals(5, scaledSingularValues.getLengthSquared(), 0.05);
    assertEquals(5, scaledSingularValues.norm(1), 0.05);
  }
 /**
  * Recomputes {@code totalCorpusWeight} as the sum of the L1 norms of the rows of
  * {@code corpusWeights}, counts the non-default elements of the rows that have non-zero norm,
  * and logs a one-line summary of the corpus.
  */
 private void postInitCorpus() {
   totalCorpusWeight = 0;
   int nonZeroEntries = 0;
   for (int doc = 0; doc < numDocuments; doc++) {
     Vector docRow = corpusWeights.viewRow(doc);
     if (docRow == null) {
       continue;
     }
     double rowWeight = docRow.norm(1);
     if (rowWeight == 0) {
       // Rows carrying no weight add nothing to either total.
       continue;
     }
     nonZeroEntries += docRow.getNumNondefaultElements();
     totalCorpusWeight += rowWeight;
   }
   String summaryFormat =
       "Initializing corpus with %d docs, %d terms, %d nonzero entries, total termWeight %f";
   log.info(
       String.format(summaryFormat, numDocuments, numTerms, nonZeroEntries, totalCorpusWeight));
 }
  @Test
  public void testAddToVector() {
    FeatureVectorEncoder encoder = new ContinuousValueEncoder("foo");

    // A negative value ends up as the minimum; nothing positive is written.
    Vector negative = new DenseVector(20);
    encoder.addToVector("-123", negative);
    Assert.assertEquals(-123, negative.minValue(), 0);
    Assert.assertEquals(0, negative.maxValue(), 0);
    Assert.assertEquals(123, negative.norm(1), 0);

    // A positive value ends up as the maximum; nothing negative is written.
    Vector singleProbe = new DenseVector(20);
    encoder.addToVector("123", singleProbe);
    Assert.assertEquals(123, singleProbe.maxValue(), 0);
    Assert.assertEquals(0, singleProbe.minValue(), 0);
    Assert.assertEquals(123, singleProbe.norm(1), 0);

    // With two probes the same value doubles the L1 norm while the maximum stays the same.
    Vector doubleProbe = new DenseVector(20);
    encoder.setProbes(2);
    encoder.addToVector("123", doubleProbe);
    Assert.assertEquals(123, doubleProbe.maxValue(), 0);
    Assert.assertEquals(2 * 123, doubleProbe.norm(1), 0);

    // Subtracting the earlier encoding leaves a single entry of 123.
    Vector probeDifference = doubleProbe.minus(singleProbe);
    Assert.assertEquals(123, probeDifference.maxValue(), 0);
    Assert.assertEquals(123, probeDifference.norm(1), 0);

    // Encoding 100 with two probes and subtracting leaves 23 in two locations.
    Vector hundred = new DenseVector(20);
    encoder.setProbes(2);
    encoder.addToVector("100", hundred);
    Vector remainder = doubleProbe.minus(hundred);
    Assert.assertEquals(23, remainder.maxValue(), 0);
    Assert.assertEquals(2 * 23, remainder.norm(1), 0);

    // Adding 7 on top accumulates into the same two locations, indices 10 and 18.
    encoder.addToVector("7", remainder);
    Assert.assertEquals(30, remainder.maxValue(), 0);
    Assert.assertEquals(2 * 30, remainder.norm(1), 0);
    Assert.assertEquals(30, remainder.get(10), 0);
    Assert.assertEquals(30, remainder.get(18), 0);

    // Non-numeric input must be rejected with a NumberFormatException.
    try {
      encoder.addToVector("foobar", remainder);
      Assert.fail("Should have noticed bad numeric format");
    } catch (NumberFormatException expected) {
      Assert.assertEquals("For input string: \"foobar\"", expected.getMessage());
    }
  }