// public Vector encode(String type, List<String> data, String label) throws IOException public static org.apache.mahout.math.Vector encode(String label, List<String> data) throws IOException { FeatureVectorEncoder content_encoder = new AdaptiveWordValueEncoder("content"); content_encoder.setProbes(2); org.apache.mahout.math.Vector v = new RandomAccessSparseVector(100); for (String word : data) { content_encoder.addToVector(word, v); } return new NamedVector(v, label); }
/**
 * Exercises {@code ContinuousValueEncoder.addToVector} on 20-element dense vectors:
 * negative and positive numeric strings, one versus two probes, accumulation into a
 * vector that already holds values, and rejection of non-numeric input.
 */
@Test
public void testAddToVector() {
    FeatureVectorEncoder encoder = new ContinuousValueEncoder("foo");

    // A negative value, before any setProbes call: the L1 norm equals |value|.
    Vector negative = new DenseVector(20);
    encoder.addToVector("-123", negative);
    Assert.assertEquals(-123, negative.minValue(), 0);
    Assert.assertEquals(0, negative.maxValue(), 0);
    Assert.assertEquals(123, negative.norm(1), 0);

    // The same magnitude, positive, into a fresh vector.
    Vector singleProbe = new DenseVector(20);
    encoder.addToVector("123", singleProbe);
    Assert.assertEquals(123, singleProbe.maxValue(), 0);
    Assert.assertEquals(0, singleProbe.minValue(), 0);
    Assert.assertEquals(123, singleProbe.norm(1), 0);

    // With two probes the L1 norm is expected to double while the max stays the same.
    Vector doubleProbe = new DenseVector(20);
    encoder.setProbes(2);
    encoder.addToVector("123", doubleProbe);
    Assert.assertEquals(123, doubleProbe.maxValue(), 0);
    Assert.assertEquals(2 * 123, doubleProbe.norm(1), 0);

    // Subtracting the earlier encoding is expected to leave exactly one 123 behind.
    Vector probeDifference = doubleProbe.minus(singleProbe);
    Assert.assertEquals(123, probeDifference.maxValue(), 0);
    Assert.assertEquals(123, probeDifference.norm(1), 0);

    // Two-probe encodings of 123 and 100 are expected to differ by 23 in two positions.
    Vector hundred = new DenseVector(20);
    encoder.setProbes(2);
    encoder.addToVector("100", hundred);
    Vector remainder = doubleProbe.minus(hundred);
    Assert.assertEquals(23, remainder.maxValue(), 0);
    Assert.assertEquals(2 * 23, remainder.norm(1), 0);

    // Adding 7 to the non-empty vector should accumulate at indices 10 and 18.
    encoder.addToVector("7", remainder);
    Assert.assertEquals(30, remainder.maxValue(), 0);
    Assert.assertEquals(2 * 30, remainder.norm(1), 0);
    Assert.assertEquals(30, remainder.get(10), 0);
    Assert.assertEquals(30, remainder.get(18), 0);

    // Non-numeric input must be rejected with the standard number-parsing message.
    try {
        encoder.addToVector("foobar", remainder);
        Assert.fail("Should have noticed bad numeric format");
    } catch (NumberFormatException e) {
        Assert.assertEquals("For input string: \"foobar\"", e.getMessage());
    }
}