@Test public void testGroupByColumn() { List<String> data = Arrays.asList("a", "foo", "bar"); Dataset<String> ds = context.createDataset(data, Encoders.STRING()); GroupedDataset<Integer, String> grouped = ds.groupBy(length(col("value"))).keyAs(Encoders.INT()); Dataset<String> mapped = grouped.mapGroups( new MapGroupsFunction<Integer, String, String>() { @Override public String call(Integer key, Iterator<String> data) throws Exception { StringBuilder sb = new StringBuilder(key.toString()); while (data.hasNext()) { sb.append(data.next()); } return sb.toString(); } }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("1a", "3foobar"), mapped.collectAsList()); }
@Test public void testGroupBy() { List<String> data = Arrays.asList("a", "foo", "bar"); Dataset<String> ds = context.createDataset(data, Encoders.STRING()); GroupedDataset<Integer, String> grouped = ds.groupBy( new MapFunction<String, Integer>() { @Override public Integer call(String v) throws Exception { return v.length(); } }, Encoders.INT()); Dataset<String> mapped = grouped.mapGroups( new MapGroupsFunction<Integer, String, String>() { @Override public String call(Integer key, Iterator<String> values) throws Exception { StringBuilder sb = new StringBuilder(key.toString()); while (values.hasNext()) { sb.append(values.next()); } return sb.toString(); } }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("1a", "3foobar"), mapped.collectAsList()); Dataset<String> flatMapped = grouped.flatMapGroups( new FlatMapGroupsFunction<Integer, String, String>() { @Override public Iterable<String> call(Integer key, Iterator<String> values) throws Exception { StringBuilder sb = new StringBuilder(key.toString()); while (values.hasNext()) { sb.append(values.next()); } return Collections.singletonList(sb.toString()); } }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("1a", "3foobar"), flatMapped.collectAsList()); Dataset<Tuple2<Integer, String>> reduced = grouped.reduce( new ReduceFunction<String>() { @Override public String call(String v1, String v2) throws Exception { return v1 + v2; } }); Assert.assertEquals( Arrays.asList(tuple2(1, "a"), tuple2(3, "foobar")), reduced.collectAsList()); List<Integer> data2 = Arrays.asList(2, 6, 10); Dataset<Integer> ds2 = context.createDataset(data2, Encoders.INT()); GroupedDataset<Integer, Integer> grouped2 = ds2.groupBy( new MapFunction<Integer, Integer>() { @Override public Integer call(Integer v) throws Exception { return v / 2; } }, Encoders.INT()); Dataset<String> cogrouped = grouped.cogroup( grouped2, new CoGroupFunction<Integer, String, Integer, String>() { @Override public Iterable<String> call( Integer key, Iterator<String> left, Iterator<Integer> right) throws Exception { StringBuilder sb = new StringBuilder(key.toString()); while (left.hasNext()) { sb.append(left.next()); } sb.append("#"); while (right.hasNext()) { sb.append(right.next()); } return Collections.singletonList(sb.toString()); } }, Encoders.STRING()); Assert.assertEquals(Arrays.asList("1a#2", "3foobar#6", "5#10"), cogrouped.collectAsList()); }