/**
 * Runs a streaming word count over text received from a TCP socket.
 *
 * <p>Lines read from {@code host:port} are split on single spaces, grouped by word, and the
 * complete table of running counts is written to the console sink. The method blocks until
 * the streaming query terminates.
 *
 * @param args {@code args[0]} is the host to connect to, {@code args[1]} the port number
 * @throws Exception propagated from starting or awaiting the streaming query
 */
public static void main(String[] args) throws Exception {
    // Usage text names the application the same way appName(...) below does.
    final String usage = "Usage: JavaStructuredNetworkWordCount <hostname> <port>";

    if (args.length < 2) {
      System.err.println(usage);
      System.exit(1);
    }

    String host = args[0];
    int port;
    try {
      port = Integer.parseInt(args[1]);
    } catch (NumberFormatException e) {
      // Report a malformed port as a usage error instead of an uncaught stack trace.
      System.err.println("Invalid port: " + args[1]);
      System.err.println(usage);
      System.exit(1);
      return; // unreachable at runtime; lets the compiler see 'port' as definitely assigned
    }

    SparkSession spark =
        SparkSession.builder().appName("JavaStructuredNetworkWordCount").getOrCreate();

    // Create DataFrame representing the stream of input lines from connection to host:port
    Dataset<String> lines =
        spark
            .readStream()
            .format("socket")
            .option("host", host)
            .option("port", port)
            .load()
            .as(Encoders.STRING());

    // Split the lines into words (a single space is the only delimiter)
    Dataset<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String x) {
                return Arrays.asList(x.split(" ")).iterator();
              }
            },
            Encoders.STRING());

    // Generate running word count
    Dataset<Row> wordCounts = words.groupBy("value").count();

    // Start running the query that prints the running counts to the console
    StreamingQuery query =
        wordCounts.writeStream().outputMode("complete").format("console").start();

    // Block until the query stops
    query.awaitTermination();
  }
Example #2
0
  /**
   * Groups strings by the length of their {@code value} column and verifies that
   * {@code mapGroups} produces, for each group, the key followed by the group's members.
   */
  @Test
  public void testGroupByColumn() {
    List<String> input = Arrays.asList("a", "foo", "bar");
    Dataset<String> ds = context.createDataset(input, Encoders.STRING());

    // Key every row by the length of its string value.
    GroupedDataset<Integer, String> byLength =
        ds.groupBy(length(col("value"))).keyAs(Encoders.INT());

    // Renders one group as its key immediately followed by each member in iteration order.
    MapGroupsFunction<Integer, String, String> joinGroup =
        new MapGroupsFunction<Integer, String, String>() {
          @Override
          public String call(Integer key, Iterator<String> members) throws Exception {
            StringBuilder joined = new StringBuilder(key.toString());
            while (members.hasNext()) {
              joined.append(members.next());
            }
            return joined.toString();
          }
        };

    Dataset<String> mapped = byLength.mapGroups(joinGroup, Encoders.STRING());

    List<String> expected = Arrays.asList("1a", "3foobar");
    Assert.assertEquals(expected, mapped.collectAsList());
  }
Example #3
0
  /**
   * Exercises grouping by a key function and the operations available on the grouped result:
   * {@code mapGroups}, {@code flatMapGroups}, {@code reduce}, and {@code cogroup} against a
   * second grouped dataset of integers.
   */
  @Test
  public void testGroupBy() {
    List<String> words = Arrays.asList("a", "foo", "bar");
    Dataset<String> ds = context.createDataset(words, Encoders.STRING());

    // Key each string by its length.
    MapFunction<String, Integer> lengthOf =
        new MapFunction<String, Integer>() {
          @Override
          public Integer call(String v) throws Exception {
            return v.length();
          }
        };
    GroupedDataset<Integer, String> grouped = ds.groupBy(lengthOf, Encoders.INT());

    // mapGroups: one string per group, the key followed by every member.
    MapGroupsFunction<Integer, String, String> joinGroup =
        new MapGroupsFunction<Integer, String, String>() {
          @Override
          public String call(Integer key, Iterator<String> members) throws Exception {
            StringBuilder joined = new StringBuilder(key.toString());
            while (members.hasNext()) {
              joined.append(members.next());
            }
            return joined.toString();
          }
        };
    Dataset<String> mapped = grouped.mapGroups(joinGroup, Encoders.STRING());

    Assert.assertEquals(Arrays.asList("1a", "3foobar"), mapped.collectAsList());

    // flatMapGroups: same rendering, returned as a single-element list per group.
    FlatMapGroupsFunction<Integer, String, String> joinGroupAsList =
        new FlatMapGroupsFunction<Integer, String, String>() {
          @Override
          public Iterable<String> call(Integer key, Iterator<String> members) throws Exception {
            StringBuilder joined = new StringBuilder(key.toString());
            while (members.hasNext()) {
              joined.append(members.next());
            }
            return Collections.singletonList(joined.toString());
          }
        };
    Dataset<String> flatMapped = grouped.flatMapGroups(joinGroupAsList, Encoders.STRING());

    Assert.assertEquals(Arrays.asList("1a", "3foobar"), flatMapped.collectAsList());

    // reduce: concatenate the members of each group, keeping the key alongside the result.
    ReduceFunction<String> concat =
        new ReduceFunction<String>() {
          @Override
          public String call(String left, String right) throws Exception {
            return left + right;
          }
        };
    Dataset<Tuple2<Integer, String>> reduced = grouped.reduce(concat);

    Assert.assertEquals(
        Arrays.asList(tuple2(1, "a"), tuple2(3, "foobar")), reduced.collectAsList());

    // Second dataset, keyed by half of each value (2 -> 1, 6 -> 3, 10 -> 5).
    List<Integer> numbers = Arrays.asList(2, 6, 10);
    Dataset<Integer> numberDs = context.createDataset(numbers, Encoders.INT());
    MapFunction<Integer, Integer> halfOf =
        new MapFunction<Integer, Integer>() {
          @Override
          public Integer call(Integer v) throws Exception {
            return v / 2;
          }
        };
    GroupedDataset<Integer, Integer> grouped2 = numberDs.groupBy(halfOf, Encoders.INT());

    // cogroup: key, then the string-side members, a '#', then the integer-side members.
    CoGroupFunction<Integer, String, Integer, String> joinBothSides =
        new CoGroupFunction<Integer, String, Integer, String>() {
          @Override
          public Iterable<String> call(
              Integer key, Iterator<String> strings, Iterator<Integer> ints) throws Exception {
            StringBuilder joined = new StringBuilder(key.toString());
            while (strings.hasNext()) {
              joined.append(strings.next());
            }
            joined.append("#");
            while (ints.hasNext()) {
              joined.append(ints.next());
            }
            return Collections.singletonList(joined.toString());
          }
        };
    Dataset<String> cogrouped = grouped.cogroup(grouped2, joinBothSides, Encoders.STRING());

    Assert.assertEquals(Arrays.asList("1a#2", "3foobar#6", "5#10"), cogrouped.collectAsList());
  }