Пример #1
0
  @Test
  public void testCommonOperation() {
    List<String> data = Arrays.asList("hello", "world");
    Dataset<String> ds = context.createDataset(data, Encoders.STRING());
    Assert.assertEquals("hello", ds.first());

    Dataset<String> filtered =
        ds.filter(
            new FilterFunction<String>() {
              @Override
              public boolean call(String v) throws Exception {
                return v.startsWith("h");
              }
            });
    Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList());

    Dataset<Integer> mapped =
        ds.map(
            new MapFunction<String, Integer>() {
              @Override
              public Integer call(String v) throws Exception {
                return v.length();
              }
            },
            Encoders.INT());
    Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList());

    Dataset<String> parMapped =
        ds.mapPartitions(
            new MapPartitionsFunction<String, String>() {
              @Override
              public Iterable<String> call(Iterator<String> it) throws Exception {
                List<String> ls = new LinkedList<String>();
                while (it.hasNext()) {
                  ls.add(it.next().toUpperCase());
                }
                return ls;
              }
            },
            Encoders.STRING());
    Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList());

    Dataset<String> flatMapped =
        ds.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                List<String> ls = new LinkedList<String>();
                for (char c : s.toCharArray()) {
                  ls.add(String.valueOf(c));
                }
                return ls;
              }
            },
            Encoders.STRING());
    Assert.assertEquals(
        Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
        flatMapped.collectAsList());
  }
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
      System.exit(1);
    }

    String host = args[0];
    int port = Integer.parseInt(args[1]);

    SparkSession spark =
        SparkSession.builder().appName("JavaStructuredNetworkWordCount").getOrCreate();

    // Create DataFrame representing the stream of input lines from connection to host:port
    Dataset<String> lines =
        spark
            .readStream()
            .format("socket")
            .option("host", host)
            .option("port", port)
            .load()
            .as(Encoders.STRING());

    // Split the lines into words
    Dataset<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String x) {
                return Arrays.asList(x.split(" ")).iterator();
              }
            },
            Encoders.STRING());

    // Generate running word count
    Dataset<Row> wordCounts = words.groupBy("value").count();

    // Start running the query that prints the running counts to the console
    StreamingQuery query =
        wordCounts.writeStream().outputMode("complete").format("console").start();

    query.awaitTermination();
  }