/**
   * PageRank example driver.
   *
   * <p>Builds a graph from the page and link data sets, replaces every edge value with that value
   * divided by the number joined in from the out-degree data set (keyed on the edge source), runs
   * {@link PageRank} on the re-weighted graph and prints the resulting vertices.
   *
   * @param args command-line arguments; not read by this method
   * @throws Exception propagated from building or executing the job
   */
  @SuppressWarnings("serial")
  public static void main(String[] args) throws Exception {

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Assemble the input graph from its vertex and edge data sets.
    final DataSet<Vertex<Long, Double>> pageVertices = getPagesDataSet(env);
    final DataSet<Edge<Long, Double>> linkEdges = getLinksDataSet(env);
    final Graph<Long, Double, Double> network =
        new Graph<Long, Double, Double>(pageVertices, linkEdges, env);

    final DataSet<Tuple2<Long, Long>> outDegrees = network.outDegrees();

    // Turns (edge value, joined out-degree) into the transition probability used as edge weight.
    final MapFunction<Tuple2<Double, Long>, Double> divideByOutDegree =
        new MapFunction<Tuple2<Double, Long>, Double>() {
          @Override
          public Double map(Tuple2<Double, Long> weightAndDegree) {
            return weightAndDegree.f0 / weightAndDegree.f1;
          }
        };

    final Graph<Long, Double, Double> weightedNetwork =
        network.joinWithEdgesOnSource(outDegrees, divideByOutDegree);

    final PageRank<Long> pageRank = new PageRank<Long>(numPages, DAMPENING_FACTOR, maxIterations);
    final DataSet<Vertex<Long, Double>> ranks = weightedNetwork.run(pageRank).getVertices();

    ranks.print();

    env.execute();
  }
  /**
   * Connected-components example driver based on {@link GSAConnectedComponents}.
   *
   * <p>Returns immediately when {@code parseParameters} rejects the arguments. Otherwise the graph
   * is built from the edge data set with each vertex value initialised to its own id, the
   * algorithm is run for {@code maxIterations}, and the result is either written as CSV to
   * {@code outputPath} (when {@code fileOutput} is set) or printed.
   *
   * @param args command-line arguments, forwarded to {@code parseParameters}
   * @throws Exception propagated from building or executing the job
   */
  @SuppressWarnings("serial")
  public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
      return;
    }

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    final DataSet<Edge<Long, NullValue>> edges = getEdgesDataSet(env);

    // Identity mapper: the initial value of a vertex is the vertex id itself.
    final MapFunction<Long, Long> idAsInitialValue =
        new MapFunction<Long, Long>() {
          @Override
          public Long map(Long vertexId) throws Exception {
            return vertexId;
          }
        };

    final Graph<Long, Long, NullValue> graph = Graph.fromDataSet(edges, idAsInitialValue, env);

    final GSAConnectedComponents<Long, Long, NullValue> connectedComponents =
        new GSAConnectedComponents<Long, Long, NullValue>(maxIterations);
    final DataSet<Vertex<Long, Long>> verticesWithMinIds = graph.run(connectedComponents);

    // emit result
    if (!fileOutput) {
      verticesWithMinIds.print();
      return;
    }

    verticesWithMinIds.writeAsCsv(outputPath, "\n", ",");

    // file sinks are lazy, so the job has to be started explicitly
    env.execute("Connected Components Example");
  }
Beispiel #3
0
  /**
   * KMeans example driver.
   *
   * <p>Parses the arguments with {@link ParameterTool}, prints the usage line, and builds a bulk
   * iteration over the centroids ({@code --iterations}, default 10). Inside the iteration each
   * point is assigned to a centroid, the assignments are reduced per key 0, and new centroids are
   * derived from the reduced records. After the loop the points are assigned once more against
   * the final centroids; the assignments are written as CSV to {@code --output} if that parameter
   * is present and printed otherwise.
   *
   * @param args command-line arguments in {@code --key value} form
   * @throws Exception propagated from building or executing the job
   */
  public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    System.out.println(
        "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>");

    // set up execution environment; registering the parameters makes them visible in the web UI
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params);

    // input data: read from the provided paths or fall back to default data
    final DataSet<Point> points = getPointDataSet(params, env);
    final DataSet<Centroid> centroids = getCentroidDataSet(params, env);

    // number of bulk iterations for the KMeans algorithm
    final int iterationCount = params.getInt("iterations", 10);
    final IterativeDataSet<Centroid> loop = centroids.iterate(iterationCount);

    // compute closest centroid for each point, using the loop's current centroids
    final DataSet<Tuple2<Integer, Point>> nearestInLoop =
        points.map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids");

    final DataSet<Centroid> newCentroids =
        nearestInLoop
            // count and sum point coordinates for each centroid
            .map(new CountAppender())
            .groupBy(0)
            .reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    final DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    // assign points to final clusters
    final DataSet<Tuple2<Integer, Point>> clusteredPoints =
        points.map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

    // emit result
    if (!params.has("output")) {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      clusteredPoints.print();
      return;
    }

    clusteredPoints.writeAsCsv(params.get("output"), "\n", " ");

    // file sinks are lazy, so the job has to be started explicitly
    env.execute("KMeans Example");
  }
  /**
   * Runs a two-step bulk iteration whose step function unions the iteration data set with itself
   * and passes the union through an {@code IdentityMapper}; the closed iteration is written to
   * {@code resultPath}.
   *
   * @throws Exception propagated from building or executing the job
   */
  @Override
  protected void testProgram() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // The source is read with parallelism 1.
    final DataSet<Record> source =
        env.readFile(new PointInFormat(), this.dataPath).setParallelism(1);

    final IterativeDataSet<Record> iteration = source.iterate(2);

    // Step function: union of the partial solution with itself, then an identity map.
    final DataSet<Record> selfUnion = iteration.union(iteration);
    final DataSet<Record> stepResult = selfUnion.map(new IdentityMapper());

    final DataSet<Record> iterationResult = iteration.closeWith(stepResult);
    iterationResult.write(new PointOutFormat(), this.resultPath);

    env.execute();
  }
  /**
   * Negative test: converting a data set with the field specification {@code "a, b as c, d"} must
   * end in an {@link ExpressionException}.
   *
   * <p>The test passes only if that exception is thrown somewhere in the body; the statements
   * after the conversion are presumably never reached.
   *
   * @throws Exception the expected {@link ExpressionException}, or any other failure
   */
  @Test(expected = ExpressionException.class)
  public void testOnlyFieldRefInAs() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final TableEnvironment tableEnv = new TableEnvironment();

    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    // Field specification under test.
    final String fieldSpec = "a, b as c, d";
    final Table table = tableEnv.fromDataSet(input, fieldSpec);

    final DataSet<Row> rows = tableEnv.toDataSet(table, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    expected = "sorry bro";
  }
  /**
   * Negative test: converting a data set with the field specification {@code "a, b, c, b"} (the
   * name {@code b} appears twice) must end in an {@link ExpressionException}.
   *
   * <p>The test passes only if that exception is thrown somewhere in the body; the statements
   * after the conversion are presumably never reached.
   *
   * @throws Exception the expected {@link ExpressionException}, or any other failure
   */
  @Test(expected = ExpressionException.class)
  public void testAsWithAmbiguousFields() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final TableEnvironment tableEnv = new TableEnvironment();

    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    // Field specification under test: "b" is listed twice.
    final String fieldSpec = "a, b, c, b";
    final Table table = tableEnv.fromDataSet(input, fieldSpec);

    final DataSet<Row> rows = tableEnv.toDataSet(table, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    expected = " today's not your day ";
  }
  /**
   * Names the three tuple fields {@code a}, {@code b}, {@code c} on conversion, selects all of
   * them, writes the rows as text to {@code resultPath} and records the expected file content in
   * {@code expected}.
   *
   * @throws Exception propagated from building or executing the job
   */
  @Test
  public void testSimpleSelectAllWithAs() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final TableEnvironment tableEnv = new TableEnvironment();

    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    final Table named = tableEnv.fromDataSet(input, "a,b,c");
    final Table selected = named.select("a, b, c");

    final DataSet<Row> rows = tableEnv.toDataSet(selected, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // One literal per expected output row.
    expected =
        "1,1,Hi\n"
            + "2,2,Hello\n"
            + "3,2,Hello world\n"
            + "4,3,Hello world, how are you?\n"
            + "5,3,I am fine.\n"
            + "6,3,Luke Skywalker\n"
            + "7,4,Comment#1\n"
            + "8,4,Comment#2\n"
            + "9,4,Comment#3\n"
            + "10,4,Comment#4\n"
            + "11,5,Comment#5\n"
            + "12,5,Comment#6\n"
            + "13,5,Comment#7\n"
            + "14,5,Comment#8\n"
            + "15,5,Comment#9\n"
            + "16,6,Comment#10\n"
            + "17,6,Comment#11\n"
            + "18,6,Comment#12\n"
            + "19,6,Comment#13\n"
            + "20,6,Comment#14\n"
            + "21,6,Comment#15\n";
  }
  /**
   * WebLogAnalysis example driver.
   *
   * <p>Filters documents, ranks and visits, joins the filtered documents with the filtered ranks
   * (document field 0 against rank field 1), and then co-groups that join result with the
   * filtered visits through {@code AntiJoinVisits}. The result is written as CSV to
   * {@code --output} if that parameter is present and printed otherwise.
   *
   * @param args command-line arguments in {@code --key value} form
   * @throws Exception propagated from building or executing the job
   */
  public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    System.out.println(
        "Usage: WebLogAnalysis --documents <path> --ranks <path> --visits <path> --output <path>");

    env.getConfig().setGlobalJobParameters(params);

    // get input data
    final DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
    final DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
    final DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

    // Retain documents with keywords, keeping only field 0
    final DataSet<Tuple2<String, String>> keywordDocs =
        documents.filter(new FilterDocByKeyWords());
    final DataSet<Tuple1<String>> filterDocs = keywordDocs.project(0);

    // Filter ranks by minimum rank
    final DataSet<Tuple3<Integer, String, Integer>> filterRanks =
        ranks.filter(new FilterByRank());

    // Filter visits by visit date, keeping only field 0
    final DataSet<Tuple2<String, String>> datedVisits = visits.filter(new FilterVisitsByDate());
    final DataSet<Tuple1<String>> filterVisits = datedVisits.project(0);

    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    final DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
        filterDocs.join(filterRanks).where(0).equalTo(1).projectSecond(0, 1, 2);

    // Anti-join URLs with visits, i.e., retain all URLs which have NOT been visited in a certain
    // time
    final DataSet<Tuple3<Integer, String, Integer>> result =
        joinDocsRanks.coGroup(filterVisits).where(1).equalTo(0).with(new AntiJoinVisits());

    // emit result
    if (!params.has("output")) {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      result.print();
      return;
    }

    result.writeAsCsv(params.get("output"), "\n", "|");
    // execute program
    env.execute("WebLogAnalysis Example");
  }
  /**
   * Runs a ten-step bulk iteration over the integers 1 to 8 in which the step function is a group
   * reduce over the original data set that receives the iteration data set as broadcast set
   * {@code "bc"}. The closed iteration is collected locally and its first element must be 8.
   *
   * @throws Exception propagated from building or executing the job
   */
  @Override
  protected void testProgram() throws Exception {

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(4);

    final DataSet<Integer> numbers = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    final IterativeDataSet<Integer> iteration = numbers.iterate(10);

    // The step function reads the static input and sees the partial solution only via broadcast.
    final DataSet<Integer> stepResult =
        numbers.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    final DataSet<Integer> iterationResult = iteration.closeWith(stepResult);
    iterationResult.output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    final int firstCollected = resultList.get(0).intValue();
    Assert.assertEquals(8, firstCollected);
  }
  /**
   * Converts the 3-tuple data set without explicit field names, renames {@code f0} and
   * {@code f1} to {@code a} and {@code b} in a first select, selects both again by their new
   * names, writes the rows as text to {@code resultPath} and records the expected file content in
   * {@code expected}.
   *
   * @throws Exception propagated from building or executing the job
   */
  @Test
  public void testSimpleSelectWithNaming() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final TableEnvironment tableEnv = new TableEnvironment();

    final DataSet<Tuple3<Integer, Long, String>> input = CollectionDataSets.get3TupleDataSet(env);

    final Table table = tableEnv.fromDataSet(input);

    // First select introduces the aliases, second select refers to them.
    final Table renamed = table.select("f0 as a, f1 as b");
    final Table selected = renamed.select("a, b");

    final DataSet<Row> rows = tableEnv.toDataSet(selected, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // One literal per expected output row.
    expected =
        "1,1\n"
            + "2,2\n"
            + "3,2\n"
            + "4,3\n"
            + "5,3\n"
            + "6,3\n"
            + "7,4\n"
            + "8,4\n"
            + "9,4\n"
            + "10,4\n"
            + "11,5\n"
            + "12,5\n"
            + "13,5\n"
            + "14,5\n"
            + "15,5\n"
            + "16,6\n"
            + "17,6\n"
            + "18,6\n"
            + "19,6\n"
            + "20,6\n"
            + "21,6\n";
  }
  /**
   * Submits a small job to a remote cluster: reads integers through {@code CustomInputFormat},
   * maps each value {@code v} to the pair {@code (v, v * 0.5)} and discards the output.
   *
   * @param args exactly three positional arguments are read: {@code args[0]} the jar file handed
   *     to the remote environment, {@code args[1]} the host, {@code args[2]} the port
   * @throws IllegalArgumentException if fewer than three arguments are supplied, or (as
   *     {@link NumberFormatException}) if the port is not a valid integer
   * @throws Exception propagated from building or executing the job
   */
  public static void main(String[] args) throws Exception {

    // Fail with a descriptive message instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 3) {
      throw new IllegalArgumentException(
          "Expected arguments: <jarFile> <host> <port>, but got "
              + (args == null ? 0 : args.length)
              + " argument(s)");
    }

    final String jarFile = args[0];
    final String host = args[1];
    final int port = Integer.parseInt(args[2]);

    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(host, port, jarFile);

    DataSet<Integer> data = env.createInput(new CustomInputFormat());

    data.map(
            new MapFunction<Integer, Tuple2<Integer, Double>>() {
              @Override
              public Tuple2<Integer, Double> map(Integer value) {
                return new Tuple2<Integer, Double>(value, value * 0.5);
              }
            })
        .output(new DiscardingOutputFormat<Tuple2<Integer, Double>>());

    env.execute();
  }