@SuppressWarnings("serial") public static void main(String[] args) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Vertex<Long, Double>> pages = getPagesDataSet(env); DataSet<Edge<Long, Double>> links = getLinksDataSet(env); Graph<Long, Double, Double> network = new Graph<Long, Double, Double>(pages, links, env); DataSet<Tuple2<Long, Long>> vertexOutDegrees = network.outDegrees(); // assign the transition probabilities as the edge weights Graph<Long, Double, Double> networkWithWeights = network.joinWithEdgesOnSource( vertexOutDegrees, new MapFunction<Tuple2<Double, Long>, Double>() { public Double map(Tuple2<Double, Long> value) { return value.f0 / value.f1; } }); DataSet<Vertex<Long, Double>> pageRanks = networkWithWeights .run(new PageRank<Long>(numPages, DAMPENING_FACTOR, maxIterations)) .getVertices(); pageRanks.print(); env.execute(); }
@SuppressWarnings("serial") public static void main(String[] args) throws Exception { if (!parseParameters(args)) { return; } ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Edge<Long, NullValue>> edges = getEdgesDataSet(env); Graph<Long, Long, NullValue> graph = Graph.fromDataSet( edges, new MapFunction<Long, Long>() { @Override public Long map(Long value) throws Exception { return value; } }, env); DataSet<Vertex<Long, Long>> verticesWithMinIds = graph.run(new GSAConnectedComponents<Long, Long, NullValue>(maxIterations)); // emit result if (fileOutput) { verticesWithMinIds.writeAsCsv(outputPath, "\n", ","); // since file sinks are lazy, we trigger the execution explicitly env.execute("Connected Components Example"); } else { verticesWithMinIds.print(); } }
public static void main(String[] args) throws Exception { // Checking input parameters final ParameterTool params = ParameterTool.fromArgs(args); System.out.println( "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>"); // set up execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.getConfig() .setGlobalJobParameters(params); // make parameters available in the web interface // get input data: // read the points and centroids from the provided paths or fall back to default data DataSet<Point> points = getPointDataSet(params, env); DataSet<Centroid> centroids = getCentroidDataSet(params, env); // set number of bulk iterations for KMeans algorithm IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10)); DataSet<Centroid> newCentroids = points // compute closest centroid for each point .map(new SelectNearestCenter()) .withBroadcastSet(loop, "centroids") // count and sum point coordinates for each centroid .map(new CountAppender()) .groupBy(0) .reduce(new CentroidAccumulator()) // compute new centroids from point counts and coordinate sums .map(new CentroidAverager()); // feed new centroids back into next iteration DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids); DataSet<Tuple2<Integer, Point>> clusteredPoints = points // assign points to final clusters .map(new SelectNearestCenter()) .withBroadcastSet(finalCentroids, "centroids"); // emit result if (params.has("output")) { clusteredPoints.writeAsCsv(params.get("output"), "\n", " "); // since file sinks are lazy, we trigger the execution explicitly env.execute("KMeans Example"); } else { System.out.println("Printing result to stdout. Use --output to specify output path."); clusteredPoints.print(); } }
/**
 * Test program: reads records from {@code dataPath} with parallelism 1, runs a bulk
 * iteration of two steps whose body unions the iteration data set with itself and maps
 * it through {@code IdentityMapper}, then writes the closed iteration to
 * {@code resultPath}.
 *
 * @throws Exception if building or executing the program fails
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Record> source = env.readFile(new PointInFormat(), dataPath).setParallelism(1);

    IterativeDataSet<Record> iteration = source.iterate(2);

    // iteration body: union of the partial solution with itself, passed through a mapper
    DataSet<Record> selfUnion = iteration.union(iteration);
    DataSet<Record> stepResult = selfUnion.map(new IdentityMapper());

    DataSet<Record> closed = iteration.closeWith(stepResult);
    closed.write(new PointOutFormat(), resultPath);

    env.execute();
}
/**
 * Converts a 3-tuple data set into a table using the field expression
 * {@code "a, b as c, d"} and expects an {@code ExpressionException} to be thrown
 * somewhere in this method.
 *
 * @throws Exception the expected {@code ExpressionException}, or any other failure
 */
@Test(expected = ExpressionException.class)
public void testOnlyFieldRefInAs() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    TableEnvironment tableEnv = new TableEnvironment();

    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    Table table = tableEnv.fromDataSet(tuples, "a, b as c, d");

    DataSet<Row> rows = tableEnv.toDataSet(table, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // Only reached if no exception was thrown above.
    expected = "sorry bro";
}
/**
 * Converts a 3-tuple data set into a table using the field expression
 * {@code "a, b, c, b"} (the name {@code b} appears twice) and expects an
 * {@code ExpressionException} to be thrown somewhere in this method.
 *
 * @throws Exception the expected {@code ExpressionException}, or any other failure
 */
@Test(expected = ExpressionException.class)
public void testAsWithAmbiguousFields() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    TableEnvironment tableEnv = new TableEnvironment();

    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    Table table = tableEnv.fromDataSet(tuples, "a, b, c, b");

    DataSet<Row> rows = tableEnv.toDataSet(table, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // Only reached if no exception was thrown above.
    expected = " today's not your day ";
}
/**
 * Names the three fields of the 3-tuple data set {@code a}, {@code b}, {@code c},
 * selects all of them again, writes the rows as text to {@code resultPath} and stores
 * the expected file content in {@code expected}.
 *
 * @throws Exception if building or executing the program fails
 */
@Test
public void testSimpleSelectAllWithAs() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    TableEnvironment tableEnv = new TableEnvironment();

    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    Table named = tableEnv.fromDataSet(tuples, "a,b,c");
    Table selected = named.select("a, b, c");

    DataSet<Row> rows = tableEnv.toDataSet(selected, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // One literal per expected output row.
    expected =
            "1,1,Hi\n"
                    + "2,2,Hello\n"
                    + "3,2,Hello world\n"
                    + "4,3,Hello world, how are you?\n"
                    + "5,3,I am fine.\n"
                    + "6,3,Luke Skywalker\n"
                    + "7,4,Comment#1\n"
                    + "8,4,Comment#2\n"
                    + "9,4,Comment#3\n"
                    + "10,4,Comment#4\n"
                    + "11,5,Comment#5\n"
                    + "12,5,Comment#6\n"
                    + "13,5,Comment#7\n"
                    + "14,5,Comment#8\n"
                    + "15,5,Comment#9\n"
                    + "16,6,Comment#10\n"
                    + "17,6,Comment#11\n"
                    + "18,6,Comment#12\n"
                    + "19,6,Comment#13\n"
                    + "20,6,Comment#14\n"
                    + "21,6,Comment#15\n";
}
public static void main(String[] args) throws Exception { final ParameterTool params = ParameterTool.fromArgs(args); final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); System.out.println( "Usage: WebLogAnalysis --documents <path> --ranks <path> --visits <path> --output <path>"); env.getConfig().setGlobalJobParameters(params); // get input data DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params); DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params); DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params); // Retain documents with keywords DataSet<Tuple1<String>> filterDocs = documents.filter(new FilterDocByKeyWords()).project(0); // Filter ranks by minimum rank DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks.filter(new FilterByRank()); // Filter visits by visit date DataSet<Tuple1<String>> filterVisits = visits.filter(new FilterVisitsByDate()).project(0); // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks = filterDocs.join(filterRanks).where(0).equalTo(1).projectSecond(0, 1, 2); // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain // time DataSet<Tuple3<Integer, String, Integer>> result = joinDocsRanks.coGroup(filterVisits).where(1).equalTo(0).with(new AntiJoinVisits()); // emit result if (params.has("output")) { result.writeAsCsv(params.get("output"), "\n", "|"); // execute program env.execute("WebLogAnalysis Example"); } else { System.out.println("Printing result to stdout. Use --output to specify output path."); result.print(); } }
/**
 * Test program: starts a bulk iteration of ten steps over the integers 1..8 whose step
 * function group-reduces the original input with {@code PickOneAllReduce} while
 * receiving the iteration data set as broadcast set {@code "bc"}. The output is
 * collected locally and its first element is asserted to be 8.
 *
 * @throws Exception if building or executing the program fails
 */
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(4);

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);
    IterativeDataSet<Integer> iteration = data.iterate(10);

    // The reduce runs on the original input; the iteration set is only broadcast to it.
    DataSet<Integer> stepResult =
            data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> collected = new ArrayList<Integer>();
    DataSet<Integer> closed = iteration.closeWith(stepResult);
    closed.output(new LocalCollectionOutputFormat<Integer>(collected));

    env.execute();

    Assert.assertEquals(8, collected.get(0).intValue());
}
/**
 * Renames fields {@code f0} and {@code f1} of the 3-tuple data set to {@code a} and
 * {@code b} in a first select, selects both again by their new names, writes the rows
 * as text to {@code resultPath} and stores the expected file content in
 * {@code expected}.
 *
 * @throws Exception if building or executing the program fails
 */
@Test
public void testSimpleSelectWithNaming() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    TableEnvironment tableEnv = new TableEnvironment();

    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    Table source = tableEnv.fromDataSet(tuples);
    Table renamed = source.select("f0 as a, f1 as b");
    Table selected = renamed.select("a, b");

    DataSet<Row> rows = tableEnv.toDataSet(selected, Row.class);
    rows.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);

    env.execute();

    // Rows are grouped here by their second column; the concatenated value is unchanged.
    expected =
            "1,1\n"
                    + "2,2\n3,2\n"
                    + "4,3\n5,3\n6,3\n"
                    + "7,4\n8,4\n9,4\n10,4\n"
                    + "11,5\n12,5\n13,5\n14,5\n15,5\n"
                    + "16,6\n17,6\n18,6\n19,6\n20,6\n21,6\n";
}
/**
 * Runs a small job on a remote environment: reads integers through
 * {@code CustomInputFormat}, maps each value to a pair of the value and half of it, and
 * discards the output.
 *
 * @param args {@code args[0]} is the jar file, {@code args[1]} the host and
 *     {@code args[2]} the port of the remote environment
 * @throws Exception if building or executing the program fails
 */
public static void main(String[] args) throws Exception {
    final String jarFile = args[0];
    final String host = args[1];
    final int port = Integer.parseInt(args[2]);

    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(host, port, jarFile);

    DataSet<Integer> numbers = env.createInput(new CustomInputFormat());

    // Pairs every input value with the value multiplied by 0.5.
    MapFunction<Integer, Tuple2<Integer, Double>> withHalf =
            new MapFunction<Integer, Tuple2<Integer, Double>>() {
                @Override
                public Tuple2<Integer, Double> map(Integer number) {
                    return new Tuple2<Integer, Double>(number, number * 0.5);
                }
            };

    numbers.map(withHalf).output(new DiscardingOutputFormat<Tuple2<Integer, Double>>());

    env.execute();
}