@Test
public void testIdentityMapWithBasicType() throws Exception {
    /*
     * Test identity map with basic type
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<String> ds = CollectionDataSets.getStringDataSet(env);
    DataSet<String> identityMapDs = ds.map(new Mapper1());

    List<String> result = identityMapDs.collect();

    String expected =
            "Hi\n"
                    + "Hello\n"
                    + "Hello world\n"
                    + "Hello world, how are you?\n"
                    + "I am fine.\n"
                    + "Luke Skywalker\n"
                    + "Random comment\n"
                    + "LOL\n";

    compareResultAsText(result, expected);
}
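/*
 * Hedged sketch (the class is not shown in this snippet): Mapper1 is assumed to be
 * the identity MapFunction used by testIdentityMapWithBasicType above.
 */
private static class Mapper1 implements MapFunction<String, String> {
    private static final long serialVersionUID = 1L;

    @Override
    public String map(String value) throws Exception {
        // identity map: pass every record through unchanged
        return value;
    }
}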
@Test
public void testCrossLambda() {
    try {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Integer, String>> left =
                env.fromElements(
                        new Tuple2<Integer, String>(1, "hello"),
                        new Tuple2<Integer, String>(2, "what's"),
                        new Tuple2<Integer, String>(2, "up"));
        DataSet<Tuple2<Integer, String>> right =
                env.fromElements(
                        new Tuple2<Integer, String>(1, "not"),
                        new Tuple2<Integer, String>(1, "much"),
                        new Tuple2<Integer, String>(2, "really"));

        DataSet<Tuple2<Integer, String>> joined =
                left.cross(right)
                        .with((t, s) -> new Tuple2<Integer, String>(t.f0 + s.f0, t.f1 + " " + s.f1));
    } catch (UnsupportedLambdaExpressionException e) {
        // Success
        return;
    } catch (Exception e) {
        Assert.fail();
    }
}
@SuppressWarnings("serial") @Test public void testRun() throws Exception { int verticesCount = 5000; int edgesCount = verticesCount * 2; ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment(); environment.getConfig().disableSysoutLogging(); Graph<Long, Long, Long> graph = GraphGenerator.generateGraph(verticesCount, edgesCount, environment); PCConnectedComponents<Long, Long> algo = new PCConnectedComponents<>(verticesCount); List<Tuple2<Long, Long>> result = algo.run(graph) .map( new RichMapFunction<Vertex<Long, Long>, Tuple2<Long, Long>>() { @Override public Tuple2<Long, Long> map(Vertex<Long, Long> value) throws Exception { return new Tuple2<>(value.getId(), value.getValue()); } }) .collect(); ConnectedComponentsData.checkOddEvenResult(result); }
@Test
public void testAggregationTypes() {
    try {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple5<Integer, Long, String, Long, Integer>> tupleDs =
                env.fromCollection(emptyTupleData, tupleTypeInfo);

        // should work: multiple aggregates
        tupleDs.aggregate(Aggregations.SUM, 0).and(Aggregations.MIN, 4);

        // should work: nested aggregates
        tupleDs.aggregate(Aggregations.MIN, 2).aggregate(Aggregations.SUM, 1);

        // should not work: sum on a String field
        try {
            tupleDs.aggregate(Aggregations.SUM, 2);
            Assert.fail();
        } catch (UnsupportedAggregationTypeException e) {
            // we're good here
        }
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
@SuppressWarnings("serial") public static void main(String[] args) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Vertex<Long, Double>> pages = getPagesDataSet(env); DataSet<Edge<Long, Double>> links = getLinksDataSet(env); Graph<Long, Double, Double> network = new Graph<Long, Double, Double>(pages, links, env); DataSet<Tuple2<Long, Long>> vertexOutDegrees = network.outDegrees(); // assign the transition probabilities as the edge weights Graph<Long, Double, Double> networkWithWeights = network.joinWithEdgesOnSource( vertexOutDegrees, new MapFunction<Tuple2<Double, Long>, Double>() { public Double map(Tuple2<Double, Long> value) { return value.f0 / value.f1; } }); DataSet<Vertex<Long, Double>> pageRanks = networkWithWeights .run(new PageRank<Long>(numPages, DAMPENING_FACTOR, maxIterations)) .getVertices(); pageRanks.print(); env.execute(); }
@Test
public void testJoinWithTuples() {
    try {
        final Partitioner<Long> partitioner = new TestPartitionerLong();

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Long, Long>> input1 = env.fromElements(new Tuple2<Long, Long>(0L, 0L));
        DataSet<Tuple3<Long, Long, Long>> input2 =
                env.fromElements(new Tuple3<Long, Long, Long>(0L, 0L, 0L));

        input1
                .join(input2, JoinHint.REPARTITION_HASH_FIRST)
                .where(1)
                .equalTo(0)
                .withPartitioner(partitioner)
                .print();

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        SinkPlanNode sink = op.getDataSinks().iterator().next();
        DualInputPlanNode join = (DualInputPlanNode) sink.getInput().getSource();

        assertEquals(ShipStrategyType.PARTITION_CUSTOM, join.getInput1().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_CUSTOM, join.getInput2().getShipStrategy());
        assertEquals(partitioner, join.getInput1().getPartitioner());
        assertEquals(partitioner, join.getInput2().getPartitioner());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
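/*
 * Hedged sketches (not shown in this snippet): TestPartitionerLong / TestPartitionerInt
 * are assumed to be trivial Partitioner implementations; the plan tests above only check
 * that a custom partitioner is wired into the join, not how it distributes keys.
 */
private static class TestPartitionerLong implements Partitioner<Long> {
    private static final long serialVersionUID = 1L;

    @Override
    public int partition(Long key, int numPartitions) {
        // the concrete distribution is irrelevant for the optimizer tests
        return 0;
    }
}

private static class TestPartitionerInt implements Partitioner<Integer> {
    private static final long serialVersionUID = 1L;

    @Override
    public int partition(Integer key, int numPartitions) {
        return 0;
    }
}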
@SuppressWarnings("serial") public static void main(String[] args) throws Exception { if (!parseParameters(args)) { return; } ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Edge<Long, NullValue>> edges = getEdgesDataSet(env); Graph<Long, Long, NullValue> graph = Graph.fromDataSet( edges, new MapFunction<Long, Long>() { @Override public Long map(Long value) throws Exception { return value; } }, env); DataSet<Vertex<Long, Long>> verticesWithMinIds = graph.run(new GSAConnectedComponents<Long, Long, NullValue>(maxIterations)); // emit result if (fileOutput) { verticesWithMinIds.writeAsCsv(outputPath, "\n", ","); // since file sinks are lazy, we trigger the execution explicitly env.execute("Connected Components Example"); } else { verticesWithMinIds.print(); } }
@Test
public void testDistinctPreservesPartitioningOfDistinctFields() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        @SuppressWarnings("unchecked")
        DataSet<Tuple2<Long, Long>> data =
                env.fromElements(new Tuple2<Long, Long>(0L, 0L), new Tuple2<Long, Long>(1L, 1L))
                        .map(new IdentityMapper<Tuple2<Long, Long>>())
                        .setParallelism(4);

        data.distinct(0)
                .groupBy(0)
                .sum(1)
                .output(new DiscardingOutputFormat<Tuple2<Long, Long>>());

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        SinkPlanNode sink = op.getDataSinks().iterator().next();
        SingleInputPlanNode reducer = (SingleInputPlanNode) sink.getInput().getSource();
        SingleInputPlanNode distinctReducer = (SingleInputPlanNode) reducer.getInput().getSource();

        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());

        // reducer can be forward, reuses partitioning from distinct
        assertEquals(ShipStrategyType.FORWARD, reducer.getInput().getShipStrategy());

        // distinct reducer is partitioned
        assertEquals(ShipStrategyType.PARTITION_HASH, distinctReducer.getInput().getShipStrategy());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testJoinWithKeySelectorsWrongType() {
    try {
        final Partitioner<Long> partitioner = new TestPartitionerLong();

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Pojo2> input1 = env.fromElements(new Pojo2());
        DataSet<Pojo3> input2 = env.fromElements(new Pojo3());

        try {
            input1
                    .join(input2, JoinHint.REPARTITION_HASH_FIRST)
                    .where(new Pojo2KeySelector())
                    .equalTo(new Pojo3KeySelector())
                    .withPartitioner(partitioner);

            fail("should throw an exception");
        } catch (InvalidProgramException e) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testRejectWhenSolutionSetKeysDontMatchJoin() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        @SuppressWarnings("unchecked")
        DataSet<Tuple3<Double, Long, String>> initialSolutionSet =
                env.fromElements(new Tuple3<Double, Long, String>(3.44, 5L, "abc"));

        @SuppressWarnings("unchecked")
        DataSet<Tuple2<Double, String>> initialWorkSet =
                env.fromElements(new Tuple2<Double, String>(1.23, "abc"));

        DeltaIteration<Tuple3<Double, Long, String>, Tuple2<Double, String>> iteration =
                initialSolutionSet.iterateDelta(initialWorkSet, 10, 1);

        try {
            iteration.getWorkset().join(iteration.getSolutionSet()).where(1).equalTo(2);
            fail("Accepted invalid program.");
        } catch (InvalidProgramException e) {
            // all good!
        }

        try {
            iteration.getSolutionSet().join(iteration.getWorkset()).where(2).equalTo(1);
            fail("Accepted invalid program.");
        } catch (InvalidProgramException e) {
            // all good!
        }
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testJoinWithTuplesWrongType() {
    try {
        final Partitioner<Integer> partitioner = new TestPartitionerInt();

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Long, Long>> input1 = env.fromElements(new Tuple2<Long, Long>(0L, 0L));
        DataSet<Tuple3<Long, Long, Long>> input2 =
                env.fromElements(new Tuple3<Long, Long, Long>(0L, 0L, 0L));

        try {
            input1
                    .join(input2, JoinHint.REPARTITION_HASH_FIRST)
                    .where(1)
                    .equalTo(0)
                    .withPartitioner(partitioner);

            fail("should throw an exception");
        } catch (InvalidProgramException e) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testFieldsAggregate() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> tupleDs =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should work
    try {
        tupleDs.aggregate(Aggregations.SUM, 1);
    } catch (Exception e) {
        Assert.fail();
    }

    // should not work: index out of bounds
    try {
        tupleDs.aggregate(Aggregations.SUM, 10);
        Assert.fail();
    } catch (IllegalArgumentException iae) {
        // we're good here
    } catch (Exception e) {
        Assert.fail();
    }

    // should not work: not applied to a tuple data set
    DataSet<Long> longDs = env.fromCollection(emptyLongData, BasicTypeInfo.LONG_TYPE_INFO);
    try {
        longDs.aggregate(Aggregations.MIN, 1);
        Assert.fail();
    } catch (InvalidProgramException ipe) {
        // we're good here
    } catch (Exception e) {
        Assert.fail();
    }
}
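/*
 * Hedged sketch of the shared test fixtures referenced above (not shown in this snippet):
 * an empty input list plus the matching TupleTypeInfo, and an empty list of Longs. The
 * exact declarations in the original test class may differ.
 */
private final List<Tuple5<Integer, Long, String, Long, Integer>> emptyTupleData =
        new ArrayList<Tuple5<Integer, Long, String, Long, Integer>>();

private final TupleTypeInfo<Tuple5<Integer, Long, String, Long, Integer>> tupleTypeInfo =
        new TupleTypeInfo<Tuple5<Integer, Long, String, Long, Integer>>(
                BasicTypeInfo.INT_TYPE_INFO,
                BasicTypeInfo.LONG_TYPE_INFO,
                BasicTypeInfo.STRING_TYPE_INFO,
                BasicTypeInfo.LONG_TYPE_INFO,
                BasicTypeInfo.INT_TYPE_INFO);

private final List<Long> emptyLongData = new ArrayList<Long>();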
@Test
public void testJoinProjection6() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CustomType> ds1 = env.fromCollection(customTypeData);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should work
    try {
        ds1.join(ds2)
                .where(
                        new KeySelector<CustomType, Long>() {
                            @Override
                            public Long getKey(CustomType value) {
                                return value.myLong;
                            }
                        })
                .equalTo(
                        new KeySelector<CustomType, Long>() {
                            @Override
                            public Long getKey(CustomType value) {
                                return value.myLong;
                            }
                        })
                .projectFirst()
                .projectSecond()
                .types(CustomType.class, CustomType.class);
    } catch (Exception e) {
        System.out.println("FAILED: " + e);
        Assert.fail();
    }
}
@Test
public void testJoinKeySelectors1() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CustomType> ds1 = env.fromCollection(customTypeData);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should work
    try {
        ds1.join(ds2)
                .where(
                        new KeySelector<CustomType, Long>() {
                            @Override
                            public Long getKey(CustomType value) {
                                return value.myLong;
                            }
                        })
                .equalTo(
                        new KeySelector<CustomType, Long>() {
                            @Override
                            public Long getKey(CustomType value) {
                                return value.myLong;
                            }
                        });
    } catch (Exception e) {
        Assert.fail();
    }
}
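/*
 * Hedged sketch: CustomType (and the customTypeData collection) is assumed to be the
 * usual test POJO exposing the fields referenced above ("myInt", "myLong", "myString");
 * the exact class in the original test suite may carry additional members.
 */
public static class CustomType implements Serializable {
    private static final long serialVersionUID = 1L;

    public int myInt;
    public long myLong;
    public String myString;

    public CustomType() {}

    public CustomType(int i, long l, String s) {
        this.myInt = i;
        this.myLong = l;
        this.myString = s;
    }

    @Override
    public String toString() {
        return myInt + "," + myLong + "," + myString;
    }
}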
@Test(expected = IllegalArgumentException.class)
public void testJoinKeyExpressions4() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CustomType> ds1 = env.fromCollection(customTypeData);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should not work, join key non-existent
    ds1.join(ds2).where("myNonExistent").equalTo("myInt");
}
@Test(expected = InvalidProgramException.class)
public void testJoinKeyExpressions3() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CustomType> ds1 = env.fromCollection(customTypeData);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should not work, incompatible number of join keys
    ds1.join(ds2).where("myInt", "myString").equalTo("myString");
}
private void testRightOuterStrategies(JoinHint hint) {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should work
    ds1.rightOuterJoin(ds2, hint).where(0).equalTo(4).with(new DummyJoin());
}
@Test
public void testRightOuter4() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should work
    ds1.rightOuterJoin(ds2).where(0).equalTo(new IntKeySelector()).with(new DummyJoin());
}
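/*
 * Hedged sketches (not shown in this snippet): DummyJoin and IntKeySelector are assumed
 * to be the trivial helpers used by the outer-join API tests above.
 */
private static class DummyJoin
        implements JoinFunction<
                Tuple5<Integer, Long, String, Long, Integer>,
                Tuple5<Integer, Long, String, Long, Integer>,
                Long> {
    private static final long serialVersionUID = 1L;

    @Override
    public Long join(
            Tuple5<Integer, Long, String, Long, Integer> first,
            Tuple5<Integer, Long, String, Long, Integer> second) {
        // the result value is irrelevant for these API-validation tests
        return 1L;
    }
}

private static class IntKeySelector
        implements KeySelector<Tuple5<Integer, Long, String, Long, Integer>, Integer> {
    private static final long serialVersionUID = 1L;

    @Override
    public Integer getKey(Tuple5<Integer, Long, String, Long, Integer> value) {
        // key type must match the Integer field selected with where(0)
        return value.f0;
    }
}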
@Test(expected = IllegalArgumentException.class)
public void testJoinKeyFields6() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should not work, join key fields on custom type
    ds1.join(ds2).where(5).equalTo(0);
}
@Test(expected = CompositeType.InvalidFieldReferenceException.class)
public void testRightOuter8() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // invalid key reference
    ds1.rightOuterJoin(ds2).where(1).equalTo("f5").with(new DummyJoin());
}
@Test(expected = IllegalArgumentException.class)
public void testRightOuter7() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // invalid key position
    ds1.rightOuterJoin(ds2).where(5).equalTo(0).with(new DummyJoin());
}
@Test(expected = InvalidProgramException.class)
public void testRightOuter9() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // key types do not match
    ds1.rightOuterJoin(ds2).where(0).equalTo(1).with(new DummyJoin());
}
@Test(expected = IllegalArgumentException.class)
public void testJoinKeyFields5() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should not work, negative key field position
    ds1.join(ds2).where(-1).equalTo(-1);
}
@Test(expected = IllegalArgumentException.class)
public void testJoinProjection12() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should not work, number of types and fields does not match
    ds1.join(ds2).where(0).equalTo(0).projectSecond(2).projectFirst(1).types(String.class);
}
@Test(expected = IndexOutOfBoundsException.class)
public void testJoinProjection14() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should not work, index out of range
    ds1.join(ds2).where(0).equalTo(0).projectFirst(0).projectSecond(5).types(Integer.class);
}
@Test(expected = InvalidProgramException.class)
public void testJoinKeyFields3() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds1 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);
    DataSet<Tuple5<Integer, Long, String, Long, Integer>> ds2 =
            env.fromCollection(emptyTupleData, tupleTypeInfo);

    // should not work, incompatible number of join keys
    ds1.join(ds2).where(0, 1).equalTo(2);
}
public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    System.out.println(
            "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>");

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig()
            .setGlobalJobParameters(params); // make parameters available in the web interface

    // get input data:
    // read the points and centroids from the provided paths or fall back to default data
    DataSet<Point> points = getPointDataSet(params, env);
    DataSet<Centroid> centroids = getCentroidDataSet(params, env);

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10));

    DataSet<Centroid> newCentroids =
            points
                    // compute closest centroid for each point
                    .map(new SelectNearestCenter())
                    .withBroadcastSet(loop, "centroids")
                    // count and sum point coordinates for each centroid
                    .map(new CountAppender())
                    .groupBy(0)
                    .reduce(new CentroidAccumulator())
                    // compute new centroids from point counts and coordinate sums
                    .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    DataSet<Tuple2<Integer, Point>> clusteredPoints =
            points
                    // assign points to final clusters
                    .map(new SelectNearestCenter())
                    .withBroadcastSet(finalCentroids, "centroids");

    // emit result
    if (params.has("output")) {
        clusteredPoints.writeAsCsv(params.get("output"), "\n", " ");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("KMeans Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        clusteredPoints.print();
    }
}
@Test
public void testCustomPartitioningNotReused() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        @SuppressWarnings("unchecked")
        DataSet<Tuple3<Long, Long, Long>> input =
                env.fromElements(new Tuple3<Long, Long, Long>(0L, 0L, 0L));

        input
                .partitionCustom(
                        new Partitioner<Long>() {
                            @Override
                            public int partition(Long key, int numPartitions) {
                                return 0;
                            }
                        },
                        0)
                .map(new IdentityMapper<Tuple3<Long, Long, Long>>())
                .withForwardedFields("0", "1", "2")
                .groupBy(0, 1)
                .reduceGroup(new IdentityGroupReducerCombinable<Tuple3<Long, Long, Long>>())
                .withForwardedFields("0", "1", "2")
                .groupBy(1)
                .reduceGroup(new IdentityGroupReducerCombinable<Tuple3<Long, Long, Long>>())
                .output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        SinkPlanNode sink = op.getDataSinks().iterator().next();
        SingleInputPlanNode reducer2 = (SingleInputPlanNode) sink.getInput().getSource();
        SingleInputPlanNode combiner = (SingleInputPlanNode) reducer2.getInput().getSource();
        SingleInputPlanNode reducer1 = (SingleInputPlanNode) combiner.getInput().getSource();

        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());

        // the second reducer must re-partition; the custom partitioning is not reused
        assertEquals(ShipStrategyType.PARTITION_HASH, reducer2.getInput().getShipStrategy());
        assertEquals(LocalStrategy.COMBININGSORT, reducer2.getInput().getLocalStrategy());

        assertEquals(ShipStrategyType.FORWARD, combiner.getInput().getShipStrategy());
        assertEquals(LocalStrategy.NONE, combiner.getInput().getLocalStrategy());

        // the first reducer forwards locally, reusing the custom partitioning
        assertEquals(ShipStrategyType.FORWARD, reducer1.getInput().getShipStrategy());
        assertEquals(LocalStrategy.COMBININGSORT, reducer1.getInput().getLocalStrategy());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
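/*
 * Hedged sketch: IdentityMapper is assumed to be the pass-through MapFunction from the
 * optimizer test utilities; IdentityGroupReducerCombinable / IdentityGroupReducer are
 * assumed to be the analogous pass-through group reducers and are not sketched here.
 */
private static class IdentityMapper<T> implements MapFunction<T, T> {
    private static final long serialVersionUID = 1L;

    @Override
    public T map(T value) {
        // forward each record unchanged
        return value;
    }
}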
@Test
public void testIncompatibleHashAndCustomPartitioning() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple3<Long, Long, Long>> input =
                env.fromElements(new Tuple3<Long, Long, Long>(0L, 0L, 0L));

        DataSet<Tuple3<Long, Long, Long>> partitioned =
                input
                        .partitionCustom(
                                new Partitioner<Long>() {
                                    @Override
                                    public int partition(Long key, int numPartitions) {
                                        return 0;
                                    }
                                },
                                0)
                        .map(new IdentityMapper<Tuple3<Long, Long, Long>>())
                        .withForwardedFields("0", "1", "2");

        DataSet<Tuple3<Long, Long, Long>> grouped =
                partitioned
                        .distinct(0, 1)
                        .groupBy(1)
                        .sortGroup(0, Order.ASCENDING)
                        .reduceGroup(new IdentityGroupReducer<Tuple3<Long, Long, Long>>())
                        .withForwardedFields("0", "1");

        grouped
                .join(partitioned, JoinHint.REPARTITION_HASH_FIRST)
                .where(0)
                .equalTo(0)
                .with(new DummyFlatJoinFunction<Tuple3<Long, Long, Long>>())
                .print();

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        SinkPlanNode sink = op.getDataSinks().iterator().next();
        DualInputPlanNode join = (DualInputPlanNode) sink.getInput().getSource();

        assertEquals(ShipStrategyType.PARTITION_HASH, join.getInput1().getShipStrategy());
        assertTrue(
                join.getInput2().getShipStrategy() == ShipStrategyType.PARTITION_HASH
                        || join.getInput2().getShipStrategy() == ShipStrategyType.FORWARD);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testJoinKeyExpressions1() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CustomType> ds1 = env.fromCollection(customTypeData);
    DataSet<CustomType> ds2 = env.fromCollection(customTypeData);

    // should work
    try {
        ds1.join(ds2).where("myInt").equalTo("myInt");
    } catch (Exception e) {
        Assert.fail();
    }
}