private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet( ExecutionEnvironment env, ParameterTool params) { // Create DataSet for ranks relation (Rank, URL, Avg-Visit-Duration) if (params.has("ranks")) { return env.readCsvFile(params.get("ranks")) .fieldDelimiter("|") .types(Integer.class, String.class, Integer.class); } else { System.out.println("Executing WebLogAnalysis example with default ranks data set."); System.out.println("Use --ranks to specify file input."); return WebLogData.getRankDataSet(env); } }
private static DataSet<Tuple2<String, String>> getDocumentsDataSet( ExecutionEnvironment env, ParameterTool params) { // Create DataSet for documents relation (URL, Doc-Text) if (params.has("documents")) { return env.readCsvFile(params.get("documents")) .fieldDelimiter("|") .types(String.class, String.class); } else { System.out.println("Executing WebLogAnalysis example with default documents data set."); System.out.println("Use --documents to specify file input."); return WebLogData.getDocumentDataSet(env); } }
private static DataSet<Point> getPointDataSet(ParameterTool params, ExecutionEnvironment env) { DataSet<Point> points; if (params.has("points")) { // read points from CSV file points = env.readCsvFile(params.get("points")).fieldDelimiter(" ").pojoType(Point.class, "x", "y"); } else { System.out.println("Executing K-Means example with default point data set."); System.out.println("Use --points to specify file input."); points = KMeansData.getDefaultPointDataSet(env); } return points; }
/**
 * Builds a program plan consisting of a bulk iteration (10 iterations) over the
 * {@code bigFile} input whose step function joins the iteration's partial solution with the
 * {@code smallFile} input on field 0. The {@code smallFile} input is created outside the
 * iteration and is passed as the second (right) input of the join.
 *
 * <p>The join is always given the repartition-hash ship strategy hint. A local strategy
 * hint is added only when {@code strategy} is a non-empty string.
 *
 * @param strategy value for {@code Optimizer.HINT_LOCAL_STRATEGY}; an empty string means
 *     no local strategy hint is set
 * @return the plan created from the execution environment
 */
private Plan getTestPlanRightStatic(String strategy) {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    final DataSet<Tuple3<Long, Long, Long>> bigInput =
            env.readCsvFile("file://bigFile")
                    .types(Long.class, Long.class, Long.class)
                    .name("bigFile");
    final DataSet<Tuple3<Long, Long, Long>> smallInput =
            env.readCsvFile("file://smallFile")
                    .types(Long.class, Long.class, Long.class)
                    .name("smallFile");

    // Optimizer hints handed to the join inside the iteration.
    final Configuration joinHints = new Configuration();
    joinHints.setString(
            Optimizer.HINT_SHIP_STRATEGY, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_HASH);
    if (!strategy.isEmpty()) {
        joinHints.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);
    }

    final IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    // Step function: join the partial solution (left) with the small input (right).
    final DataSet<Tuple3<Long, Long, Long>> stepResult =
            iteration
                    .join(smallInput)
                    .where(0)
                    .equalTo(0)
                    .with(new DummyJoiner())
                    .name("DummyJoiner")
                    .withParameters(joinHints);

    final DataSet<Tuple3<Long, Long, Long>> result = iteration.closeWith(stepResult);
    result.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
/**
 * Supplies the centroid data set for the K-Means example.
 *
 * <p>If the {@code centroids} parameter is present, its value is read as a space-delimited
 * CSV file whose columns are mapped to the {@code id}, {@code x} and {@code y} fields of
 * {@code Centroid}. Otherwise a usage hint is printed to standard out and the data set
 * returned by {@code KMeansData.getDefaultCentroidDataSet(env)} is used.
 *
 * @param params parsed program parameters; only the {@code centroids} key is consulted
 * @param env execution environment used to create the data set
 * @return the centroid data set
 */
private static DataSet<Centroid> getCentroidDataSet(
        ParameterTool params, ExecutionEnvironment env) {
    if (!params.has("centroids")) {
        System.out.println("Executing K-Means example with default centroid data set.");
        System.out.println("Use --centroids to specify file input.");
        return KMeansData.getDefaultCentroidDataSet(env);
    }

    // Read centroids from the user-supplied CSV file.
    final String centroidsPath = params.get("centroids");
    return env.readCsvFile(centroidsPath)
            .fieldDelimiter(" ")
            .pojoType(Centroid.class, "id", "x", "y");
}
/**
 * Supplies the edge data set.
 *
 * <p>When {@code fileOutput} is set, edges are read from {@code edgeInputPath} as a
 * tab-delimited, newline-separated CSV file of (Long, Long) pairs, skipping lines that
 * start with {@code #}. Each pair becomes an {@code Edge} from the first to the second
 * value, carrying a {@code NullValue}. Otherwise the data set returned by
 * {@code ConnectedComponentsDefaultData.getDefaultEdgeDataSet(env)} is used.
 *
 * @param env execution environment used to create the data set
 * @return the edge data set
 */
@SuppressWarnings("serial")
private static DataSet<Edge<Long, NullValue>> getEdgesDataSet(ExecutionEnvironment env) {
    if (!fileOutput) {
        return ConnectedComponentsDefaultData.getDefaultEdgeDataSet(env);
    }

    final DataSet<Tuple2<Long, Long>> idPairs =
            env.readCsvFile(edgeInputPath)
                    .ignoreComments("#")
                    .fieldDelimiter("\t")
                    .lineDelimiter("\n")
                    .types(Long.class, Long.class);

    // Kept as an anonymous class (not a lambda) so the function's generic
    // signature stays as it was in the original code.
    final MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>> pairToEdge =
            new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {
                @Override
                public Edge<Long, NullValue> map(Tuple2<Long, Long> pair) throws Exception {
                    return new Edge<>(pair.f0, pair.f1, NullValue.getInstance());
                }
            };

    return idPairs.map(pairToEdge);
}