public Generator(ParameterTool pt) { this.payload = new byte[pt.getInt("payload")]; this.delay = pt.getInt("delay"); this.withFt = pt.has("ft"); this.latFreq = pt.getInt("latencyFreq"); this.sleepFreq = pt.getInt("sleepFreq"); }
public static void main(String[] args) throws Exception { // Checking input parameters final ParameterTool params = ParameterTool.fromArgs(args); System.out.println( "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>"); // set up execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.getConfig() .setGlobalJobParameters(params); // make parameters available in the web interface // get input data: // read the points and centroids from the provided paths or fall back to default data DataSet<Point> points = getPointDataSet(params, env); DataSet<Centroid> centroids = getCentroidDataSet(params, env); // set number of bulk iterations for KMeans algorithm IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10)); DataSet<Centroid> newCentroids = points // compute closest centroid for each point .map(new SelectNearestCenter()) .withBroadcastSet(loop, "centroids") // count and sum point coordinates for each centroid .map(new CountAppender()) .groupBy(0) .reduce(new CentroidAccumulator()) // compute new centroids from point counts and coordinate sums .map(new CentroidAverager()); // feed new centroids back into next iteration DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids); DataSet<Tuple2<Integer, Point>> clusteredPoints = points // assign points to final clusters .map(new SelectNearestCenter()) .withBroadcastSet(finalCentroids, "centroids"); // emit result if (params.has("output")) { clusteredPoints.writeAsCsv(params.get("output"), "\n", " "); // since file sinks are lazy, we trigger the execution explicitly env.execute("KMeans Example"); } else { System.out.println("Printing result to stdout. Use --output to specify output path."); clusteredPoints.print(); } }
private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet( ExecutionEnvironment env, ParameterTool params) { // Create DataSet for ranks relation (Rank, URL, Avg-Visit-Duration) if (params.has("ranks")) { return env.readCsvFile(params.get("ranks")) .fieldDelimiter("|") .types(Integer.class, String.class, Integer.class); } else { System.out.println("Executing WebLogAnalysis example with default ranks data set."); System.out.println("Use --ranks to specify file input."); return WebLogData.getRankDataSet(env); } }
private static DataSet<Tuple2<String, String>> getDocumentsDataSet( ExecutionEnvironment env, ParameterTool params) { // Create DataSet for documents relation (URL, Doc-Text) if (params.has("documents")) { return env.readCsvFile(params.get("documents")) .fieldDelimiter("|") .types(String.class, String.class); } else { System.out.println("Executing WebLogAnalysis example with default documents data set."); System.out.println("Use --documents to specify file input."); return WebLogData.getDocumentDataSet(env); } }
private static DataSet<Point> getPointDataSet(ParameterTool params, ExecutionEnvironment env) { DataSet<Point> points; if (params.has("points")) { // read points from CSV file points = env.readCsvFile(params.get("points")).fieldDelimiter(" ").pojoType(Point.class, "x", "y"); } else { System.out.println("Executing K-Means example with default point data set."); System.out.println("Use --points to specify file input."); points = KMeansData.getDefaultPointDataSet(env); } return points; }
/**
 * Provides the initial centroids.
 *
 * <p>When {@code --centroids} is given, the centroids are read from that path as a
 * space-delimited CSV file whose columns populate the {@code id}, {@code x} and {@code y}
 * fields of {@code Centroid}; otherwise a hint is printed and the default data set from
 * {@code KMeansData} is used.
 *
 * @param params parsed command-line parameters
 * @param env execution environment used to create the data set
 * @return the centroid data set
 */
private static DataSet<Centroid> getCentroidDataSet(
        ParameterTool params, ExecutionEnvironment env) {
    if (params.has("centroids")) {
        // read centroids from the CSV file
        final String centroidsPath = params.get("centroids");
        return env.readCsvFile(centroidsPath)
                .fieldDelimiter(" ")
                .pojoType(Centroid.class, "id", "x", "y");
    }

    System.out.println("Executing K-Means example with default centroid data set.");
    System.out.println("Use --centroids to specify file input.");
    return KMeansData.getDefaultCentroidDataSet(env);
}
public static void main(String[] args) throws Exception { final ParameterTool params = ParameterTool.fromArgs(args); final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); System.out.println( "Usage: WebLogAnalysis --documents <path> --ranks <path> --visits <path> --output <path>"); env.getConfig().setGlobalJobParameters(params); // get input data DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params); DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params); DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params); // Retain documents with keywords DataSet<Tuple1<String>> filterDocs = documents.filter(new FilterDocByKeyWords()).project(0); // Filter ranks by minimum rank DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks.filter(new FilterByRank()); // Filter visits by visit date DataSet<Tuple1<String>> filterVisits = visits.filter(new FilterVisitsByDate()).project(0); // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks = filterDocs.join(filterRanks).where(0).equalTo(1).projectSecond(0, 1, 2); // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain // time DataSet<Tuple3<Integer, String, Integer>> result = joinDocsRanks.coGroup(filterVisits).where(1).equalTo(0).with(new AntiJoinVisits()); // emit result if (params.has("output")) { result.writeAsCsv(params.get("output"), "\n", "|"); // execute program env.execute("WebLogAnalysis Example"); } else { System.out.println("Printing result to stdout. Use --output to specify output path."); result.print(); } }
public static void main(String[] args) throws Exception { ParameterTool pt = ParameterTool.fromArgs(args); int par = pt.getInt("para"); TopologyBuilder builder = new TopologyBuilder(); builder.setSpout("source0", new Generator(pt), pt.getInt("sourceParallelism")); int i = 0; for (; i < pt.getInt("repartitions", 1) - 1; i++) { System.out.println("adding source" + i + " --> source" + (i + 1)); builder .setBolt("source" + (i + 1), new RepartPassThroughBolt(pt), pt.getInt("sinkParallelism")) .fieldsGrouping("source" + i, new Fields("id")); } System.out.println("adding final source" + i + " --> sink"); builder .setBolt("sink", new Sink(pt), pt.getInt("sinkParallelism")) .fieldsGrouping("source" + i, new Fields("id")); Config conf = new Config(); conf.setDebug(false); // System.exit(1); if (!pt.has("local")) { conf.setNumWorkers(par); StormSubmitter.submitTopologyWithProgressBar( "throughput-" + pt.get("name", "no_name"), conf, builder.createTopology()); } else { conf.setMaxTaskParallelism(par); LocalCluster cluster = new LocalCluster(); cluster.submitTopology("throughput", conf, builder.createTopology()); Thread.sleep(300000); cluster.shutdown(); } }
public Sink(ParameterTool pt) throws UnknownHostException { this.pt = pt; this.withFT = pt.has("ft"); this.logfreq = pt.getInt("logfreq"); this.host = InetAddress.getLocalHost().getHostName(); }
public RepartPassThroughBolt(ParameterTool pt) { this.withFt = pt.has("ft"); }