Esempio n. 1
0
  /**
   * Entry point of the K-Means example.
   *
   * <p>Reads points and centroids (from {@code --points} / {@code --centroids} or the defaults
   * supplied by the data-set helpers), runs a bulk iteration whose step count comes from {@code
   * --iterations} (default 10), assigns every point to one of the resulting centroids and either
   * writes the assignment as CSV to {@code --output} or prints it.
   *
   * @param args command-line arguments parsed with {@link ParameterTool}
   * @throws Exception propagated from job execution or result printing
   */
  public static void main(String[] args) throws Exception {

    // Parse the command line; the usage line is always printed
    final ParameterTool parameters = ParameterTool.fromArgs(args);
    System.out.println(
        "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>");

    // Obtain the execution environment and expose the parameters in the web interface
    ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
    environment.getConfig().setGlobalJobParameters(parameters);

    // Inputs come from the provided paths, or from default data when no path is given
    DataSet<Point> inputPoints = getPointDataSet(parameters, environment);
    DataSet<Centroid> initialCentroids = getCentroidDataSet(parameters, environment);

    // Open the bulk iteration over the centroids
    int iterationCount = parameters.getInt("iterations", 10);
    IterativeDataSet<Centroid> iteration = initialCentroids.iterate(iterationCount);

    // Step 1: tag every point with its closest centroid of the current iteration
    DataSet<Tuple2<Integer, Point>> assignedPoints =
        inputPoints.map(new SelectNearestCenter()).withBroadcastSet(iteration, "centroids");

    // Step 2: count and sum the point coordinates per centroid, then average them
    DataSet<Centroid> updatedCentroids =
        assignedPoints
            .map(new CountAppender())
            .groupBy(0)
            .reduce(new CentroidAccumulator())
            .map(new CentroidAverager());

    // Close the loop: the updated centroids are the input of the next iteration
    DataSet<Centroid> resultCentroids = iteration.closeWith(updatedCentroids);

    // Assign the points to the centroids produced by the iteration
    DataSet<Tuple2<Integer, Point>> clusteredPoints =
        inputPoints.map(new SelectNearestCenter()).withBroadcastSet(resultCentroids, "centroids");

    // Without an output path the result goes to stdout
    if (!parameters.has("output")) {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      clusteredPoints.print();
      return;
    }

    // File sinks are lazy, so the job has to be triggered explicitly after adding the sink
    clusteredPoints.writeAsCsv(parameters.get("output"), "\n", " ");
    environment.execute("KMeans Example");
  }
 /**
  * Provides the ranks relation (Rank, URL, Avg-Visit-Duration).
  *
  * <p>When {@code --ranks} is absent, the data set from {@code WebLogData} is returned and a hint
  * is printed; otherwise the given path is read as a {@code |}-delimited CSV file.
  *
  * @param env execution environment used to create the data set
  * @param params parsed command-line parameters
  * @return the ranks data set
  */
 private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet(
     ExecutionEnvironment env, ParameterTool params) {
   // No path given: fall back to the built-in data
   if (!params.has("ranks")) {
     System.out.println("Executing WebLogAnalysis example with default ranks data set.");
     System.out.println("Use --ranks to specify file input.");
     return WebLogData.getRankDataSet(env);
   }

   String ranksPath = params.get("ranks");
   return env.readCsvFile(ranksPath)
       .fieldDelimiter("|")
       .types(Integer.class, String.class, Integer.class);
 }
 /**
  * Provides the documents relation (URL, Doc-Text).
  *
  * <p>When {@code --documents} is absent, the data set from {@code WebLogData} is returned and a
  * hint is printed; otherwise the given path is read as a {@code |}-delimited CSV file.
  *
  * @param env execution environment used to create the data set
  * @param params parsed command-line parameters
  * @return the documents data set
  */
 private static DataSet<Tuple2<String, String>> getDocumentsDataSet(
     ExecutionEnvironment env, ParameterTool params) {
   // No path given: fall back to the built-in data
   if (!params.has("documents")) {
     System.out.println("Executing WebLogAnalysis example with default documents data set.");
     System.out.println("Use --documents to specify file input.");
     return WebLogData.getDocumentDataSet(env);
   }

   String documentsPath = params.get("documents");
   return env.readCsvFile(documentsPath)
       .fieldDelimiter("|")
       .types(String.class, String.class);
 }
Esempio n. 4
0
 /**
  * Provides the points to cluster.
  *
  * <p>With {@code --points} the file is read as a space-delimited CSV whose columns are mapped to
  * the {@code x} and {@code y} fields of {@code Point}; without it the default point data set from
  * {@code KMeansData} is used and a hint is printed.
  *
  * @param params parsed command-line parameters
  * @param env execution environment used to create the data set
  * @return the point data set
  */
 private static DataSet<Point> getPointDataSet(ParameterTool params, ExecutionEnvironment env) {
   if (params.has("points")) {
     // Read the points from the CSV file
     String pointsPath = params.get("points");
     return env.readCsvFile(pointsPath).fieldDelimiter(" ").pojoType(Point.class, "x", "y");
   }

   System.out.println("Executing K-Means example with default point data set.");
   System.out.println("Use --points to specify file input.");
   return KMeansData.getDefaultPointDataSet(env);
 }
Esempio n. 5
0
 /**
  * Provides the initial centroids.
  *
  * <p>With {@code --centroids} the file is read as a space-delimited CSV whose columns are mapped
  * to the {@code id}, {@code x} and {@code y} fields of {@code Centroid}; without it the default
  * centroid data set from {@code KMeansData} is used and a hint is printed.
  *
  * @param params parsed command-line parameters
  * @param env execution environment used to create the data set
  * @return the centroid data set
  */
 private static DataSet<Centroid> getCentroidDataSet(
     ParameterTool params, ExecutionEnvironment env) {
   if (params.has("centroids")) {
     // Read the centroids from the CSV file
     String centroidsPath = params.get("centroids");
     return env.readCsvFile(centroidsPath)
         .fieldDelimiter(" ")
         .pojoType(Centroid.class, "id", "x", "y");
   }

   System.out.println("Executing K-Means example with default centroid data set.");
   System.out.println("Use --centroids to specify file input.");
   return KMeansData.getDefaultCentroidDataSet(env);
 }
  /**
   * Entry point of the WebLogAnalysis example.
   *
   * <p>Filters the documents, ranks and visits relations, joins the filtered documents with the
   * filtered ranks on the URL, and keeps only the joined rows for which the co-group with the
   * filtered visits is handled by {@code AntiJoinVisits}. The result is written as {@code
   * |}-delimited CSV to {@code --output}, or printed when no output path is given.
   *
   * @param args command-line arguments parsed with {@link ParameterTool}
   * @throws Exception propagated from job execution or result printing
   */
  public static void main(String[] args) throws Exception {

    final ParameterTool parameters = ParameterTool.fromArgs(args);

    final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
    System.out.println(
        "Usage: WebLogAnalysis --documents <path> --ranks <path> --visits <path> --output <path>");

    environment.getConfig().setGlobalJobParameters(parameters);

    // Load the three input relations
    DataSet<Tuple2<String, String>> documentsInput = getDocumentsDataSet(environment, parameters);
    DataSet<Tuple3<Integer, String, Integer>> ranksInput = getRanksDataSet(environment, parameters);
    DataSet<Tuple2<String, String>> visitsInput = getVisitsDataSet(environment, parameters);

    // Documents that contain the keywords, reduced to their first field
    DataSet<Tuple2<String, String>> keywordDocs = documentsInput.filter(new FilterDocByKeyWords());
    DataSet<Tuple1<String>> docUrls = keywordDocs.project(0);

    // Ranks that satisfy the minimum rank
    DataSet<Tuple3<Integer, String, Integer>> qualifyingRanks =
        ranksInput.filter(new FilterByRank());

    // Visits that match the date filter, reduced to their first field
    DataSet<Tuple2<String, String>> datedVisits = visitsInput.filter(new FilterVisitsByDate());
    DataSet<Tuple1<String>> visitedUrls = datedVisits.project(0);

    // Join documents and ranks on the URL and keep the rank tuple, i.e. all URLs with min rank
    // and keywords
    DataSet<Tuple3<Integer, String, Integer>> rankedDocs =
        docUrls.join(qualifyingRanks).where(0).equalTo(1).projectSecond(0, 1, 2);

    // Anti-join with the visits: retain the URLs that have NOT been visited in a certain time
    DataSet<Tuple3<Integer, String, Integer>> unvisited =
        rankedDocs.coGroup(visitedUrls).where(1).equalTo(0).with(new AntiJoinVisits());

    // Without an output path the result goes to stdout
    if (!parameters.has("output")) {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      unvisited.print();
      return;
    }

    // Register the CSV sink, then run the program
    unvisited.writeAsCsv(parameters.get("output"), "\n", "|");
    environment.execute("WebLogAnalysis Example");
  }
  /**
   * Builds the throughput topology and submits it either to a remote cluster or to a local one.
   *
   * <p>The topology is a {@code Generator} spout ("source0"), followed by {@code repartitions - 1}
   * {@code RepartPassThroughBolt} stages ("source1", "source2", ...) and a final {@code Sink} bolt,
   * each stage connected to its predecessor with a fields grouping on {@code "id"}.
   *
   * <p>Parameters read: {@code para}, {@code sourceParallelism}, {@code sinkParallelism} (no
   * default), {@code repartitions} (default 1), {@code name} (default "no_name") and the {@code
   * local} flag. With {@code local} set, the topology runs in a {@link LocalCluster} for a fixed
   * time and the cluster is always shut down afterwards, even if submission fails or the wait is
   * interrupted.
   *
   * @param args command-line arguments parsed with {@link ParameterTool}
   * @throws Exception propagated from parameter parsing, topology submission or the local wait
   */
  public static void main(String[] args) throws Exception {
    ParameterTool pt = ParameterTool.fromArgs(args);

    int par = pt.getInt("para");

    TopologyBuilder builder = new TopologyBuilder();

    builder.setSpout("source0", new Generator(pt), pt.getInt("sourceParallelism"));

    // Loop-invariant parameters are looked up once instead of on every iteration
    final int repartitions = pt.getInt("repartitions", 1);
    final int sinkParallelism = pt.getInt("sinkParallelism");

    // Chain the intermediate repartitioning stages: source0 --> source1 --> ...
    int i = 0;
    for (; i < repartitions - 1; i++) {
      System.out.println("adding source" + i + " --> source" + (i + 1));
      builder
          .setBolt("source" + (i + 1), new RepartPassThroughBolt(pt), sinkParallelism)
          .fieldsGrouping("source" + i, new Fields("id"));
    }
    System.out.println("adding final source" + i + " --> sink");

    // The sink consumes from the last stage created above (source0 when there is no repartition)
    builder
        .setBolt("sink", new Sink(pt), sinkParallelism)
        .fieldsGrouping("source" + i, new Fields("id"));

    Config conf = new Config();
    conf.setDebug(false);

    if (!pt.has("local")) {
      conf.setNumWorkers(par);

      StormSubmitter.submitTopologyWithProgressBar(
          "throughput-" + pt.get("name", "no_name"), conf, builder.createTopology());
    } else {
      conf.setMaxTaskParallelism(par);

      // How long the local cluster is kept running before it is torn down
      final long localRunMillis = 300000;

      LocalCluster cluster = new LocalCluster();
      try {
        cluster.submitTopology("throughput", conf, builder.createTopology());

        Thread.sleep(localRunMillis);
      } finally {
        // Shut down even when submission fails or the sleep is interrupted, so the
        // in-process cluster is not leaked
        cluster.shutdown();
      }
    }
  }