@SuppressWarnings("serial")
  @Test
  public void testRun() throws Exception {
    int verticesCount = 5000;
    int edgesCount = verticesCount * 2;

    ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
    environment.getConfig().disableSysoutLogging();

    Graph<Long, Long, Long> graph =
        GraphGenerator.generateGraph(verticesCount, edgesCount, environment);

    PCConnectedComponents<Long, Long> algo = new PCConnectedComponents<>(verticesCount);

    List<Tuple2<Long, Long>> result =
        algo.run(graph)
            .map(
                new RichMapFunction<Vertex<Long, Long>, Tuple2<Long, Long>>() {
                  @Override
                  public Tuple2<Long, Long> map(Vertex<Long, Long> value) throws Exception {
                    return new Tuple2<>(value.getId(), value.getValue());
                  }
                })
            .collect();

    ConnectedComponentsData.checkOddEvenResult(result);
  }
Ejemplo n.º 2
0
  public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    System.out.println(
        "Usage: KMeans --points <path> --centroids <path> --output <path> --iterations <n>");

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig()
        .setGlobalJobParameters(params); // make parameters available in the web interface

    // get input data:
    // read the points and centroids from the provided paths or fall back to default data
    DataSet<Point> points = getPointDataSet(params, env);
    DataSet<Centroid> centroids = getCentroidDataSet(params, env);

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10));

    DataSet<Centroid> newCentroids =
        points
            // compute closest centroid for each point
            .map(new SelectNearestCenter())
            .withBroadcastSet(loop, "centroids")
            // count and sum point coordinates for each centroid
            .map(new CountAppender())
            .groupBy(0)
            .reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    DataSet<Tuple2<Integer, Point>> clusteredPoints =
        points
            // assign points to final clusters
            .map(new SelectNearestCenter())
            .withBroadcastSet(finalCentroids, "centroids");

    // emit result
    if (params.has("output")) {
      clusteredPoints.writeAsCsv(params.get("output"), "\n", " ");

      // since file sinks are lazy, we trigger the execution explicitly
      env.execute("KMeans Example");
    } else {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      clusteredPoints.print();
    }
  }
Ejemplo n.º 3
0
  public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    System.out.println(
        "Usage: WebLogAnalysis --documents <path> --ranks <path> --visits <path> --output <path>");

    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents.filter(new FilterDocByKeyWords()).project(0);

    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks.filter(new FilterByRank());

    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits.filter(new FilterVisitsByDate()).project(0);

    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
        filterDocs.join(filterRanks).where(0).equalTo(1).projectSecond(0, 1, 2);

    // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain
    // time
    DataSet<Tuple3<Integer, String, Integer>> result =
        joinDocsRanks.coGroup(filterVisits).where(1).equalTo(0).with(new AntiJoinVisits());

    // emit result
    if (params.has("output")) {
      result.writeAsCsv(params.get("output"), "\n", "|");
      // execute program
      env.execute("WebLogAnalysis Example");
    } else {
      System.out.println("Printing result to stdout. Use --output to specify output path.");
      result.print();
    }
  }
  protected void internalNonBlockingStart() throws IOException {

    Plan plan = env.createProgramPlan();

    Optimizer optimizer =
        new Optimizer(new DataStatistics(), new org.apache.flink.configuration.Configuration());
    OptimizedPlan optimizedPlan = optimizer.compile(plan);

    final JobGraph jobGraph = new JobGraphGenerator().compileJobGraph(optimizedPlan);
    for (String jarPath : classPath) {
      jobGraph.addJar(new Path(jarPath));
    }

    jobID = jobGraph.getJobID();
    accumulatorCache.setJobID(jobID);

    if (isLocalExecution()) {

      flowStep.logInfo("Executing in local mode.");

      startLocalCluster();

      org.apache.flink.configuration.Configuration config =
          new org.apache.flink.configuration.Configuration();
      config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, localCluster.hostname());

      client = new Client(config);
      client.setPrintStatusDuringExecution(env.getConfig().isSysoutLoggingEnabled());

    } else {

      flowStep.logInfo("Executing in cluster mode.");

      try {
        String path =
            this.getClass().getProtectionDomain().getCodeSource().getLocation().toURI().getPath();
        jobGraph.addJar(new Path(path));
        classPath.add(path);
      } catch (URISyntaxException e) {
        throw new IOException("Could not add the submission JAR as a dependency.");
      }

      client = ((ContextEnvironment) env).getClient();
    }

    List<URL> fileList = new ArrayList<URL>(classPath.size());
    for (String path : classPath) {
      URL url;
      try {
        url = new URL(path);
      } catch (MalformedURLException e) {
        url = new URL("file://" + path);
      }
      fileList.add(url);
    }

    final ClassLoader loader =
        JobWithJars.buildUserCodeClassLoader(
            fileList, Collections.<URL>emptyList(), getClass().getClassLoader());

    accumulatorCache.setClient(client);

    final Callable<JobSubmissionResult> callable =
        new Callable<JobSubmissionResult>() {
          @Override
          public JobSubmissionResult call() throws Exception {
            return client.runBlocking(jobGraph, loader);
          }
        };

    jobSubmission = executorService.submit(callable);

    flowStep.logInfo("submitted Flink job: " + jobID);
  }