Java JavaPairRDD.collectAsMap示例

  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: Main <file>");
      System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaPairRDD<String, Double> dayArrivalDelayPair =
        lines.flatMapToPair(
            line -> {
              String[] splitLine = line.split(SPLIT_PATTERN);
              String key = splitLine.length == 0 ? "" : splitLine[0];
              Double value = splitLine.length < 2 ? value = 0.0 : Double.valueOf(splitLine[1]);
              return Arrays.asList(new Tuple2<>(key, value));
            });

    JavaPairRDD<String, AverageWrapper> dayAverageWrapper =
        dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1));

    JavaPairRDD<String, AverageWrapper> daysValueCount =
        dayAverageWrapper.reduceByKey(
            (aw1, aw2) ->
                new AverageWrapper(
                    aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount()));

    Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap();
    List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>();
    listResults.addAll(resultMap.entrySet());
    Collections.sort(
        listResults,
        (entry1, entry2) ->
            Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue()));

    for (Map.Entry<String, AverageWrapper> entry : listResults) {
      System.out.printf(
          "%s -> (%f, %d)\n",
          entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount());
    }

    //        JavaPairRDD<String, Double> resultRDD =
    //                daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() /
    // averageWrapper.getCount());
    //
    //        Map<String, Double> results = resultRDD.collectAsMap();

    //        List<Map.Entry<String, Double>> listResults = new ArrayList<>();
    //        listResults.addAll(results.entrySet());
    //        Collections.sort(listResults, (entry1, entry2) ->
    // entry1.getValue().compareTo(entry2.getValue()));
    //
    //        for (Map.Entry<String, Double> entry : listResults) {
    //            System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue());
    //        }
  }

示例#2

显示文件

文件： LifeCycleAnalyserSpark.java 项目： antoinebastien/bigdata-event-stream-detection

 /**
  * Class used to analyze themes life cycle.
  *
  * @param hmmInput The hmmInput from which is going to be used the background model, the lexicon
  *     and the wordStream. Themes must be added before any analytics can be done.
  */
 public LifeCycleAnalyserSpark(HmmInputFromParser hmmInput) {
   this.wordStream = hmmInput.wordStream;
   this.lexicon = hmmInput.lexicon;
   this.lexiconAsMap = lexicon.collectAsMap();
   getInvertedLexicon();
   numberOfThemes = 0L;
   numberOfWords = lexicon.count();
   // themes = new ArrayList<double[]>();
   setBackgroundModelAsThemebyId(hmmInput.backgroundModelById);
 }