public static void main(String[] args) {
  if (args.length == 0) {
    System.err.println("Usage: JavaTC <host> [<slices>]");
    System.exit(1);
  }

  JavaSparkContext sc = new JavaSparkContext(
      args[0], "JavaTC", System.getenv("SPARK_HOME"),
      JavaSparkContext.jarOfClass(JavaTC.class));
  Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
  JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
      new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
          return new Tuple2<Integer, Integer>(e._2(), e._1());
        }
      });

  long oldCount = 0;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  System.exit(0);
}
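// The transitive-closure main method above relies on a generateGraph() helper and a
// ProjectFn singleton that are not shown in this listing. The sketch below is one
// plausible shape for them (the random seed, edge count and vertex count are
// assumptions, not the original values); it uses java.util collections, scala.Tuple2
// and org.apache.spark.api.java.function.PairFunction. generateGraph() builds a random
// edge list, and ProjectFn turns a joined (y, (z, x)) pair into the new (x, z) path.
static List<Tuple2<Integer, Integer>> generateGraph() {
  Random rand = new Random(42);
  int numEdges = 200;
  int numVertices = 100;
  Set<Tuple2<Integer, Integer>> edges = new HashSet<Tuple2<Integer, Integer>>(numEdges);
  while (edges.size() < numEdges) {
    int from = rand.nextInt(numVertices);
    int to = rand.nextInt(numVertices);
    if (from != to) {
      edges.add(new Tuple2<Integer, Integer>(from, to));
    }
  }
  return new ArrayList<Tuple2<Integer, Integer>>(edges);
}

static final class ProjectFn
    implements PairFunction<Tuple2<Integer, Tuple2<Integer, Integer>>, Integer, Integer> {
  static final ProjectFn INSTANCE = new ProjectFn();

  @Override
  public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, Integer>> triple) {
    // Input is (y, (z, x)); project to the new path (x, z).
    return new Tuple2<Integer, Integer>(triple._2()._2(), triple._2()._1());
  }
}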
@Override
public void publishAdditionalModelData(
    JavaSparkContext sparkContext,
    PMML pmml,
    JavaRDD<String> newData,
    JavaRDD<String> pastData,
    Path modelParentPath,
    QueueProducer<String, String> modelUpdateQueue) {

  JavaRDD<String> allData = pastData == null ? newData : newData.union(pastData);

  log.info("Sending user / X data as model updates");
  String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<Integer, double[]> userRDD =
      fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString)));

  if (noKnownItems) {
    userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateQueue));
  } else {
    log.info("Sending known item data with model updates");
    JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true);
    userRDD
        .join(knownItems)
        .foreach(new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateQueue));
  }

  log.info("Sending item / Y data as model updates");
  String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<Integer, double[]> productRDD =
      fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString)));

  // For now, there is no use in sending known users for each item
  // if (noKnownItems) {
  productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateQueue));
  // } else {
  //   log.info("Sending known user data with model updates");
  //   JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false);
  //   productRDD.join(knownUsers).foreach(
  //       new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateQueue));
  // }
}
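// The fromRDD(...) calls above assume a small adapter that wraps the Scala RDD returned by
// readFeaturesRDD into a JavaPairRDD. A minimal sketch, assuming readFeaturesRDD returns an
// org.apache.spark.rdd.RDD<Tuple2<Integer, double[]>>; the helper name matches the usage
// above, but the body is a guess rather than the original implementation.
private static JavaPairRDD<Integer, double[]> fromRDD(RDD<Tuple2<Integer, double[]>> rdd) {
  return JavaPairRDD.fromRDD(
      rdd,
      scala.reflect.ClassTag$.MODULE$.apply(Integer.class),
      scala.reflect.ClassTag$.MODULE$.apply(double[].class));
}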
public static void main(String[] args) throws IOException {
  Parameters param = new Parameters();
  long initTime = System.currentTimeMillis();

  SparkConf conf = new SparkConf().setAppName("StarJoin");
  // Kryo must be configured on the SparkConf before the context is created,
  // otherwise the settings have no effect.
  if (param.useKryo) {
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
    conf.set("spark.kryoserializer.buffer.mb", param.buffer);
  }
  JavaSparkContext sc = new JavaSparkContext(conf);

  MyBloomFilter.BloomFilter<String> BFS =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsS, param.hashes);
  MyBloomFilter.BloomFilter<String> BFD =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsD, param.hashes);
  MyBloomFilter.BloomFilter<String> BFC =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsC, param.hashes);

  JavaPairRDD<String, String> supps = sc.textFile(param.suppPath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[3]);
            }
          });

  List<Tuple2<String, String>> s = supps.collect();
  for (int i = 0; i < s.size(); i++) {
    BFS.add(s.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

  JavaPairRDD<String, String> custs = sc.textFile(param.custPath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[3]);
            }
          });

  List<Tuple2<String, String>> c = custs.collect();
  for (int i = 0; i < c.size(); i++) {
    BFC.add(c.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

  JavaPairRDD<String, String> dates = sc.textFile(param.datePath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[6].equals("Dec1997");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[4]);
            }
          });

  List<Tuple2<String, String>> d = dates.collect();
  for (int i = 0; i < d.size(); i++) {
    BFD.add(d.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

  JavaPairRDD<String, String[]> lines = sc.textFile(param.linePath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return varC.value().contains(s[2].getBytes())
                  && varS.value().contains(s[4].getBytes())
                  && varD.value().contains(s[5].getBytes());
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String[]>() {
            public Tuple2<String, String[]> call(String[] s) {
              String[] v = {s[2], s[5], s[12]};
              return new Tuple2<String, String[]>(s[4], v);
            }
          });

  JavaPairRDD<String, String[]> result = lines
      .join(supps)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
            public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
              String[] v = {s._2._1[1], s._2._1[2], s._2._2};
              return new Tuple2<String, String[]>(s._2._1[0], v);
            }
          });

  result = result
      .join(custs)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
            public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
              String[] v = {s._2._1[1], s._2._1[2], s._2._2};
              return new Tuple2<String, String[]>(s._2._1[0], v);
            }
          });

  JavaPairRDD<String, Long> final_result = result
      .join(dates)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
            public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
              return new Tuple2<String, Long>(
                  s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
            }
          })
      .reduceByKey(
          new Function2<Long, Long, Long>() {
            public Long call(Long i1, Long i2) {
              return i1 + i2;
            }
          });

  JavaPairRDD<String, String> sub_result = final_result.mapToPair(
      new PairFunction<Tuple2<String, Long>, String, String>() {
        public Tuple2<String, String> call(Tuple2<String, Long> line) {
          return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
        }
      });

  final_result = sub_result
      .sortByKey(new Q3Comparator())
      .mapToPair(
          new PairFunction<Tuple2<String, String>, String, Long>() {
            public Tuple2<String, Long> call(Tuple2<String, String> line) {
              String[] s = line._1.split(",");
              return new Tuple2<String, Long>(
                  s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
            }
          });

  Configuration HDFSconf = new Configuration();
  FileSystem fs = FileSystem.get(HDFSconf);
  fs.delete(new Path(param.output), true);

  final_result.saveAsTextFile(param.output);

  long finalTime = System.currentTimeMillis();
  System.out.print("Total time (ms): ");
  System.out.println(finalTime - initTime);
  sc.close();
}
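// The star-join above also references a Parameters holder and a Q3Comparator that are not part
// of this listing. The sketch below shows one plausible shape for them; the field set is taken
// from the usage above, but how the values are populated, and the ordering used by the
// comparator (revenue descending, which is what SSB Q3 asks for), are assumptions.
static class Parameters implements Serializable {
  // Input locations and tuning knobs; in the original these are presumably read
  // from a config file or the command line.
  String suppPath, custPath, datePath, linePath, output;
  boolean useKryo;
  String buffer;            // Kryo serializer buffer size in MB
  int hashes;               // number of hash functions per Bloom filter
  int bitsS, bitsD, bitsC;  // Bloom filter sizes for supplier, date, customer keys
}

static class Q3Comparator implements Comparator<String>, Serializable {
  @Override
  public int compare(String a, String b) {
    // Keys look like "c_city,s_city,d_yearmonth,revenue"; order by revenue, highest first.
    long revA = Long.parseLong(a.substring(a.lastIndexOf(',') + 1));
    long revB = Long.parseLong(b.substring(b.lastIndexOf(',') + 1));
    return Long.compare(revB, revA);
  }
}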
/**
 * Produce, train and decode the Hmm. Must be done after adding themes and before calculating
 * any strengths.
 *
 * @param sc the spark context
 * @param piThreshold threshold on pi
 * @param aaThreshold threshold on a
 * @param maxIterations max number of iterations
 */
public void analyse(
    JavaSparkContext sc, double piThreshold, double aaThreshold, int maxIterations) {
  int numberHiddenStates = (int) (numberOfThemes + 1);
  int numberObservableOutputSymbols = (int) numberOfWords;

  // Setting up the initial state probability distribution.
  double[] pi = new double[numberHiddenStates];
  double initialStateDistribution = 1.0 / numberHiddenStates;
  for (int i = 0; i < numberHiddenStates; i++) {
    pi[i] = initialStateDistribution;
  }

  // Setting up the state transition probability distribution
  // (a square matrix over the hidden states).
  double[][] stateTransitionProbabilityDistribution =
      new double[numberHiddenStates][numberHiddenStates];
  double halfInitialStateDistribution = initialStateDistribution / 2.0;
  for (int i = 0; i < numberHiddenStates; i++) {
    for (int j = 0; j < numberHiddenStates; j++) {
      if (i == 0 && j == 0) {
        // .5 chance to stay in the background model
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else if (i == 0) {
        // equal chance to go from the background model to any other state
        stateTransitionProbabilityDistribution[i][j] = halfInitialStateDistribution;
      } else if (j == 0) {
        // .5 chance to return to the background model
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else if (i == j) {
        // .5 chance to stay in the same state
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else {
        // no chance to jump between two different theme states
        stateTransitionProbabilityDistribution[i][j] = 0.0;
      }
    }
  }

  if (outputProbabilityDistribution == null) {
    System.out.println(
        "error : you need to specify the themes via addAllThemesFromRDD"
            + " before analyzing the sequence!");
    return;
  }

  // Setting the output probability distribution: state 0 is the background model.
  outputProbabilityDistribution[0] = bgAsArray;

  // Setting up and training the hmm.
  hmm = new Hmm2(
      numberHiddenStates,
      numberObservableOutputSymbols,
      pi,
      stateTransitionProbabilityDistribution,
      outputProbabilityDistribution);

  JavaPairRDD<Tuple2<Long, Long>, Long> wordStreamZippedWithIndex = wordStream.zipWithIndex();
  JavaRDD<Tuple2<Integer, Integer>> observedSequenceRdd = wordStreamZippedWithIndex.map(
      new Function<Tuple2<Tuple2<Long, Long>, Long>, Tuple2<Integer, Integer>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Tuple2<Long, Long>, Long> wordEntry)
            throws Exception {
          return new Tuple2<Integer, Integer>(
              wordEntry._2.intValue(), wordEntry._1._1.intValue());
        }
      });

  System.out.println("observedSequenceRdd length : " + observedSequenceRdd.count());
  System.out.println(
      "observedSequenceRdd : "
          + Arrays.toString(Arrays.copyOf(observedSequenceRdd.collect().toArray(), 50)));

  hmm.rawTrain(sc, observedSequenceRdd, piThreshold, aaThreshold, maxIterations);
  mostLikelySequenceThemeShifts = hmm.decode(sc, observedSequenceRdd);

  JavaPairRDD<Integer, Tuple2<Long, Long>> wordStreamZippedWithIndexReversed =
      wordStreamZippedWithIndex.mapToPair(
          new PairFunction<Tuple2<Tuple2<Long, Long>, Long>, Integer, Tuple2<Long, Long>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, Tuple2<Long, Long>> call(
                Tuple2<Tuple2<Long, Long>, Long> arg0) throws Exception {
              return new Tuple2<Integer, Tuple2<Long, Long>>(arg0._2.intValue(), arg0._1);
            }
          });

  JavaPairRDD<Integer, Tuple2<Integer, Tuple2<Long, Long>>> zippedDecodedSequence =
      mostLikelySequenceThemeShifts.join(wordStreamZippedWithIndexReversed);

  JavaRDD<Tuple2<Long, Integer>> nonZeroMostLikelyByTimestamp = zippedDecodedSequence.flatMap(
      new FlatMapFunction<
          Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>>, Tuple2<Long, Integer>>() {

        @Override
        public Iterable<Tuple2<Long, Integer>> call(
            Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>> arg0) throws Exception {
          ArrayList<Tuple2<Long, Integer>> list = new ArrayList<Tuple2<Long, Integer>>(1);
          if (arg0._2._1 != 0) {
            list.add(new Tuple2<Long, Integer>(arg0._2._2._2, arg0._2._1));
          }
          return list;
        }
      });

  JavaPairRDD<Long, Iterable<Tuple2<Long, Integer>>> groupedRdd =
      nonZeroMostLikelyByTimestamp.groupBy(
          new Function<Tuple2<Long, Integer>, Long>() {
            @Override
            public Long call(Tuple2<Long, Integer> arg0) throws Exception {
              return arg0._1;
            }
          });

  List<Long> timestampsList = groupedRdd.keys().collect();
  Collections.sort(timestampsList);

  JavaPairRDD<Long, Map<Integer, Integer>> resultByTimestamp = groupedRdd.mapValues(
      new Function<Iterable<Tuple2<Long, Integer>>, Map<Integer, Integer>>() {
        @Override
        public Map<Integer, Integer> call(Iterable<Tuple2<Long, Integer>> arg0)
            throws Exception {
          Map<Integer, Integer> countMap = new HashMap<Integer, Integer>();
          for (Tuple2<Long, Integer> tuple : arg0) {
            if (countMap.containsKey(tuple._2)) {
              countMap.put(tuple._2, countMap.get(tuple._2) + 1);
            } else {
              countMap.put(tuple._2, 1);
            }
          }
          return countMap;
        }
      });

  List<Tuple2<Long, Map<Integer, Integer>>> collectedResults = resultByTimestamp.collect();
  Collections.sort(
      collectedResults,
      new Comparator<Tuple2<Long, Map<Integer, Integer>>>() {
        @Override
        public int compare(
            Tuple2<Long, Map<Integer, Integer>> index1,
            Tuple2<Long, Map<Integer, Integer>> index2) {
          return index1._1.compareTo(index2._1);
        }
      });

  // Printing in the appropriate format for the csv file.
  for (int themeIndex = 1; themeIndex < numberOfThemes; themeIndex++) {
    System.out.println("Theme " + (themeIndex - 1));
    Iterator<Tuple2<Long, Map<Integer, Integer>>> resultsIterator = collectedResults.iterator();
    Iterator<Long> timestampsIterator = timestampsList.iterator();
    Tuple2<Long, Map<Integer, Integer>> currentTuple = resultsIterator.next();
    while (timestampsIterator.hasNext()) {
      long timestamp = timestampsIterator.next();
      if (currentTuple != null && timestamp == currentTuple._1) {
        int strength =
            currentTuple._2().get(themeIndex) == null ? 0 : currentTuple._2().get(themeIndex);
        System.out.println(timestamp + "," + strength);
        if (resultsIterator.hasNext()) {
          currentTuple = resultsIterator.next();
        } else {
          currentTuple = null;
        }
      } else {
        System.out.println(timestamp + ",0");
      }
    }
  }

  // Old way to do it.
  int timeDuration = collectedResults.size();
  int[][] themesStrengths = new int[(int) numberOfThemes][timeDuration];
  for (int timeIndex = 0; timeIndex < timeDuration; timeIndex++) {
    Tuple2<Long, Map<Integer, Integer>> tuple = collectedResults.get(timeIndex);
    for (Entry<Integer, Integer> entry : tuple._2.entrySet()) {
      themesStrengths[entry.getKey() - 1][timeIndex] = entry.getValue();
    }
  }

  System.out.println("");
  for (int i = 0; i < numberOfThemes; i++) {
    System.out.print("themeStrength_" + i + " = [");
    for (int j = 0; j < timeDuration; j++) {
      System.out.print(" " + themesStrengths[i][j]);
    }
    System.out.println("];");
  }
}
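// The printing loop above emits each theme's "timestamp,strength" series to stdout in CSV
// form. A small hypothetical helper like the one below could write one such series straight
// to a file instead; the method name, parameters and one-file-per-theme layout are
// assumptions, not part of the original class. Requires java.io.File, java.io.FileWriter,
// java.io.PrintWriter and java.io.IOException.
private static void writeThemeStrengthCsv(
    File outputFile, List<Long> timestamps, Map<Long, Integer> strengthByTimestamp)
    throws IOException {
  PrintWriter writer = new PrintWriter(new FileWriter(outputFile));
  try {
    writer.println("timestamp,strength");
    for (Long timestamp : timestamps) {
      Integer strength = strengthByTimestamp.get(timestamp);
      // Timestamps with no decoded theme occurrence get a strength of 0,
      // mirroring the "timestamp,0" branch of the printing loop above.
      writer.println(timestamp + "," + (strength == null ? 0 : strength));
    }
  } finally {
    writer.close();
  }
}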