Java JavaPairRDD.count Beispiele

Programmiersprache: Java

Namespace / Paketname: org.apache.spark.api.java

Klasse / Typ: JavaPairRDD

Methode / Funktion: count

Beispiele auf hotexamples.com: 5

Java JavaPairRDD.count - 5 Beispiele gefunden. Dies sind die am besten bewerteten Java Beispiele für die org.apache.spark.api.java.JavaPairRDD.count, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

collect(16)

reduceByKey(9)

mapToPair(8)

filter(5)

count(5)

mapValues(5)

join(4)

zipWithIndex(3)

flatMapToPair(3)

sortByKey(3)

saveAsTextFile(3)

unpersist(2)

groupByKey(2)

map(2)

collectAsMap(2)

flatMap(2)

values(1)

union(1)

toRDD(1)

saveAsNewAPIHadoopFile(1)

saveAsHadoopFile(1)

flatMapValues(1)

saveAsHadoopDataset(1)

rdd(1)

foreach(1)

partitionBy(1)

mapPartitionsToPair(1)

mapPartitions(1)

leftOuterJoin(1)

fromRDD(1)

fromJavaRDD(1)

partitions(1)

Beispiel #1

Datei anzeigen

Datei: JavaTC.java Projekt: ScorpiusAlpha/graphx

  public static void main(String[] args) {
    if (args.length == 0) {
      System.err.println("Usage: JavaTC <host> [<slices>]");
      System.exit(1);
    }

    JavaSparkContext sc =
        new JavaSparkContext(
            args[0],
            "JavaTC",
            System.getenv("SPARK_HOME"),
            JavaSparkContext.jarOfClass(JavaTC.class));
    Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
    JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges =
        tc.mapToPair(
            new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
              @Override
              public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<Integer, Integer>(e._2(), e._1());
              }
            });

    long oldCount = 0;
    long nextCount = tc.count();
    do {
      oldCount = nextCount;
      // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
      // then project the result to obtain the new (x, z) paths.
      tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
      nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    System.exit(0);
  }

Beispiel #2

Datei anzeigen

Datei: LifeCycleAnalyserSpark.java Projekt: antoinebastien/bigdata-event-stream-detection

 /**
  * Class used to analyze themes life cycle.
  *
  * @param hmmInput The hmmInput from which is going to be used the background model, the lexicon
  *     and the wordStream. Themes must be added before any analytics can be done.
  */
 public LifeCycleAnalyserSpark(HmmInputFromParser hmmInput) {
   this.wordStream = hmmInput.wordStream;
   this.lexicon = hmmInput.lexicon;
   this.lexiconAsMap = lexicon.collectAsMap();
   getInvertedLexicon();
   numberOfThemes = 0L;
   numberOfWords = lexicon.count();
   // themes = new ArrayList<double[]>();
   setBackgroundModelAsThemebyId(hmmInput.backgroundModelById);
 }

Beispiel #3

Datei anzeigen

Datei: MainRunner.java Projekt: donpir/JMATRW

  public static void main(String[] args) {
    SparkConf sparkconf =
        new SparkConf()
            .setAppName("Simple Application")
            .setMaster("spark://1.245.77.10:7077")
            .set(
                "spark.driver.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set(
                "spark.executor.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set("fs.default.name", "file:///");
    JavaSparkContext sc = new JavaSparkContext(sparkconf);
    Configuration hadoopConfig = sc.hadoopConfiguration();
    hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

    /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
    System.out.println("Start counting parallelize...");
    long values = matrdd2.count();
    System.out.println("Value count of parallelize is " + values);*/

    JavaPairRDD<Long, Double> matrdd =
        sc.newAPIHadoopFile(
            "e:/tmp/vecRow03_x256.mat",
            JMATFileInputFormat.class,
            Long.class,
            Double.class,
            hadoopConfig);
    System.out.println("Start job...");
    long values = matrdd.count();
    System.out.println("Value count of hadoop is " + values);

    sc.stop();
    sc.close();
  }

Beispiel #4

Datei anzeigen

Datei: SparkModelUtils.java Projekt: phamthaithinh/carbon-ml

  /**
   * A utility method to generate class classification model summary
   *
   * @param predictionsAndLabels Predictions and actual labels
   * @return Class classification model summary
   */
  public static ClassClassificationAndRegressionModelSummary getClassClassificationModelSummary(
      JavaSparkContext sparkContext,
      JavaRDD<LabeledPoint> testingData,
      JavaPairRDD<Double, Double> predictionsAndLabels) {
    ClassClassificationAndRegressionModelSummary classClassificationModelSummary =
        new ClassClassificationAndRegressionModelSummary();
    // store predictions and actuals
    List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>();
    for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) {
      PredictedVsActual predictedVsActual = new PredictedVsActual();
      predictedVsActual.setPredicted(scoreAndLabel._1());
      predictedVsActual.setActual(scoreAndLabel._2());
      predictedVsActuals.add(predictedVsActual);
    }
    // create a list of feature values
    List<double[]> features = new ArrayList<double[]>();
    for (LabeledPoint labeledPoint : testingData.collect()) {
      if (labeledPoint != null && labeledPoint.features() != null) {
        double[] rowFeatures = labeledPoint.features().toArray();
        features.add(rowFeatures);
      }
    }
    // create a list of feature values with predicted vs. actuals
    List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>();
    for (int i = 0; i < features.size(); i++) {
      TestResultDataPoint testResultDataPoint = new TestResultDataPoint();
      testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i));
      testResultDataPoint.setFeatureValues(features.get(i));
      testResultDataPoints.add(testResultDataPoint);
    }
    // covert List to JavaRDD
    JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD =
        sparkContext.parallelize(testResultDataPoints);
    // collect RDD as a sampled list
    List<TestResultDataPoint> testResultDataPointsSample;
    if (testResultDataPointsJavaRDD.count()
        > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) {
      testResultDataPointsSample =
          testResultDataPointsJavaRDD.takeSample(
              true,
              MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize());
    } else {
      testResultDataPointsSample = testResultDataPointsJavaRDD.collect();
    }
    classClassificationModelSummary.setTestResultDataPointsSample(testResultDataPointsSample);
    classClassificationModelSummary.setPredictedVsActuals(predictedVsActuals);
    // calculate test error
    double error =
        1.0
            * predictionsAndLabels
                .filter(
                    new Function<Tuple2<Double, Double>, Boolean>() {
                      private static final long serialVersionUID = -3063364114286182333L;

                      @Override
                      public Boolean call(Tuple2<Double, Double> pl) {
                        return !pl._1().equals(pl._2());
                      }
                    })
                .count()
            / predictionsAndLabels.count();
    classClassificationModelSummary.setError(error);
    return classClassificationModelSummary;
  }

Beispiel #5

Datei anzeigen

Datei: Test.java Projekt: Stronhold/NewsClasifier

  public static void main(String[] args) {

    // Path de resultados
    String pathResults = "results";

    String pathToCategories = "values.txt";
    String pathToWords = "words.txt";
    File file = new File(pathToWords);

    HashMap<Double, String> categoriesDict = new HashMap<>();
    HashMap<String, String> resultado = new HashMap<>();

    FileInputStream fis = null;
    try {
      fis = new FileInputStream(pathToCategories);
      // Construct BufferedReader from InputStreamReader
      BufferedReader br = new BufferedReader(new InputStreamReader(fis));

      String line = null;
      while ((line = br.readLine()) != null) {
        String[] words = line.split(" ");
        categoriesDict.put(Double.valueOf(words[0]), words[1]);
      }
      br.close();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

    // Path donde estaran las categorias
    String pathCategories = "src/main/resources/categoriestest/";

    // Configuracion basica de la aplicacion
    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");

    // Creacion del contexto
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), pathResults);

    HashMap<String, String> dictionary = loadDictionary();

    JavaRDD<String> fileWords = null;

    if (file.exists()) {
      JavaRDD<String> input = jsc.textFile(pathToWords);
      fileWords =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
    } else {
      System.out.println("Error, there is no words");
      System.exit(-1);
    }
    ArrayList<String> aFileWords = (ArrayList<String>) fileWords.collect();

    // Cogemos el fichero en el que se encuentran las categorias
    File dir = new File(pathCategories);
    for (File f : dir.listFiles()) {
      JavaRDD<String> input = jsc.textFile(f.getPath());
      JavaRDD<String> words =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
      JavaPairRDD<String, Double> wordCount = Reducer.parseWords(words, dictionary);
      List<Tuple2<String, Double>> total = wordCount.collect();
      List<Tuple2<String, Double>> elementsRemoved = new ArrayList<>();
      for (Tuple2<String, Double> t : total) {
        if (!t._1.equals("")) {
          elementsRemoved.add(new Tuple2<>(t._1, t._2 / wordCount.count()));
        }
      }
      ArrayList<Tuple2<String, Double>> freqFinal = new ArrayList<>();
      for (String s : aFileWords) {
        boolean found = false;
        for (Tuple2<String, Double> t : elementsRemoved) {
          if (t._1.equals(s)) {
            found = true;
            freqFinal.add(t);
            break;
          }
        }
        if (!found) {
          freqFinal.add(new Tuple2<String, Double>(s, 0.0));
        }
      }
      double[] v = new double[freqFinal.size()];
      for (int i = 0; i < freqFinal.size(); i++) {
        Tuple2<String, Double> t = freqFinal.get(i);
        v[i] = t._2;
      }
      org.apache.spark.mllib.linalg.Vector vector = Vectors.dense(v);
      /**/
      double d = model.predict(vector);
      System.out.println(categoriesDict.get(d));
      resultado.put(f.getName(), categoriesDict.get(d));
    }
    jsc.stop();
    try {
      Thread.sleep(2000);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    for (String key : resultado.keySet()) {
      System.out.println(key + " - " + resultado.get(key));
    }
  }