@SuppressWarnings("unchecked")
 @Test
 public void testFoldLeftValueComparator() {
   List<Tuple2<Integer, TimeValue>> pairs =
       Lists.newArrayList(
           tuple2(5, new TimeValue(2, 0.5)),
           tuple2(1, new TimeValue(1, 1.2)),
           tuple2(5, new TimeValue(1, 1.0)),
           tuple2(1, new TimeValue(2, 2.0)),
           tuple2(1, new TimeValue(3, 3.0)));
   JavaPairRDD<Integer, TimeValue> p = jsc().parallelizePairs(pairs);
   GroupSorted<Integer, TimeValue> gs =
       new GroupSorted(p, new HashPartitioner(2), new TimeValueComparator());
   JavaPairRDD<Integer, Double> emas =
       gs.foldLeftByKey(
           0.0,
           new Function2<Double, TimeValue, Double>() {
             public Double call(Double acc, TimeValue tv) {
               return 0.8 * acc + 0.2 * tv.getValue();
             }
           });
   System.out.println(ImmutableSet.copyOf(emas.collect()));
   Assert.assertTrue(
       ImmutableSet.copyOf(emas.collect())
           .equals(ImmutableSet.of(tuple2(1, 1.0736), tuple2(5, 0.26))));
 }
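A minimal sketch (not part of the source) of the TimeValue and TimeValueComparator helpers the test above assumes: ordering each key's values by their time field is what makes the EMA fold deterministic.

class TimeValue implements java.io.Serializable {
  private final long time;
  private final double value;

  TimeValue(long time, double value) {
    this.time = time;
    this.value = value;
  }

  long getTime() {
    return time;
  }

  double getValue() {
    return value;
  }
}

class TimeValueComparator implements java.util.Comparator<TimeValue>, java.io.Serializable {
  @Override
  public int compare(TimeValue a, TimeValue b) {
    // ascending by time, so the fold sees each key's values chronologically
    return Long.compare(a.getTime(), b.getTime());
  }
}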
Example #2
 @SuppressWarnings("serial")
 @Override
 public SortedCounts<String> execute(final JavaSparkContext spark) {
   final JavaRDD<String> textFile = spark.textFile(inputFile);
   final JavaRDD<String> words =
       textFile.flatMap(
           new FlatMapFunction<String, String>() {
             @Override
             public Iterable<String> call(final String rawJSON) throws TwitterException {
               final Status tweet = TwitterObjectFactory.createStatus(rawJSON);
               String text = tweet.getText();
               return Arrays.asList(text.split(" "));
             }
           });
   final JavaPairRDD<String, Integer> pairs =
       words.mapToPair(
           new PairFunction<String, String, Integer>() {
             @Override
             public Tuple2<String, Integer> call(final String s) {
               return new Tuple2<String, Integer>(s.toLowerCase(), 1);
             }
           });
   final JavaPairRDD<String, Integer> counts =
       pairs.reduceByKey(
           new Function2<Integer, Integer, Integer>() {
             @Override
             public Integer call(final Integer a, final Integer b) {
               return a + b;
             }
           });
   return SortedCounts.create(counts);
 }
  /**
   * This method builds a decision tree model
   *
   * @param sparkContext JavaSparkContext initialized with the application
   * @param modelID Model ID
   * @param trainingData Training data as a JavaRDD of LabeledPoints
   * @param testingData Testing data as a JavaRDD of LabeledPoints
   * @param workflow Machine learning workflow
   * @param mlModel Deployable machine learning model
   * @param includedFeatures Features included in the model, keyed by feature index
   * @param categoricalFeatureInfo Categorical feature indices mapped to their number of categories
   * @return Summary of the built decision tree model
   * @throws MLModelBuilderException
   */
  private ModelSummary buildDecisionTreeModel(
      JavaSparkContext sparkContext,
      long modelID,
      JavaRDD<LabeledPoint> trainingData,
      JavaRDD<LabeledPoint> testingData,
      Workflow workflow,
      MLModel mlModel,
      SortedMap<Integer, String> includedFeatures,
      Map<Integer, Integer> categoricalFeatureInfo)
      throws MLModelBuilderException {
    try {
      Map<String, String> hyperParameters = workflow.getHyperParameters();
      DecisionTree decisionTree = new DecisionTree();
      DecisionTreeModel decisionTreeModel =
          decisionTree.train(
              trainingData,
              getNoOfClasses(mlModel),
              categoricalFeatureInfo,
              hyperParameters.get(MLConstants.IMPURITY),
              Integer.parseInt(hyperParameters.get(MLConstants.MAX_DEPTH)),
              Integer.parseInt(hyperParameters.get(MLConstants.MAX_BINS)));

      // remove from cache
      trainingData.unpersist();
      // add test data to cache
      testingData.cache();

      JavaPairRDD<Double, Double> predictionsAndLabels =
          decisionTree.test(decisionTreeModel, testingData).cache();
      ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary =
          SparkModelUtils.getClassClassificationModelSummary(
              sparkContext, testingData, predictionsAndLabels);

      // remove from cache
      testingData.unpersist();

      mlModel.setModel(new MLDecisionTreeModel(decisionTreeModel));

      classClassificationAndRegressionModelSummary.setFeatures(
          includedFeatures.values().toArray(new String[0]));
      classClassificationAndRegressionModelSummary.setAlgorithm(
          SUPERVISED_ALGORITHM.DECISION_TREE.toString());

      MulticlassMetrics multiclassMetrics =
          getMulticlassMetrics(sparkContext, predictionsAndLabels);

      predictionsAndLabels.unpersist();

      classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix(
          getMulticlassConfusionMatrix(multiclassMetrics, mlModel));
      Double modelAccuracy = getModelAccuracy(multiclassMetrics);
      classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy);
      classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion());

      return classClassificationAndRegressionModelSummary;
    } catch (Exception e) {
      throw new MLModelBuilderException(
          "An error occurred while building decision tree model: " + e.getMessage(), e);
    }
  }
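For illustration only, a hedged sketch of the hyperparameter map this builder reads: the keys are the MLConstants lookups used above, while the values ("gini", "5", "32") are typical MLlib choices and are not taken from the source.

// Hypothetical hyperparameters for the decision tree builder above (illustrative values only).
Map<String, String> hyperParameters = new HashMap<>();
hyperParameters.put(MLConstants.IMPURITY, "gini"); // MLlib also accepts "entropy"
hyperParameters.put(MLConstants.MAX_DEPTH, "5");
hyperParameters.put(MLConstants.MAX_BINS, "32");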
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get rdd and broadcast inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> inX =
        sec.getBinaryBlockRDDHandleForVariable(_input1.getName());
    PartitionedBroadcastMatrix inV = sec.getBroadcastForVariable(_input2.getName());

    // execute mapmmchain (guaranteed to have single output block)
    MatrixBlock out = null;
    if (_chainType == ChainType.XtXv) {
      RDDMapMMChainFunction fmmc = new RDDMapMMChainFunction(inV);
      JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapValues(fmmc);
      out = RDDAggregateUtils.sumStable(tmp);
    } else { // ChainType.XtwXv
      PartitionedBroadcastMatrix inW = sec.getBroadcastForVariable(_input3.getName());
      RDDMapMMChainFunction2 fmmc = new RDDMapMMChainFunction2(inV, inW);
      JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapToPair(fmmc);
      out = RDDAggregateUtils.sumStable(tmp);
    }

    // put output block into symbol table (no lineage because single block)
    // this also includes implicit maintenance of matrix characteristics
    sec.setMatrixOutput(_output.getName(), out);
  }
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    String rddVar = (_type == CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type == CacheType.LEFT) ? input1.getName() : input2.getName();
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(output.getName());
    long rlen =
        sec.getScalarInput(_nrow.getName(), _nrow.getValueType(), _nrow.isLiteral()).getLongValue();

    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    PartitionedBroadcastMatrix in2 = sec.getBroadcastForVariable(bcastVar);

    // execute pmm instruction
    JavaPairRDD<MatrixIndexes, MatrixBlock> out =
        in1.flatMapToPair(new RDDPMMFunction(_type, in2, rlen, mc.getRowsPerBlock()));
    out = RDDAggregateUtils.sumByKeyStable(out);

    // put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);

    // update output statistics if not inferred
    updateBinaryMMOutputMatrixCharacteristics(sec, false);
  }
 @Test
 public void wrapPairRDDFakeCtTest() {
   JavaInteropTestHelper helper = new JavaInteropTestHelper(sc());
   JavaInterop ji = new JavaInterop();
   RDD<Tuple2<String, Object>> rdd = helper.generateMiniPairRDD();
   JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd);
   List<Tuple2<String, Long>> expected = Arrays.asList(new Tuple2<String, Long>("panda", 12L));
   assertEquals(expected, prdd.collect());
 }
  /**
   * @param pfid
   * @param itervar
   * @param matrixvar
   * @param program
   * @param resultFile
   * @param input
   * @param ec
   * @param dpf
   * @param oi
   * @param tSparseCol
   * @param enableCPCaching
   * @param numReducers
   * @return
   * @throws DMLRuntimeException
   * @throws DMLUnsupportedOperationException
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String itervar,
      String matrixvar,
      String program,
      String resultFile,
      MatrixObject input,
      ExecutionContext ec,
      PDataPartitionFormat dpf,
      OutputInfo oi,
      boolean tSparseCol, // config params
      boolean enableCPCaching,
      int numReducers) // opt params
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    String jobname = "ParFor-DPESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();

    // prepare input parameters
    MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
    MatrixCharacteristics mc = md.getMatrixCharacteristics();
    InputInfo ii = InputInfo.BinaryBlockInputInfo;

    // initialize accumulators for tasks/iterations
    Accumulator<Integer> aTasks = sc.accumulator(0);
    Accumulator<Integer> aIters = sc.accumulator(0);

    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
    RemoteDPParForSparkWorker efun =
        new RemoteDPParForSparkWorker(
            program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
    List<Tuple2<Long, String>> out =
        in.flatMapToPair(dpfun) // partition the input blocks
            .groupByKey(numReducers) // group partition blocks 		
            .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup
            .collect(); // get output handles

    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    int numTasks = aTasks.value(); // get accumulator value
    int numIters = aIters.value(); // get accumulator value

    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }

    return ret;
  }
Example #8
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite");

    JavaRDD<String> lines = sc.textFile("log.txt");
    JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")));
    JavaPairRDD<String, Integer> counts =
        words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y);

    counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2()));
  }
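This lambda compiles against the Spark 1.x Java API, where FlatMapFunction returns an Iterable. On Spark 2.x, FlatMapFunction.call returns an Iterator (as the later examples in this listing do), so the equivalent line would be:

// Spark 2.x variant: flatMap expects an Iterator rather than an Iterable
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());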
 /**
  * Constructs the analyser used to analyze theme life cycles.
  *
  * @param hmmInput The HmmInputFromParser providing the background model, the lexicon and the
  *     wordStream. Themes must be added before any analytics can be done.
  */
 public LifeCycleAnalyserSpark(HmmInputFromParser hmmInput) {
   this.wordStream = hmmInput.wordStream;
   this.lexicon = hmmInput.lexicon;
   this.lexiconAsMap = lexicon.collectAsMap();
   getInvertedLexicon();
   numberOfThemes = 0L;
   numberOfWords = lexicon.count();
   // themes = new ArrayList<double[]>();
   setBackgroundModelAsThemebyId(hmmInput.backgroundModelById);
 }
  /**
   * This method builds a Naive Bayes model
   *
   * @param sparkContext JavaSparkContext initialized with the application
   * @param modelID Model ID
   * @param trainingData Training data as a JavaRDD of LabeledPoints
   * @param testingData Testing data as a JavaRDD of LabeledPoints
   * @param workflow Machine learning workflow
   * @param mlModel Deployable machine learning model
   * @param includedFeatures Features included in the model, keyed by feature index
   * @return Summary of the built Naive Bayes model
   * @throws MLModelBuilderException
   */
  private ModelSummary buildNaiveBayesModel(
      JavaSparkContext sparkContext,
      long modelID,
      JavaRDD<LabeledPoint> trainingData,
      JavaRDD<LabeledPoint> testingData,
      Workflow workflow,
      MLModel mlModel,
      SortedMap<Integer, String> includedFeatures)
      throws MLModelBuilderException {
    try {
      Map<String, String> hyperParameters = workflow.getHyperParameters();
      NaiveBayesClassifier naiveBayesClassifier = new NaiveBayesClassifier();
      NaiveBayesModel naiveBayesModel =
          naiveBayesClassifier.train(
              trainingData, Double.parseDouble(hyperParameters.get(MLConstants.LAMBDA)));

      // remove from cache
      trainingData.unpersist();
      // add test data to cache
      testingData.cache();

      JavaPairRDD<Double, Double> predictionsAndLabels =
          naiveBayesClassifier.test(naiveBayesModel, testingData).cache();
      ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary =
          SparkModelUtils.getClassClassificationModelSummary(
              sparkContext, testingData, predictionsAndLabels);

      // remove from cache
      testingData.unpersist();

      mlModel.setModel(new MLClassificationModel(naiveBayesModel));

      classClassificationAndRegressionModelSummary.setFeatures(
          includedFeatures.values().toArray(new String[0]));
      classClassificationAndRegressionModelSummary.setAlgorithm(
          SUPERVISED_ALGORITHM.NAIVE_BAYES.toString());

      MulticlassMetrics multiclassMetrics =
          getMulticlassMetrics(sparkContext, predictionsAndLabels);

      predictionsAndLabels.unpersist();

      classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix(
          getMulticlassConfusionMatrix(multiclassMetrics, mlModel));
      Double modelAccuracy = getModelAccuracy(multiclassMetrics);
      classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy);
      classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion());

      return classClassificationAndRegressionModelSummary;
    } catch (Exception e) {
      throw new MLModelBuilderException(
          "An error occurred while building naive bayes model: " + e.getMessage(), e);
    }
  }
 @SuppressWarnings("unchecked")
 @Test
 public void testGroupSortNoEffect() {
   List<Tuple2<Integer, Integer>> pairs =
       Lists.newArrayList(tuple2(1, 2), tuple2(2, 3), tuple2(1, 3), tuple2(3, 1), tuple2(2, 1));
   JavaPairRDD<Integer, Integer> p = jsc().parallelizePairs(pairs);
   GroupSorted<Integer, Integer> gs =
       new GroupSorted(p, new HashPartitioner(2), Ordering.natural());
   GroupSorted<Integer, Integer> gs1 =
       new GroupSorted(gs, new HashPartitioner(2), Ordering.natural());
   Assert.assertTrue(JavaPairRDD.toRDD(gs) == JavaPairRDD.toRDD(gs1));
 }
Example #12
  public static JavaRDD<IAtomContainer> sdfFilesToMols(String path, JavaSparkContext ctx) {
    JavaPairRDD<String, String> sdfFiles = ctx.wholeTextFiles(path);

    FlatMapFunction<Tuple2<String, String>, IAtomContainer> sdfBlockBuilder =
        new FlatMapFunction<Tuple2<String, String>, IAtomContainer>() {

          public Iterable<IAtomContainer> call(Tuple2<String, String> sdfFile) throws Exception {
            return SDFUtils.parseSDF(sdfFile._2());
          }
        };

    JavaRDD<IAtomContainer> molecules = sdfFiles.flatMap(sdfBlockBuilder);

    return molecules;
  }
 private void setBackgroundModelAsThemebyId(JavaPairRDD<Long, Double> backgroundModelById) {
   List<Tuple2<Long, Double>> bgCollected = backgroundModelById.collect();
   bgAsArray = new double[(int) numberOfWords];
   for (Tuple2<Long, Double> tuple : bgCollected) {
     bgAsArray[tuple._1.intValue()] = tuple._2;
   }
 }
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: JavaWordCount <master> <file>");
      System.exit(1);
    }

    JavaSparkContext ctx =
        new JavaSparkContext(
            args[0],
            "JavaWordCount",
            System.getenv("SPARK_HOME"),
            JavaSparkContext.jarOfClass(JavaWordCount.class));
    JavaRDD<String> lines = ctx.textFile(args[1], 1);

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) {
                return Arrays.asList(SPACE.split(s));
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1 + ": " + tuple._2);
    }
    System.exit(0);
  }
Example #15
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      System.err.println("Usage: JavaWordCount <input_file> <output_file>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String s) {
                return Arrays.asList(SPACE.split(s)).iterator();
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    /*
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?,?> tuple : output) {
      System.out.println(tuple._1() + ": " + tuple._2());
    }
    */
    counts.saveAsTextFile(args[1]);
    ctx.stop();
  }
  @Test
  @SuppressWarnings("unchecked")
  public void testJavaPairRDDFunctions() throws Exception {
    JavaPairRDD<String, Integer> mockPairRDD = mock(JavaPairRDD.class);
    RDD<Tuple2<String, Integer>> mockTuple2RDD = mock(RDD.class);
    when(mockPairRDD.rdd()).thenReturn(mockTuple2RDD);
    GemFireJavaPairRDDFunctions wrapper = javaFunctions(mockPairRDD);
    assertTrue(mockTuple2RDD == wrapper.rddf.rdd());

    Tuple3<SparkContext, GemFireConnectionConf, GemFireConnection> tuple3 = createCommonMocks();
    when(mockTuple2RDD.sparkContext()).thenReturn(tuple3._1());
    String regionPath = "testregion";
    wrapper.saveToGemfire(regionPath, tuple3._2());
    verify(mockTuple2RDD, times(1)).sparkContext();
    verify(tuple3._1(), times(1))
        .runJob(eq(mockTuple2RDD), any(Function2.class), any(ClassTag.class));
  }
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in =
        sec.getBinaryBlockRDDHandleForVariable(input1.getName());

    // execute unary builtin operation
    UnaryOperator uop = (UnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop));

    // set output RDD
    updateUnaryOutputMatrixCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
  }
Example #18
  public static void main(String[] args) throws Exception {

    if (args.length < 1) {
      System.err.println("Usage: JavaWordCount <file>");
      System.exit(1);
    }

    SparkSession spark = SparkSession.builder().appName("JavaWordCount").getOrCreate();

    JavaRDD<String> lines = spark.read().text(args[0]).javaRDD();

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String s) {
                return Arrays.asList(SPACE.split(s)).iterator();
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1() + ": " + tuple._2());
    }
    spark.stop();
  }
Example #19
  public static <T, N extends Number> SortedCounts<T> create(final JavaPairRDD<T, N> rdd) {

    final SortedSet<Entry<T>> sortedEntries = new TreeSet<>();
    for (final Tuple2<T, N> tuple : rdd.collect()) {
      sortedEntries.add(new Entry<T>(tuple._2.longValue(), tuple._1));
    }
    final SortedCounts<T> result = new SortedCounts<T>(sortedEntries);

    return result;
  }
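A minimal sketch (an assumption, not shown in the source) of the Entry type that create relies on: the TreeSet above only sorts if Entry<T> is Comparable, and ordering by count with a tie-break on the value matches how the class is used to rank word counts.

  // Hypothetical nested type inside SortedCounts; constructor signature matches the call above.
  static class Entry<T> implements Comparable<Entry<T>>, java.io.Serializable {
    private final long count;
    private final T value;

    Entry(long count, T value) {
      this.count = count;
      this.value = value;
    }

    @Override
    public int compareTo(Entry<T> other) {
      // ascending by count (reverse the comparison for a highest-first ranking);
      // tie-break on the value's string form so equal counts are not collapsed by the TreeSet
      int byCount = Long.compare(count, other.count);
      return byCount != 0 ? byCount : String.valueOf(value).compareTo(String.valueOf(other.value));
    }
  }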
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: NaiveBayesExample <training_data> <test_data>");
      System.exit(1);
    }
    String training_data_path = args[0];
    // https://class.coursera.org/cloudapplications-001/forum/thread?thread_id=1387
    // String test_data_path = args[0];
    String test_data_path = args[1];

    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<LabeledPoint> train = sc.textFile(training_data_path).map(new DataToPoint());
    // JavaRDD<LabeledPoint> test = sc.textFile(training_data_path).map(new DataToPoint());
    JavaRDD<LabeledPoint> test = sc.textFile(test_data_path).map(new DataToPoint());

    final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0);

    JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(
            new PairFunction<LabeledPoint, Double, Double>() {
              public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
              }
            });

    double accuracy =
        predictionAndLabel
                .filter(
                    new Function<Tuple2<Double, Double>, Boolean>() {
                      public Boolean call(Tuple2<Double, Double> pl) {
                        return pl._1().equals(pl._2());
                      }
                    })
                .count()
            / (double) test.count();

    System.out.println(accuracy);

    sc.stop();
  }
Example #21
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("My App");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt");

    @SuppressWarnings("serial")
    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) {
                return Arrays.asList(s.split(" "));
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();

    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1() + "-> " + tuple._2());
    }
    sc.close();
  }
  @Test
  public void matchCitations() {

    // given

    when(inputCitationReader.readCitations(sparkContext, "/input/cit/path")).thenReturn(citations);
    when(inputCitationConverter.convertCitations(citations)).thenReturn(convertedCitations);
    when(convertedCitations.partitions()).thenReturn(Lists.newArrayList(16));
    when(convertedCitations.partitionBy(any())).thenReturn(repartitionedCitations);

    when(inputDocumentReader.readDocuments(sparkContext, "/input/doc/path")).thenReturn(documents);
    when(inputDocumentConverter.convertDocuments(documents)).thenReturn(convertedDocuments);
    when(convertedDocuments.partitions()).thenReturn(Lists.newArrayList(12));
    when(convertedDocuments.partitionBy(any())).thenReturn(repartitionedDocuments);

    when(coreCitationMatchingService.matchCitations(repartitionedCitations, repartitionedDocuments))
        .thenReturn(matched);
    when(outputConverter.convertMatchedCitations(matched)).thenReturn(convertedMatched);

    // execute

    citationMatchingService.matchCitations(
        sparkContext, "/input/cit/path", "/input/doc/path", "/output/path");

    // assert

    verify(inputCitationReader).readCitations(sparkContext, "/input/cit/path");
    verify(inputCitationConverter).convertCitations(citations);
    verify(convertedCitations).partitionBy(citationsPartitioner.capture());
    assertPartitioner(citationsPartitioner.getValue(), 5);

    verify(inputDocumentReader).readDocuments(sparkContext, "/input/doc/path");
    verify(inputDocumentConverter).convertDocuments(documents);
    verify(convertedDocuments).partitionBy(documentsPartitioner.capture());
    assertPartitioner(documentsPartitioner.getValue(), 5);

    verify(coreCitationMatchingService)
        .matchCitations(repartitionedCitations, repartitionedDocuments);

    verify(outputConverter).convertMatchedCitations(matched);
    verify(outputWriter).writeMatchedCitations(convertedMatched, "/output/path");
  }
 @SuppressWarnings("unchecked")
 @Test
 public void testMapStreamByKeyValueComparator() {
   List<Tuple2<String, Integer>> pairs =
       Lists.newArrayList(
           tuple2("a", 1), tuple2("b", 10), tuple2("a", 3), tuple2("b", 1), tuple2("c", 5));
   JavaPairRDD<String, Integer> p = jsc().parallelizePairs(pairs);
   GroupSorted<String, Integer> gs =
       new GroupSorted(p, new HashPartitioner(2), Ordering.natural().reverse());
   JavaPairRDD<String, Integer> max =
       gs.mapStreamByKey(
           new Function<Iterator<Integer>, Iterator<Integer>>() {
             public Iterator<Integer> call(Iterator<Integer> it) {
               return Iterators.singletonIterator(it.next());
             }
           });
   Assert.assertTrue(
       ImmutableSet.copyOf(max.collect())
           .equals(ImmutableSet.of(tuple2("a", 3), tuple2("b", 10), tuple2("c", 5))));
 }
Example #24
  @Override
  public void publishAdditionalModelData(
      JavaSparkContext sparkContext,
      PMML pmml,
      JavaRDD<String> newData,
      JavaRDD<String> pastData,
      Path modelParentPath,
      QueueProducer<String, String> modelUpdateQueue) {

    JavaRDD<String> allData = pastData == null ? newData : newData.union(pastData);

    log.info("Sending user / X data as model updates");
    String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
    JavaPairRDD<Integer, double[]> userRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString)));

    if (noKnownItems) {
      userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateQueue));
    } else {
      log.info("Sending known item data with model updates");
      JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true);
      userRDD
          .join(knownItems)
          .foreach(new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateQueue));
    }

    log.info("Sending item / Y data as model updates");
    String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
    JavaPairRDD<Integer, double[]> productRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString)));

    // For now, there is no use in sending known users for each item
    // if (noKnownItems) {
    productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateQueue));
    // } else {
    //  log.info("Sending known user data with model updates");
    //  JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false);
    //  productRDD.join(knownUsers).foreach(
    //      new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateQueue));
    // }
  }
  public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")));

    // Java 8 with lambdas: transform the collection of words into pairs (word, 1),
    // then count them by key
    JavaPairRDD<String, Integer> counts =
        words.mapToPair(t -> new Tuple2<String, Integer>(t, 1)).reduceByKey((x, y) -> x + y);

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
  }
  private static void printKeyValStat(JavaRDD<LabeledPoint> rdd) {
    JavaPairRDD<Double, Integer> mapRDD =
        rdd.mapToPair(
            new PairFunction<LabeledPoint, Double, Integer>() {

              public Tuple2<Double, Integer> call(LabeledPoint point) throws Exception {
                return new Tuple2<Double, Integer>(point.label(), 1);
              }
            });

    Map<Double, Iterable<Integer>> map = mapRDD.groupByKey().collectAsMap();
    System.out.printf("Number of records for labels: ");
    for (Double key : map.keySet()) {
      int count = 0;
      for (int val : map.get(key)) {
        count += val;
      }
      System.out.printf(" %f::%d \t ", key, count);
    }
    System.out.printf("\n");
  }
  private void getInvertedLexicon() {
    this.invertedLexicon =
        lexicon.mapToPair(
            new PairFunction<Tuple2<String, Long>, Long, String>() {

              private static final long serialVersionUID = 1L;

              @Override
              public Tuple2<Long, String> call(Tuple2<String, Long> wordEntry) throws Exception {
                return new Tuple2<Long, String>(wordEntry._2, wordEntry._1);
              }
            });
  }
Example #28
  public static void main(String[] args) {
    SparkConf sparkconf =
        new SparkConf()
            .setAppName("Simple Application")
            .setMaster("spark://1.245.77.10:7077")
            .set(
                "spark.driver.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set(
                "spark.executor.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set("fs.default.name", "file:///");
    JavaSparkContext sc = new JavaSparkContext(sparkconf);
    Configuration hadoopConfig = sc.hadoopConfiguration();
    hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

    /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
    System.out.println("Start counting parallelize...");
    long values = matrdd2.count();
    System.out.println("Value count of parallelize is " + values);*/

    JavaPairRDD<Long, Double> matrdd =
        sc.newAPIHadoopFile(
            "e:/tmp/vecRow03_x256.mat",
            JMATFileInputFormat.class,
            Long.class,
            Double.class,
            hadoopConfig);
    System.out.println("Start job...");
    long values = matrdd.count();
    System.out.println("Value count of hadoop is " + values);

    sc.stop();
    sc.close();
  }
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: Main <file>");
      System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaPairRDD<String, Double> dayArrivalDelayPair =
        lines.flatMapToPair(
            line -> {
              String[] splitLine = line.split(SPLIT_PATTERN);
              String key = splitLine.length == 0 ? "" : splitLine[0];
              Double value = splitLine.length < 2 ? 0.0 : Double.valueOf(splitLine[1]);
              return Arrays.asList(new Tuple2<>(key, value));
            });

    JavaPairRDD<String, AverageWrapper> dayAverageWrapper =
        dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1));

    JavaPairRDD<String, AverageWrapper> daysValueCount =
        dayAverageWrapper.reduceByKey(
            (aw1, aw2) ->
                new AverageWrapper(
                    aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount()));

    Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap();
    List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>();
    listResults.addAll(resultMap.entrySet());
    Collections.sort(
        listResults,
        (entry1, entry2) ->
            Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue()));

    for (Map.Entry<String, AverageWrapper> entry : listResults) {
      System.out.printf(
          "%s -> (%f, %d)\n",
          entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount());
    }

    //        JavaPairRDD<String, Double> resultRDD =
    //                daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() /
    // averageWrapper.getCount());
    //
    //        Map<String, Double> results = resultRDD.collectAsMap();

    //        List<Map.Entry<String, Double>> listResults = new ArrayList<>();
    //        listResults.addAll(results.entrySet());
    //        Collections.sort(listResults, (entry1, entry2) ->
    // entry1.getValue().compareTo(entry2.getValue()));
    //
    //        for (Map.Entry<String, Double> entry : listResults) {
    //            System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue());
    //        }
  }
  public static void main(String[] args) {
    String master = args[0];
    String appName = args[1];
    String path = args[2];

    SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines =
        sc.textFile(path)
            .filter(
                new Function<String, Boolean>() {
                  @Override
                  public Boolean call(String s) throws Exception {
                    return !s.isEmpty() && !s.contains("Total");
                  }
                });

    JavaRDD<String> usOnly =
        lines.filter(
            new Function<String, Boolean>() {
              @Override
              public Boolean call(String s) throws Exception {
                return s.contains("United States");
              }
            });

    JavaPairRDD<String, Integer> yearAndMedals =
        usOnly.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) throws Exception {
                String[] fields = s.split(",");
                return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8]));
              }
            });

    JavaPairRDD<String, Integer> reduced =
        yearAndMedals.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer accumulator, Integer currentValue) throws Exception {
                return accumulator + currentValue;
              }
            });

    JavaPairRDD<String, Integer> result =
        reduced.filter(
            new Function<Tuple2<String, Integer>, Boolean>() {
              @Override
              public Boolean call(Tuple2<String, Integer> tuple) throws Exception {
                return tuple._2 < 200;
              }
            });

    System.out.println();
    System.out.println(result.collect());
  }