@SuppressWarnings("unchecked") @Test public void testFoldLeftValueComparator() { List<Tuple2<Integer, TimeValue>> pairs = Lists.newArrayList( tuple2(5, new TimeValue(2, 0.5)), tuple2(1, new TimeValue(1, 1.2)), tuple2(5, new TimeValue(1, 1.0)), tuple2(1, new TimeValue(2, 2.0)), tuple2(1, new TimeValue(3, 3.0))); JavaPairRDD<Integer, TimeValue> p = jsc().parallelizePairs(pairs); GroupSorted<Integer, TimeValue> gs = new GroupSorted(p, new HashPartitioner(2), new TimeValueComparator()); JavaPairRDD<Integer, Double> emas = gs.foldLeftByKey( 0.0, new Function2<Double, TimeValue, Double>() { public Double call(Double acc, TimeValue tv) { return 0.8 * acc + 0.2 * tv.getValue(); } }); System.out.println(ImmutableSet.copyOf(emas.collect())); Assert.assertTrue( ImmutableSet.copyOf(emas.collect()) .equals(ImmutableSet.of(tuple2(1, 1.0736), tuple2(5, 0.26)))); }
@SuppressWarnings("serial") @Override public SortedCounts<String> execute(final JavaSparkContext spark) { final JavaRDD<String> textFile = spark.textFile(inputFile); final JavaRDD<String> words = textFile.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(final String rawJSON) throws TwitterException { final Status tweet = TwitterObjectFactory.createStatus(rawJSON); String text = tweet.getText(); return Arrays.asList(text.split(" ")); } }); final JavaPairRDD<String, Integer> pairs = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(final String s) { return new Tuple2<String, Integer>(s.toLowerCase(), 1); } }); final JavaPairRDD<String, Integer> counts = pairs.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(final Integer a, final Integer b) { return a + b; } }); return SortedCounts.create(counts); }
/** * This method builds a decision tree model * * @param sparkContext JavaSparkContext initialized with the application * @param modelID Model ID * @param trainingData Training data as a JavaRDD of LabeledPoints * @param testingData Testing data as a JavaRDD of LabeledPoints * @param workflow Machine learning workflow * @param mlModel Deployable machine learning model * @throws MLModelBuilderException */ private ModelSummary buildDecisionTreeModel( JavaSparkContext sparkContext, long modelID, JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow, MLModel mlModel, SortedMap<Integer, String> includedFeatures, Map<Integer, Integer> categoricalFeatureInfo) throws MLModelBuilderException { try { Map<String, String> hyperParameters = workflow.getHyperParameters(); DecisionTree decisionTree = new DecisionTree(); DecisionTreeModel decisionTreeModel = decisionTree.train( trainingData, getNoOfClasses(mlModel), categoricalFeatureInfo, hyperParameters.get(MLConstants.IMPURITY), Integer.parseInt(hyperParameters.get(MLConstants.MAX_DEPTH)), Integer.parseInt(hyperParameters.get(MLConstants.MAX_BINS))); // remove from cache trainingData.unpersist(); // add test data to cache testingData.cache(); JavaPairRDD<Double, Double> predictionsAndLabels = decisionTree.test(decisionTreeModel, testingData).cache(); ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary = SparkModelUtils.getClassClassificationModelSummary( sparkContext, testingData, predictionsAndLabels); // remove from cache testingData.unpersist(); mlModel.setModel(new MLDecisionTreeModel(decisionTreeModel)); classClassificationAndRegressionModelSummary.setFeatures( includedFeatures.values().toArray(new String[0])); classClassificationAndRegressionModelSummary.setAlgorithm( SUPERVISED_ALGORITHM.DECISION_TREE.toString()); MulticlassMetrics multiclassMetrics = getMulticlassMetrics(sparkContext, predictionsAndLabels); predictionsAndLabels.unpersist(); classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix( getMulticlassConfusionMatrix(multiclassMetrics, mlModel)); Double modelAccuracy = getModelAccuracy(multiclassMetrics); classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy); classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion()); return classClassificationAndRegressionModelSummary; } catch (Exception e) { throw new MLModelBuilderException( "An error occurred while building decision tree model: " + e.getMessage(), e); } }
@Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException, DMLUnsupportedOperationException { SparkExecutionContext sec = (SparkExecutionContext) ec; // get rdd and broadcast inputs JavaPairRDD<MatrixIndexes, MatrixBlock> inX = sec.getBinaryBlockRDDHandleForVariable(_input1.getName()); PartitionedBroadcastMatrix inV = sec.getBroadcastForVariable(_input2.getName()); // execute mapmmchain (guaranteed to have single output block) MatrixBlock out = null; if (_chainType == ChainType.XtXv) { RDDMapMMChainFunction fmmc = new RDDMapMMChainFunction(inV); JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapValues(fmmc); out = RDDAggregateUtils.sumStable(tmp); } else { // ChainType.XtwXv PartitionedBroadcastMatrix inW = sec.getBroadcastForVariable(_input3.getName()); RDDMapMMChainFunction2 fmmc = new RDDMapMMChainFunction2(inV, inW); JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = inX.mapToPair(fmmc); out = RDDAggregateUtils.sumStable(tmp); } // put output block into symbol table (no lineage because single block) // this also includes implicit maintenance of matrix characteristics sec.setMatrixOutput(_output.getName(), out); }
@Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException, DMLUnsupportedOperationException { SparkExecutionContext sec = (SparkExecutionContext) ec; String rddVar = (_type == CacheType.LEFT) ? input2.getName() : input1.getName(); String bcastVar = (_type == CacheType.LEFT) ? input1.getName() : input2.getName(); MatrixCharacteristics mc = sec.getMatrixCharacteristics(output.getName()); long rlen = sec.getScalarInput(_nrow.getName(), _nrow.getValueType(), _nrow.isLiteral()).getLongValue(); // get inputs JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar); PartitionedBroadcastMatrix in2 = sec.getBroadcastForVariable(bcastVar); // execute pmm instruction JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.flatMapToPair(new RDDPMMFunction(_type, in2, rlen, mc.getRowsPerBlock())); out = RDDAggregateUtils.sumByKeyStable(out); // put output RDD handle into symbol table sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), rddVar); sec.addLineageBroadcast(output.getName(), bcastVar); // update output statistics if not inferred updateBinaryMMOutputMatrixCharacteristics(sec, false); }
@Test public void wrapPairRDDFakeCtTest() { JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); JavaInterop ji = new JavaInterop(); RDD<Tuple2<String, Object>> rdd = helper.generateMiniPairRDD(); JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); List<Tuple2<String, Long>> expected = Arrays.asList(new Tuple2<String, Long>("panda", 12L)); assertEquals(expected, prdd.collect()); }
/** * @param pfid * @param itervar * @param matrixvar * @param program * @param resultFile * @param input * @param ec * @param dpf * @param oi * @param tSparseCol * @param enableCPCaching * @param numReducers * @return * @throws DMLRuntimeException * @throws DMLUnsupportedOperationException */ public static RemoteParForJobReturn runJob( long pfid, String itervar, String matrixvar, String program, String resultFile, MatrixObject input, ExecutionContext ec, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, // config params boolean enableCPCaching, int numReducers) // opt params throws DMLRuntimeException, DMLUnsupportedOperationException { String jobname = "ParFor-DPESP"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; SparkExecutionContext sec = (SparkExecutionContext) ec; JavaSparkContext sc = sec.getSparkContext(); // prepare input parameters MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData(); MatrixCharacteristics mc = md.getMatrixCharacteristics(); InputInfo ii = InputInfo.BinaryBlockInputInfo; // initialize accumulators for tasks/iterations Accumulator<Integer> aTasks = sc.accumulator(0); Accumulator<Integer> aIters = sc.accumulator(0); JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar); DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf); RemoteDPParForSparkWorker efun = new RemoteDPParForSparkWorker( program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters); List<Tuple2<Long, String>> out = in.flatMapToPair(dpfun) // partition the input blocks .groupByKey(numReducers) // group partition blocks .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup .collect(); // get output handles // de-serialize results LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG); int numTasks = aTasks.value(); // get accumulator value int numIters = aIters.value(); // get accumulator value // create output symbol table entries RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results); // maintain statistics Statistics.incrementNoOfCompiledSPInst(); Statistics.incrementNoOfExecutedSPInst(); if (DMLScript.STATISTICS) { Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0); } return ret; }
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite"); JavaRDD<String> lines = sc.textFile("log.txt"); JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" "))); JavaPairRDD<String, Integer> counts = words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y); counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2())); }
/** * Class used to analyze theme life cycles. * * @param hmmInput The hmmInput whose background model, lexicon and word stream are used. Themes must be added before any analytics can be done. */ public LifeCycleAnalyserSpark(HmmInputFromParser hmmInput) { this.wordStream = hmmInput.wordStream; this.lexicon = hmmInput.lexicon; this.lexiconAsMap = lexicon.collectAsMap(); getInvertedLexicon(); numberOfThemes = 0L; numberOfWords = lexicon.count(); // themes = new ArrayList<double[]>(); setBackgroundModelAsThemebyId(hmmInput.backgroundModelById); }
/** * This method builds a naive bayes model * * @param sparkContext JavaSparkContext initialized with the application * @param modelID Model ID * @param trainingData Training data as a JavaRDD of LabeledPoints * @param testingData Testing data as a JavaRDD of LabeledPoints * @param workflow Machine learning workflow * @param mlModel Deployable machine learning model * @throws MLModelBuilderException */ private ModelSummary buildNaiveBayesModel( JavaSparkContext sparkContext, long modelID, JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow, MLModel mlModel, SortedMap<Integer, String> includedFeatures) throws MLModelBuilderException { try { Map<String, String> hyperParameters = workflow.getHyperParameters(); NaiveBayesClassifier naiveBayesClassifier = new NaiveBayesClassifier(); NaiveBayesModel naiveBayesModel = naiveBayesClassifier.train( trainingData, Double.parseDouble(hyperParameters.get(MLConstants.LAMBDA))); // remove from cache trainingData.unpersist(); // add test data to cache testingData.cache(); JavaPairRDD<Double, Double> predictionsAndLabels = naiveBayesClassifier.test(naiveBayesModel, testingData).cache(); ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary = SparkModelUtils.getClassClassificationModelSummary( sparkContext, testingData, predictionsAndLabels); // remove from cache testingData.unpersist(); mlModel.setModel(new MLClassificationModel(naiveBayesModel)); classClassificationAndRegressionModelSummary.setFeatures( includedFeatures.values().toArray(new String[0])); classClassificationAndRegressionModelSummary.setAlgorithm( SUPERVISED_ALGORITHM.NAIVE_BAYES.toString()); MulticlassMetrics multiclassMetrics = getMulticlassMetrics(sparkContext, predictionsAndLabels); predictionsAndLabels.unpersist(); classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix( getMulticlassConfusionMatrix(multiclassMetrics, mlModel)); Double modelAccuracy = getModelAccuracy(multiclassMetrics); classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy); classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion()); return classClassificationAndRegressionModelSummary; } catch (Exception e) { throw new MLModelBuilderException( "An error occurred while building naive bayes model: " + e.getMessage(), e); } }
@SuppressWarnings("unchecked") @Test public void testGroupSortNoEffect() { List<Tuple2<Integer, Integer>> pairs = Lists.newArrayList(tuple2(1, 2), tuple2(2, 3), tuple2(1, 3), tuple2(3, 1), tuple2(2, 1)); JavaPairRDD<Integer, Integer> p = jsc().parallelizePairs(pairs); GroupSorted<Integer, Integer> gs = new GroupSorted(p, new HashPartitioner(2), Ordering.natural()); GroupSorted<Integer, Integer> gs1 = new GroupSorted(gs, new HashPartitioner(2), Ordering.natural()); Assert.assertTrue(JavaPairRDD.toRDD(gs) == JavaPairRDD.toRDD(gs1)); }
public static JavaRDD<IAtomContainer> sdfFilesToMols(String path, JavaSparkContext ctx) { JavaPairRDD<String, String> sdfFiles = ctx.wholeTextFiles(path); FlatMapFunction<Tuple2<String, String>, IAtomContainer> sdfBlockBuilder = new FlatMapFunction<Tuple2<String, String>, IAtomContainer>() { public Iterable<IAtomContainer> call(Tuple2<String, String> sdfFile) throws Exception { return SDFUtils.parseSDF(sdfFile._2()); } }; JavaRDD<IAtomContainer> molecules = sdfFiles.flatMap(sdfBlockBuilder); return molecules; }
private void setBackgroundModelAsThemebyId(JavaPairRDD<Long, Double> backgroundModelById) { List<Tuple2<Long, Double>> bgCollected = backgroundModelById.collect(); bgAsArray = new double[(int) numberOfWords]; for (Tuple2<Long, Double> tuple : bgCollected) { bgAsArray[tuple._1.intValue()] = tuple._2; } }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaWordCount <master> <file>"); System.exit(1); } JavaSparkContext ctx = new JavaSparkContext( args[0], "JavaWordCount", System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaWordCount.class)); JavaRDD<String> lines = ctx.textFile(args[1], 1); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) { return Arrays.asList(SPACE.split(s)); } }); JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1 + ": " + tuple._2); } System.exit(0); }
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: JavaWordCount <input_file> <output_file>"); System.exit(1); } SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); JavaRDD<String> lines = ctx.textFile(args[0], 1); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String s) { return Arrays.asList(SPACE.split(s)).iterator(); } }); JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); /* List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?,?> tuple : output) { System.out.println(tuple._1() + ": " + tuple._2()); } */ counts.saveAsTextFile(args[1]); ctx.stop(); }
@Test @SuppressWarnings("unchecked") public void testJavaPairRDDFunctions() throws Exception { JavaPairRDD<String, Integer> mockPairRDD = mock(JavaPairRDD.class); RDD<Tuple2<String, Integer>> mockTuple2RDD = mock(RDD.class); when(mockPairRDD.rdd()).thenReturn(mockTuple2RDD); GemFireJavaPairRDDFunctions wrapper = javaFunctions(mockPairRDD); assertTrue(mockTuple2RDD == wrapper.rddf.rdd()); Tuple3<SparkContext, GemFireConnectionConf, GemFireConnection> tuple3 = createCommonMocks(); when(mockTuple2RDD.sparkContext()).thenReturn(tuple3._1()); String regionPath = "testregion"; wrapper.saveToGemfire(regionPath, tuple3._2()); verify(mockTuple2RDD, times(1)).sparkContext(); verify(tuple3._1(), times(1)) .runJob(eq(mockTuple2RDD), any(Function2.class), any(ClassTag.class)); }
@Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException, DMLUnsupportedOperationException { SparkExecutionContext sec = (SparkExecutionContext) ec; // get input JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName()); // execute unary builtin operation UnaryOperator uop = (UnaryOperator) _optr; JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop)); // set output RDD updateUnaryOutputMatrixCharacteristics(sec); sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); }
public static void main(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage: JavaWordCount <file>"); System.exit(1); } SparkSession spark = SparkSession.builder().appName("JavaWordCount").getOrCreate(); JavaRDD<String> lines = spark.read().text(args[0]).javaRDD(); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String s) { return Arrays.asList(SPACE.split(s)).iterator(); } }); JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }); JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1() + ": " + tuple._2()); } spark.stop(); }
public static <T, N extends Number> SortedCounts<T> create(final JavaPairRDD<T, N> rdd) { final SortedSet<Entry<T>> sortedEntries = new TreeSet<>(); for (final Tuple2<T, N> tuple : rdd.collect()) { sortedEntries.add(new Entry<T>(tuple._2.longValue(), tuple._1)); } final SortedCounts<T> result = new SortedCounts<T>(sortedEntries); return result; }
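The method above collects the whole pair RDD onto the driver before sorting. If the data were too large for that, the ordering could instead be done on the cluster before collecting; a minimal sketch of that alternative is given below, assuming the same JavaPairRDD<T, N> input (the helper name distributedSortByCount is illustrative and not part of SortedCounts):
public static <T, N extends Number> List<Tuple2<Long, T>> distributedSortByCount(final JavaPairRDD<T, N> rdd) {
  // swap (item, count) into (count, item) and sort by count descending on the cluster
  return rdd.mapToPair(t -> new Tuple2<>(t._2().longValue(), t._1()))
      .sortByKey(false)
      .collect();
}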
public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: NaiveBayesExample <training_data> <test_data>"); System.exit(1); } String training_data_path = args[0]; // https://class.coursera.org/cloudapplications-001/forum/thread?thread_id=1387 // String test_data_path = args[0]; String test_data_path = args[1]; SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesExample"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<LabeledPoint> train = sc.textFile(training_data_path).map(new DataToPoint()); // JavaRDD<LabeledPoint> test = sc.textFile(training_data_path).map(new DataToPoint()); JavaRDD<LabeledPoint> test = sc.textFile(test_data_path).map(new DataToPoint()); final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0); JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair( new PairFunction<LabeledPoint, Double, Double>() { public Tuple2<Double, Double> call(LabeledPoint p) { return new Tuple2<Double, Double>(model.predict(p.features()), p.label()); } }); double accuracy = predictionAndLabel .filter( new Function<Tuple2<Double, Double>, Boolean>() { public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); } }) .count() / (double) test.count(); System.out.println(accuracy); sc.stop(); }
public static void main(String[] args) { SparkConf conf = new SparkConf().setMaster("local").setAppName("My App"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt"); @SuppressWarnings("serial") JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1() + "-> " + tuple._2()); } sc.close(); }
@Test public void matchCitations() { // given when(inputCitationReader.readCitations(sparkContext, "/input/cit/path")).thenReturn(citations); when(inputCitationConverter.convertCitations(citations)).thenReturn(convertedCitations); when(convertedCitations.partitions()).thenReturn(Lists.newArrayList(16)); when(convertedCitations.partitionBy(any())).thenReturn(repartitionedCitations); when(inputDocumentReader.readDocuments(sparkContext, "/input/doc/path")).thenReturn(documents); when(inputDocumentConverter.convertDocuments(documents)).thenReturn(convertedDocuments); when(convertedDocuments.partitions()).thenReturn(Lists.newArrayList(12)); when(convertedDocuments.partitionBy(any())).thenReturn(repartitionedDocuments); when(coreCitationMatchingService.matchCitations(repartitionedCitations, repartitionedDocuments)) .thenReturn(matched); when(outputConverter.convertMatchedCitations(matched)).thenReturn(convertedMatched); // execute citationMatchingService.matchCitations( sparkContext, "/input/cit/path", "/input/doc/path", "/output/path"); // assert verify(inputCitationReader).readCitations(sparkContext, "/input/cit/path"); verify(inputCitationConverter).convertCitations(citations); verify(convertedCitations).partitionBy(citationsPartitioner.capture()); assertPartitioner(citationsPartitioner.getValue(), 5); verify(inputDocumentReader).readDocuments(sparkContext, "/input/doc/path"); verify(inputDocumentConverter).convertDocuments(documents); verify(convertedDocuments).partitionBy(documentsPartitioner.capture()); assertPartitioner(documentsPartitioner.getValue(), 5); verify(coreCitationMatchingService) .matchCitations(repartitionedCitations, repartitionedDocuments); verify(outputConverter).convertMatchedCitations(matched); verify(outputWriter).writeMatchedCitations(convertedMatched, "/output/path"); }
@SuppressWarnings("unchecked") @Test public void testMapStreamByKeyValueComparator() { List<Tuple2<String, Integer>> pairs = Lists.newArrayList( tuple2("a", 1), tuple2("b", 10), tuple2("a", 3), tuple2("b", 1), tuple2("c", 5)); JavaPairRDD<String, Integer> p = jsc().parallelizePairs(pairs); GroupSorted<String, Integer> gs = new GroupSorted(p, new HashPartitioner(2), Ordering.natural().reverse()); JavaPairRDD<String, Integer> max = gs.mapStreamByKey( new Function<Iterator<Integer>, Iterator<Integer>>() { public Iterator<Integer> call(Iterator<Integer> it) { return Iterators.singletonIterator(it.next()); } }); Assert.assertTrue( ImmutableSet.copyOf(max.collect()) .equals(ImmutableSet.of(tuple2("a", 3), tuple2("b", 10), tuple2("c", 5)))); }
@Override public void publishAdditionalModelData( JavaSparkContext sparkContext, PMML pmml, JavaRDD<String> newData, JavaRDD<String> pastData, Path modelParentPath, QueueProducer<String, String> modelUpdateQueue) { JavaRDD<String> allData = pastData == null ? newData : newData.union(pastData); log.info("Sending user / X data as model updates"); String xPathString = PMMLUtils.getExtensionValue(pmml, "X"); JavaPairRDD<Integer, double[]> userRDD = fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString))); if (noKnownItems) { userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateQueue)); } else { log.info("Sending known item data with model updates"); JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true); userRDD .join(knownItems) .foreach(new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateQueue)); } log.info("Sending item / Y data as model updates"); String yPathString = PMMLUtils.getExtensionValue(pmml, "Y"); JavaPairRDD<Integer, double[]> productRDD = fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString))); // For now, there is no use in sending known users for each item // if (noKnownItems) { productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateQueue)); // } else { // log.info("Sending known user data with model updates"); // JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false); // productRDD.join(knownUsers).foreach( // new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateQueue)); // } }
public static void wordCountJava8(String filename) { // Define a configuration to use to interact with Spark SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App"); // Create a Java version of the Spark Context from the configuration JavaSparkContext sc = new JavaSparkContext(conf); // Load the input data, which is a text file read from the command line JavaRDD<String> input = sc.textFile(filename); // Java 8 with lambdas: split the input string into words JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" "))); // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y); // Save the word count back out to a text file, causing evaluation. counts.saveAsTextFile("output"); }
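For comparison, a minimal sketch of the same word count against the Spark 2.x Java API, where flatMap expects an Iterator rather than an Iterable; the file names are placeholders:
public static void wordCountJava8Spark2(String filename) {
  SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaPairRDD<String, Integer> counts = sc.textFile(filename)
      .flatMap(s -> Arrays.asList(s.split(" ")).iterator()) // Spark 2.x: the lambda must return an Iterator
      .mapToPair(w -> new Tuple2<>(w, 1))
      .reduceByKey((x, y) -> x + y);
  counts.saveAsTextFile("output");
  sc.stop();
}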
private static void printKeyValStat(JavaRDD<LabeledPoint> rdd) { JavaPairRDD<Double, Integer> mapRDD = rdd.mapToPair( new PairFunction<LabeledPoint, Double, Integer>() { public Tuple2<Double, Integer> call(LabeledPoint point) throws Exception { return new Tuple2<Double, Integer>(point.label(), 1); } }); Map<Double, Iterable<Integer>> map = mapRDD.groupByKey().collectAsMap(); System.out.printf("Number of records for labels: "); for (Double key : map.keySet()) { int count = 0; for (int val : map.get(key)) { count += val; } System.out.printf(" %f::%d \t ", key, count); } System.out.printf("\n"); }
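For comparison, a sketch of the same per-label count using reduceByKey instead of groupByKey, which combines partial counts before the shuffle rather than shipping full value lists; the method name printLabelCounts is illustrative, not part of the original class:
private static void printLabelCounts(JavaRDD<LabeledPoint> rdd) {
  Map<Double, Integer> counts = rdd
      .mapToPair(point -> new Tuple2<>(point.label(), 1))
      .reduceByKey((a, b) -> a + b) // partial counts are merged per label map-side before shuffling
      .collectAsMap();
  System.out.printf("Number of records for labels: ");
  for (Map.Entry<Double, Integer> entry : counts.entrySet()) {
    System.out.printf(" %f::%d \t ", entry.getKey(), entry.getValue());
  }
  System.out.printf("\n");
}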
private void getInvertedLexicon() { this.invertedLexicon = lexicon.mapToPair( new PairFunction<Tuple2<String, Long>, Long, String>() { private static final long serialVersionUID = 1L; @Override public Tuple2<Long, String> call(Tuple2<String, Long> wordEntry) throws Exception { return new Tuple2<Long, String>(wordEntry._2, wordEntry._1); } }); }
public static void main(String[] args) { SparkConf sparkconf = new SparkConf() .setAppName("Simple Application") .setMaster("spark://1.245.77.10:7077") .set( "spark.driver.extraClassPath", "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*") .set( "spark.executor.extraClassPath", "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*") .set("fs.default.name", "file:///"); JavaSparkContext sc = new JavaSparkContext(sparkconf); Configuration hadoopConfig = sc.hadoopConfiguration(); hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar"); // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar"); /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0)); System.out.println("Start counting parallelize..."); long values = matrdd2.count(); System.out.println("Value count of parallelize is " + values);*/ JavaPairRDD<Long, Double> matrdd = sc.newAPIHadoopFile( "e:/tmp/vecRow03_x256.mat", JMATFileInputFormat.class, Long.class, Double.class, hadoopConfig); System.out.println("Start job..."); long values = matrdd.count(); System.out.println("Value count of hadoop is " + values); sc.stop(); sc.close(); }
public static void main(String[] args) { if (args.length == 0) { System.err.println("Usage: Main <file>"); System.exit(1); } SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(args[0]); JavaPairRDD<String, Double> dayArrivalDelayPair = lines.flatMapToPair( line -> { String[] splitLine = line.split(SPLIT_PATTERN); String key = splitLine.length == 0 ? "" : splitLine[0]; Double value = splitLine.length < 2 ? 0.0 : Double.valueOf(splitLine[1]); return Arrays.asList(new Tuple2<>(key, value)); }); JavaPairRDD<String, AverageWrapper> dayAverageWrapper = dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1)); JavaPairRDD<String, AverageWrapper> daysValueCount = dayAverageWrapper.reduceByKey( (aw1, aw2) -> new AverageWrapper( aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount())); Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap(); List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>(); listResults.addAll(resultMap.entrySet()); Collections.sort( listResults, (entry1, entry2) -> Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue())); for (Map.Entry<String, AverageWrapper> entry : listResults) { System.out.printf( "%s -> (%f, %d)\n", entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount()); } // JavaPairRDD<String, Double> resultRDD = // daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() / // averageWrapper.getCount()); // // Map<String, Double> results = resultRDD.collectAsMap(); // List<Map.Entry<String, Double>> listResults = new ArrayList<>(); // listResults.addAll(results.entrySet()); // Collections.sort(listResults, (entry1, entry2) -> // entry1.getValue().compareTo(entry2.getValue())); // // for (Map.Entry<String, Double> entry : listResults) { // System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue()); // } }
public static void main(String[] args) { String master = args[0]; String appName = args[1]; String path = args[2]; SparkConf conf = new SparkConf().setAppName(appName).setMaster(master); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(path) .filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return !s.isEmpty() && !s.contains("Total"); } }); JavaRDD<String> usOnly = lines.filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return s.contains("United States"); } }); JavaPairRDD<String, Integer> yearAndMedals = usOnly.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) throws Exception { String[] fields = s.split(","); return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8])); } }); JavaPairRDD<String, Integer> reduced = yearAndMedals.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer accumulator, Integer currentValue) throws Exception { return accumulator + currentValue; } }); JavaPairRDD<String, Integer> result = reduced.filter( new Function<Tuple2<String, Integer>, Boolean>() { @Override public Boolean call(Tuple2<String, Integer> tuple) throws Exception { return tuple._2 < 200; } }); System.out.println(); System.out.println(result.collect()); }