@Test
public void testJavaFunctions1() throws Exception {
  SparkContext sc = mock(SparkContext.class);
  JavaSparkContext jsc = mock(JavaSparkContext.class);
  when(jsc.sc()).thenReturn(sc);
  SparkContextJavaFunctions scjf = javaFunctions(jsc);
  assertThat(scjf.sparkContext, is(jsc.sc()));
}
@Test
public void testJavaSparkContextFunctions() throws Exception {
  SparkContext mockSparkContext = mock(SparkContext.class);
  JavaSparkContext mockJavaSparkContext = mock(JavaSparkContext.class);
  when(mockJavaSparkContext.sc()).thenReturn(mockSparkContext);
  GemFireJavaSparkContextFunctions wrapper = javaFunctions(mockJavaSparkContext);
  // assertSame checks reference equality and gives a clearer failure message
  // than assertTrue(a == b).
  assertSame(mockSparkContext, wrapper.sc);
}
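// Both tests above exercise a static javaFunctions(...) factory that unwraps
// a JavaSparkContext to its underlying Scala SparkContext. The factories
// themselves are not shown; a minimal sketch of the shape being tested (the
// constructor call is an assumption, not the connectors' actual code):
public static SparkContextJavaFunctions javaFunctions(JavaSparkContext javaSparkContext) {
  // Delegate to the Scala SparkContext wrapped by the Java API.
  return new SparkContextJavaFunctions(javaSparkContext.sc());
}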
public static synchronized JavaSparkContext getSparkContext(SparkPipelineOptions options) {
  SparkContextOptions contextOptions = options.as(SparkContextOptions.class);
  // reuse should be ignored if the context is provided.
  if (Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT)
      && !contextOptions.getUsesProvidedSparkContext()) {
    // if the context is null or stopped for some reason, re-create it.
    if (sparkContext == null || sparkContext.sc().isStopped()) {
      sparkContext = createSparkContext(contextOptions);
      sparkMaster = options.getSparkMaster();
    } else if (!options.getSparkMaster().equals(sparkMaster)) {
      throw new IllegalArgumentException(
          String.format(
              "Cannot reuse spark context "
                  + "with different spark master URL. Existing: %s, requested: %s.",
              sparkMaster, options.getSparkMaster()));
    }
    return sparkContext;
  } else {
    return createSparkContext(contextOptions);
  }
}
private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
  if (contextOptions.getUsesProvidedSparkContext()) {
    LOG.info("Using a provided Spark Context");
    JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
    if (jsc == null || jsc.sc().isStopped()) {
      LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
      throw new RuntimeException("The provided Spark context was not created or was stopped");
    }
    return jsc;
  } else {
    LOG.info("Creating a brand new Spark Context.");
    SparkConf conf = new SparkConf();
    if (!conf.contains("spark.master")) {
      // set master if not set.
      conf.setMaster(contextOptions.getSparkMaster());
    }
    conf.setAppName(contextOptions.getAppName());
    // register immutable collections serializers because the SDK uses them.
    conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
    conf.set("spark.serializer", KryoSerializer.class.getName());
    return new JavaSparkContext(conf);
  }
}
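// A minimal usage sketch for the factory above, assuming Beam's
// PipelineOptionsFactory is on the classpath; the master URL, app name, and
// the externally created context are placeholder values for illustration.
SparkContextOptions options = PipelineOptionsFactory.as(SparkContextOptions.class);
options.setSparkMaster("local[2]");
options.setAppName("context-factory-demo");
// Hand the runner an externally managed context instead of letting it create one.
options.setUsesProvidedSparkContext(true);
options.setProvidedSparkContext(new JavaSparkContext("local[2]", "context-factory-demo"));
JavaSparkContext jsc = getSparkContext(options);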
public static void main(String[] args) {
  // Path to the saved NaiveBayes model.
  String pathResults = "results";
  String pathToCategories = "values.txt";
  String pathToWords = "words.txt";
  File file = new File(pathToWords);
  HashMap<Double, String> categoriesDict = new HashMap<>();
  HashMap<String, String> resultado = new HashMap<>();
  FileInputStream fis = null;
  try {
    fis = new FileInputStream(pathToCategories);
    // Construct BufferedReader from InputStreamReader.
    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
    String line = null;
    while ((line = br.readLine()) != null) {
      String[] words = line.split(" ");
      categoriesDict.put(Double.valueOf(words[0]), words[1]);
    }
    br.close();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  // Directory holding the category files to classify.
  String pathCategories = "src/main/resources/categoriestest/";
  // Basic application configuration.
  SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");
  // Create the context.
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), pathResults);
  HashMap<String, String> dictionary = loadDictionary();
  JavaRDD<String> fileWords = null;
  if (file.exists()) {
    JavaRDD<String> input = jsc.textFile(pathToWords);
    fileWords =
        input.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
              }
            });
  } else {
    System.out.println("Error: the words file " + pathToWords + " does not exist.");
    System.exit(-1);
  }
  // collect() returns a List; avoid the unchecked ArrayList cast.
  List<String> aFileWords = fileWords.collect();
  // Iterate over the files that contain the categories to classify.
  File dir = new File(pathCategories);
  for (File f : dir.listFiles()) {
    JavaRDD<String> input = jsc.textFile(f.getPath());
    JavaRDD<String> words =
        input.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
              }
            });
    JavaPairRDD<String, Double> wordCount = Reducer.parseWords(words, dictionary);
    List<Tuple2<String, Double>> total = wordCount.collect();
    // Compute the count once instead of re-triggering the Spark job per element.
    long numWords = wordCount.count();
    List<Tuple2<String, Double>> elementsRemoved = new ArrayList<>();
    for (Tuple2<String, Double> t : total) {
      if (!t._1.equals("")) {
        elementsRemoved.add(new Tuple2<>(t._1, t._2 / numWords));
      }
    }
    ArrayList<Tuple2<String, Double>> freqFinal = new ArrayList<>();
    for (String s : aFileWords) {
      boolean found = false;
      for (Tuple2<String, Double> t : elementsRemoved) {
        if (t._1.equals(s)) {
          found = true;
          freqFinal.add(t);
          break;
        }
      }
      if (!found) {
        freqFinal.add(new Tuple2<String, Double>(s, 0.0));
      }
    }
    double[] v = new double[freqFinal.size()];
    for (int i = 0; i < freqFinal.size(); i++) {
      v[i] = freqFinal.get(i)._2;
    }
    org.apache.spark.mllib.linalg.Vector vector = Vectors.dense(v);
    double d = model.predict(vector);
    System.out.println(categoriesDict.get(d));
    resultado.put(f.getName(), categoriesDict.get(d));
  }
  jsc.stop();
  try {
    Thread.sleep(2000);
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
  for (String key : resultado.keySet()) {
    System.out.println(key + " - " + resultado.get(key));
  }
}
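// loadDictionary() is called above but not shown. A plausible minimal sketch,
// assuming a space-separated two-column file in the same style as values.txt;
// the "dictionary.txt" path is an assumption:
private static HashMap<String, String> loadDictionary() {
  HashMap<String, String> dictionary = new HashMap<>();
  try (BufferedReader br = new BufferedReader(new FileReader("dictionary.txt"))) {
    String line;
    while ((line = br.readLine()) != null) {
      String[] parts = line.split(" ");
      if (parts.length == 2) {
        dictionary.put(parts[0], parts[1]);
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return dictionary;
}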
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkRecommendationEngine");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // Load and parse the data; each line is expected to be "user,product,rating".
  String path = "data/test.csv";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<Rating> ratings =
      data.map(
          new Function<String, Rating>() {
            public Rating call(String s) {
              String[] sarray = s.split(",");
              return new Rating(
                  Integer.parseInt(sarray[0]),
                  Integer.parseInt(sarray[1]),
                  Double.parseDouble(sarray[2]));
            }
          });

  // Build the recommendation model using ALS.
  int rank = 10;
  int numIterations = 10;
  MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);

  // Evaluate the model on the rating data.
  JavaRDD<Tuple2<Object, Object>> userProducts =
      ratings.map(
          new Function<Rating, Tuple2<Object, Object>>() {
            public Tuple2<Object, Object> call(Rating r) {
              return new Tuple2<Object, Object>(r.user(), r.product());
            }
          });
  JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions =
      JavaPairRDD.fromJavaRDD(
          model
              .predict(JavaRDD.toRDD(userProducts))
              .toJavaRDD()
              .map(
                  new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                    public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
                      return new Tuple2<Tuple2<Integer, Integer>, Double>(
                          new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
                    }
                  }));
  JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
      JavaPairRDD.fromJavaRDD(
              ratings.map(
                  new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                    public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
                      return new Tuple2<Tuple2<Integer, Integer>, Double>(
                          new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
                    }
                  }))
          .join(predictions)
          .values();
  double MSE =
      JavaDoubleRDD.fromRDD(
              ratesAndPreds
                  .map(
                      new Function<Tuple2<Double, Double>, Object>() {
                        public Object call(Tuple2<Double, Double> pair) {
                          Double err = pair._1() - pair._2();
                          return err * err;
                        }
                      })
                  .rdd())
          .mean();
  System.out.println("Mean Squared Error = " + MSE);

  // Save and load the model.
  model.save(sc.sc(), "myModelPath");
  MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(sc.sc(), "myModelPath");
}
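// A brief follow-up usage sketch: MatrixFactorizationModel can also serve
// top-N recommendations directly via recommendProducts. The user id (1) and
// N (5) are placeholder values for illustration.
Rating[] topFive = model.recommendProducts(1, 5);
for (Rating r : topFive) {
  System.out.println(
      "user " + r.user() + " -> product " + r.product() + " (score " + r.rating() + ")");
}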
public static void main(String[] args) {
  // parse the arguments
  Params params = parse(args);
  SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext jsql = new SQLContext(jsc);

  // configure the base classifier
  LogisticRegression classifier =
      new LogisticRegression()
          .setMaxIter(params.maxIter)
          .setTol(params.tol)
          .setFitIntercept(params.fitIntercept);
  if (params.regParam != null) {
    classifier.setRegParam(params.regParam);
  }
  if (params.elasticNetParam != null) {
    classifier.setElasticNetParam(params.elasticNetParam);
  }

  // instantiate the One Vs Rest Classifier
  OneVsRest ovr = new OneVsRest().setClassifier(classifier);

  String input = params.input;
  RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
  RDD<LabeledPoint> train;
  RDD<LabeledPoint> test;

  // compute the train/test split: if testInput is not provided, use part of input
  String testInput = params.testInput;
  if (testInput != null) {
    train = inputData;
    // compute the number of features in the training set.
    int numFeatures = inputData.first().features().size();
    test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
  } else {
    double f = params.fracTest;
    RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[] {1 - f, f}, 12345);
    train = tmp[0];
    test = tmp[1];
  }

  // train the multiclass model
  DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
  OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());

  // score the model on test data
  DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
  DataFrame predictions = ovrModel.transform(testDataFrame.cache()).select("prediction", "label");

  // obtain metrics
  MulticlassMetrics metrics = new MulticlassMetrics(predictions);
  StructField predictionColSchema = predictions.schema().apply("prediction");
  Integer numClasses = (Integer) MetadataUtils.getNumClasses(predictionColSchema).get();

  // compute the false positive rate per label
  StringBuilder results = new StringBuilder();
  results.append("label\tfpr\n");
  for (int label = 0; label < numClasses; label++) {
    results.append(label);
    results.append("\t");
    results.append(metrics.falsePositiveRate((double) label));
    results.append("\n");
  }

  Matrix confusionMatrix = metrics.confusionMatrix();
  // output the Confusion Matrix
  System.out.println("Confusion Matrix");
  System.out.println(confusionMatrix);
  System.out.println();
  System.out.println(results);
  jsc.stop();
}
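// parse(args) and the Params holder are not shown above; a hypothetical
// sketch with fields matching the references in main. All default values
// here are assumptions for illustration, not the example's real defaults.
private static class Params {
  String input;                  // path to the LibSVM training data
  String testInput = null;       // optional separate test set
  int maxIter = 100;
  double tol = 1E-6;
  boolean fitIntercept = true;
  Double regParam = null;        // null means "leave the estimator default"
  Double elasticNetParam = null;
  double fracTest = 0.2;         // used only when testInput is null
}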