@Test
 public void testJavaFunctions1() throws Exception {
   SparkContext sc = mock(SparkContext.class);
   JavaSparkContext jsc = mock(JavaSparkContext.class);
   when(jsc.sc()).thenReturn(sc);
   SparkContextJavaFunctions scjf = javaFunctions(jsc);
   assertThat(scjf.sparkContext, is(jsc.sc()));
 }
 @Test
 public void testJavaSparkContextFunctions() throws Exception {
   SparkContext mockSparkContext = mock(SparkContext.class);
   JavaSparkContext mockJavaSparkContext = mock(JavaSparkContext.class);
   when(mockJavaSparkContext.sc()).thenReturn(mockSparkContext);
   GemFireJavaSparkContextFunctions wrapper = javaFunctions(mockJavaSparkContext);
    assertSame(mockSparkContext, wrapper.sc);
 }
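 Both tests stub sc() on a mocked JavaSparkContext and assert that the wrapper captured the underlying SparkContext. A hedged variant of the same pattern that additionally verifies the stub was consulted (verify and atLeastOnce come from org.mockito.Mockito; the test name is illustrative):
 @Test
 public void testWrapperConsultsJavaSparkContext() throws Exception {
   SparkContext sc = mock(SparkContext.class);
   JavaSparkContext jsc = mock(JavaSparkContext.class);
   when(jsc.sc()).thenReturn(sc);
   javaFunctions(jsc);
   // the wrapper should have read the underlying context at least once
   verify(jsc, atLeastOnce()).sc();
 }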
 public static synchronized JavaSparkContext getSparkContext(SparkPipelineOptions options) {
   SparkContextOptions contextOptions = options.as(SparkContextOptions.class);
   // reuse should be ignored if the context is provided.
   if (Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT)
       && !contextOptions.getUsesProvidedSparkContext()) {
     // if the context is null or stopped for some reason, re-create it.
     if (sparkContext == null || sparkContext.sc().isStopped()) {
       sparkContext = createSparkContext(contextOptions);
       sparkMaster = options.getSparkMaster();
     } else if (!options.getSparkMaster().equals(sparkMaster)) {
       throw new IllegalArgumentException(
           String.format(
               "Cannot reuse spark context "
                   + "with different spark master URL. Existing: %s, requested: %s.",
               sparkMaster, options.getSparkMaster()));
     }
     return sparkContext;
   } else {
     return createSparkContext(contextOptions);
   }
 }
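 Note that Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT) reads a JVM system property, not an environment variable, so reuse is switched on per JVM. A minimal usage sketch, assuming the property name below stands in for the TEST_REUSE_SPARK_CONTEXT constant defined elsewhere in this class:
  // Hypothetical property name; the real value lives in the
  // TEST_REUSE_SPARK_CONTEXT constant.
  System.setProperty("beam.spark.test.reuseSparkContext", "true");
  JavaSparkContext first = getSparkContext(options);  // created on first call
  JavaSparkContext second = getSparkContext(options); // same instance reused while not stopped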
 private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
   if (contextOptions.getUsesProvidedSparkContext()) {
     LOG.info("Using a provided Spark Context");
     JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
     if (jsc == null || jsc.sc().isStopped()) {
       LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
       throw new RuntimeException("The provided Spark context was not created or was stopped");
     }
     return jsc;
   } else {
     LOG.info("Creating a brand new Spark Context.");
     SparkConf conf = new SparkConf();
     if (!conf.contains("spark.master")) {
       // set master if not set.
       conf.setMaster(contextOptions.getSparkMaster());
     }
     conf.setAppName(contextOptions.getAppName());
     // register immutable collections serializers because the SDK uses them.
     conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
     conf.set("spark.serializer", KryoSerializer.class.getName());
     return new JavaSparkContext(conf);
   }
 }
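 The class named in "spark.kryo.registrator" must implement Spark's KryoRegistrator interface. A minimal sketch of that contract (the class and registration below are illustrative, not Beam's BeamSparkRunnerRegistrator):
  // Illustrative only; Beam's actual registrator registers its own types.
  public static class ExampleRegistrator implements org.apache.spark.serializer.KryoRegistrator {
    @Override
    public void registerClasses(com.esotericsoftware.kryo.Kryo kryo) {
      // e.g. register a commonly serialized collection type
      kryo.register(java.util.ArrayList.class);
    }
  }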
  public static void main(String[] args) {

    // Path to the saved model results
    String pathResults = "results";

    String pathToCategories = "values.txt";
    String pathToWords = "words.txt";
    File file = new File(pathToWords);

    HashMap<Double, String> categoriesDict = new HashMap<>();
    HashMap<String, String> resultado = new HashMap<>();

    // Load the categories dictionary: each line is "<id> <label>".
    try (BufferedReader br =
        new BufferedReader(new InputStreamReader(new FileInputStream(pathToCategories)))) {
      String line;
      while ((line = br.readLine()) != null) {
        String[] words = line.split(" ");
        categoriesDict.put(Double.valueOf(words[0]), words[1]);
      }
    } catch (IOException e) {
      // FileNotFoundException is an IOException, so one catch covers both
      e.printStackTrace();
    }

    // Path where the category files are located
    String pathCategories = "src/main/resources/categoriestest/";

    // Basic application configuration
    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");

    // Create the Spark context
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), pathResults);

    HashMap<String, String> dictionary = loadDictionary();

    JavaRDD<String> fileWords = null;

    if (file.exists()) {
      JavaRDD<String> input = jsc.textFile(pathToWords);
      fileWords =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
    } else {
      System.err.println("Error: the words file " + pathToWords + " does not exist");
      System.exit(-1);
    }
    List<String> aFileWords = fileWords.collect();

    // Process each file containing the categories
    File dir = new File(pathCategories);
    for (File f : dir.listFiles()) {
      JavaRDD<String> input = jsc.textFile(f.getPath());
      JavaRDD<String> words =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
      JavaPairRDD<String, Double> wordCount = Reducer.parseWords(words, dictionary);
      List<Tuple2<String, Double>> total = wordCount.collect();
      // count once up front: each count() call launches a Spark job
      long totalWords = wordCount.count();
      List<Tuple2<String, Double>> elementsRemoved = new ArrayList<>();
      for (Tuple2<String, Double> t : total) {
        if (!t._1.equals("")) {
          elementsRemoved.add(new Tuple2<>(t._1, t._2 / totalWords));
        }
      }
      ArrayList<Tuple2<String, Double>> freqFinal = new ArrayList<>();
      for (String s : aFileWords) {
        boolean found = false;
        for (Tuple2<String, Double> t : elementsRemoved) {
          if (t._1.equals(s)) {
            found = true;
            freqFinal.add(t);
            break;
          }
        }
        if (!found) {
          freqFinal.add(new Tuple2<String, Double>(s, 0.0));
        }
      }
      double[] v = new double[freqFinal.size()];
      for (int i = 0; i < freqFinal.size(); i++) {
        Tuple2<String, Double> t = freqFinal.get(i);
        v[i] = t._2;
      }
      org.apache.spark.mllib.linalg.Vector vector = Vectors.dense(v);
      double d = model.predict(vector);
      System.out.println(categoriesDict.get(d));
      resultado.put(f.getName(), categoriesDict.get(d));
    }
    jsc.stop();
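    // The sleep below presumably gives the stopped context time to shut
    // down before the results are printed (original intent not documented).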
    try {
      Thread.sleep(2000);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    for (String key : resultado.keySet()) {
      System.out.println(key + " - " + resultado.get(key));
    }
  }
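 The test above assumes a trained model already sits in the "results" directory. A minimal training sketch under that assumption (the input path and smoothing value are illustrative; requires imports for org.apache.spark.mllib.classification.NaiveBayes, org.apache.spark.mllib.regression.LabeledPoint, and org.apache.spark.mllib.util.MLUtils):
  public static void trainAndSaveModel(JavaSparkContext jsc) {
    // Illustrative LIBSVM training file; any JavaRDD<LabeledPoint> works here.
    JavaRDD<LabeledPoint> training =
        MLUtils.loadLibSVMFile(jsc.sc(), "data/training.libsvm").toJavaRDD();
    // 1.0 is MLlib's default additive-smoothing (lambda) value.
    NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
    model.save(jsc.sc(), "results"); // the pathResults the code above loads from
  }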
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkRecommendationEngine");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load and parse the data
    String path = "data/test.csv";
    JavaRDD<String> data = sc.textFile(path);
    JavaRDD<Rating> ratings =
        data.map(
            new Function<String, Rating>() {
              public Rating call(String s) {
                String[] sarray = s.split(",");
                return new Rating(
                    Integer.parseInt(sarray[0]),
                    Integer.parseInt(sarray[1]),
                    Double.parseDouble(sarray[2]));
              }
            });

    // Build the recommendation model using ALS
    int rank = 10;
    int numIterations = 10;
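    // rank is the number of latent factors; the last argument to ALS.train
    // below (0.01) is the regularization parameter lambda.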
    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);

    // Evaluate the model on rating data
    JavaRDD<Tuple2<Object, Object>> userProducts =
        ratings.map(
            new Function<Rating, Tuple2<Object, Object>>() {
              public Tuple2<Object, Object> call(Rating r) {
                return new Tuple2<Object, Object>(r.user(), r.product());
              }
            });
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions =
        JavaPairRDD.fromJavaRDD(
            model
                .predict(JavaRDD.toRDD(userProducts))
                .toJavaRDD()
                .map(
                    new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                      public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
                        return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
                      }
                    }));
    JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
        JavaPairRDD.fromJavaRDD(
                ratings.map(
                    new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                      public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
                        return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
                      }
                    }))
            .join(predictions)
            .values();
    double MSE =
        JavaDoubleRDD.fromRDD(
                ratesAndPreds
                    .map(
                        new Function<Tuple2<Double, Double>, Object>() {
                          public Object call(Tuple2<Double, Double> pair) {
                            Double err = pair._1() - pair._2();
                            return err * err;
                          }
                        })
                    .rdd())
            .mean();
    System.out.println("Mean Squared Error = " + MSE);

    // Save and load model
    model.save(sc.sc(), "myModelPath");
    MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(sc.sc(), "myModelPath");
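    // Hedged usage sketch: the loaded model can also serve top-N
    // recommendations directly (user id 1 and count 5 are illustrative).
    Rating[] recommendations = sameModel.recommendProducts(1, 5);
    for (Rating r : recommendations) {
      System.out.println("product " + r.product() + " score " + r.rating());
    }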
  }
  public static void main(String[] args) {
    // parse the arguments
    Params params = parse(args);
    SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // configure the base classifier
    LogisticRegression classifier =
        new LogisticRegression()
            .setMaxIter(params.maxIter)
            .setTol(params.tol)
            .setFitIntercept(params.fitIntercept);

    if (params.regParam != null) {
      classifier.setRegParam(params.regParam);
    }
    if (params.elasticNetParam != null) {
      classifier.setElasticNetParam(params.elasticNetParam);
    }

    // instantiate the One Vs Rest Classifier
    OneVsRest ovr = new OneVsRest().setClassifier(classifier);

    String input = params.input;
    RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
    RDD<LabeledPoint> train;
    RDD<LabeledPoint> test;

    // compute the train/test split: if testInput is not provided, use part of the input
    String testInput = params.testInput;
    if (testInput != null) {
      train = inputData;
      // compute the number of features in the training set.
      int numFeatures = inputData.first().features().size();
      test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
    } else {
      double f = params.fracTest;
      RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[] {1 - f, f}, 12345);
      train = tmp[0];
      test = tmp[1];
    }

    // train the multiclass model
    DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
    OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());

    // score the model on test data
    DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
    DataFrame predictions = ovrModel.transform(testDataFrame.cache()).select("prediction", "label");

    // obtain metrics
    MulticlassMetrics metrics = new MulticlassMetrics(predictions);
    StructField predictionColSchema = predictions.schema().apply("prediction");
    Integer numClasses = (Integer) MetadataUtils.getNumClasses(predictionColSchema).get();

    // compute the false positive rate per label
    StringBuilder results = new StringBuilder();
    results.append("label\tfpr\n");
    for (int label = 0; label < numClasses; label++) {
      results.append(label);
      results.append("\t");
      results.append(metrics.falsePositiveRate((double) label));
      results.append("\n");
    }

    Matrix confusionMatrix = metrics.confusionMatrix();
    // output the Confusion Matrix
    System.out.println("Confusion Matrix");
    System.out.println(confusionMatrix);
    System.out.println();
    System.out.println(results);

    jsc.stop();
  }
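 The parse(args) helper and the Params holder are defined elsewhere in the example. A minimal holder consistent with the fields referenced above (all defaults shown are illustrative assumptions, not the example's actual values):
  private static class Params {
    String input;                  // training data path
    String testInput = null;       // optional separate test set
    double fracTest = 0.2;         // held-out fraction when testInput is absent
    int maxIter = 100;
    double tol = 1E-6;
    boolean fitIntercept = true;
    Double regParam = null;        // boxed so "unset" can be represented as null
    Double elasticNetParam = null; // likewise
  }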