public static void main(String[] args) {
  if (args.length == 0) {
    System.err.println("Usage: JavaTC <host> [<slices>]");
    System.exit(1);
  }

  JavaSparkContext sc = new JavaSparkContext(
      args[0], "JavaTC", System.getenv("SPARK_HOME"),
      JavaSparkContext.jarOfClass(JavaTC.class));
  int slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
  JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
      new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
          return new Tuple2<Integer, Integer>(e._2(), e._1());
        }
      });

  long oldCount = 0;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  System.exit(0);
}
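// The method above references two helpers it does not define. A minimal sketch, assuming
// generateGraph() simply produces a small random edge list and ProjectFn turns the join
// output (y, (z, x)) into the new path (x, z) as described in the comments; the sizes and
// the random seed are illustrative assumptions, and the usual imports (java.util.*,
// scala.Tuple2, org.apache.spark.api.java.function.PairFunction) are presumed present.

static List<Tuple2<Integer, Integer>> generateGraph() {
  Random rand = new Random(42);
  Set<Tuple2<Integer, Integer>> edges = new HashSet<Tuple2<Integer, Integer>>();
  // Keep adding random (from, to) pairs until we have 200 distinct edges.
  while (edges.size() < 200) {
    int from = rand.nextInt(100);
    int to = rand.nextInt(100);
    if (from != to) {
      edges.add(new Tuple2<Integer, Integer>(from, to));
    }
  }
  return new ArrayList<Tuple2<Integer, Integer>>(edges);
}

static final class ProjectFn
    implements PairFunction<Tuple2<Integer, Tuple2<Integer, Integer>>, Integer, Integer> {
  static final ProjectFn INSTANCE = new ProjectFn();

  @Override
  public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, Integer>> triple) {
    // Project (y, (z, x)) down to the new path (x, z).
    return new Tuple2<Integer, Integer>(triple._2()._2(), triple._2()._1());
  }
}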
/**
 * Creates an analyser for theme life cycles.
 *
 * @param hmmInput The parsed input whose background model, lexicon, and word stream are used.
 *     Themes must be added before any analytics can be done.
 */
public LifeCycleAnalyserSpark(HmmInputFromParser hmmInput) {
  this.wordStream = hmmInput.wordStream;
  this.lexicon = hmmInput.lexicon;
  this.lexiconAsMap = lexicon.collectAsMap();
  getInvertedLexicon();
  numberOfThemes = 0L;
  numberOfWords = lexicon.count();
  // themes = new ArrayList<double[]>();
  setBackgroundModelAsThemebyId(hmmInput.backgroundModelById);
}
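// A hypothetical sketch of the getInvertedLexicon() helper called in the constructor above,
// assuming the lexicon maps words to numeric ids and the inverted lexicon is the id -> word
// map used to translate model output back into words. The field name invertedLexicon and the
// key/value types are assumptions for illustration, not taken from the original class.
private void getInvertedLexicon() {
  this.invertedLexicon = new HashMap<Long, String>();
  // Flip each (word, id) entry of the collected lexicon into an (id, word) entry.
  for (Map.Entry<String, Long> entry : lexiconAsMap.entrySet()) {
    invertedLexicon.put(entry.getValue(), entry.getKey());
  }
}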
public static void main(String[] args) {
  SparkConf sparkconf = new SparkConf()
      .setAppName("Simple Application")
      .setMaster("spark://1.245.77.10:7077")
      .set("spark.driver.extraClassPath",
          "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
      .set("spark.executor.extraClassPath",
          "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
      .set("fs.default.name", "file:///");
  JavaSparkContext sc = new JavaSparkContext(sparkconf);

  Configuration hadoopConfig = sc.hadoopConfiguration();
  hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
  hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

  // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
  // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

  /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
  System.out.println("Start counting parallelize...");
  long values = matrdd2.count();
  System.out.println("Value count of parallelize is " + values);*/

  JavaPairRDD<Long, Double> matrdd = sc.newAPIHadoopFile(
      "e:/tmp/vecRow03_x256.mat",
      JMATFileInputFormat.class,
      Long.class,
      Double.class,
      hadoopConfig);

  System.out.println("Start job...");
  long values = matrdd.count();
  System.out.println("Value count of hadoop is " + values);

  sc.stop();
  sc.close();
}
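// A small follow-up sketch, not part of the original job: once the .mat file has been loaded
// as (index, value) pairs, the first few entries can be inspected with plain Spark calls.
// Only the matrdd variable from above is assumed; this snippet would sit before sc.stop().
for (Tuple2<Long, Double> entry : matrdd.take(5)) {
  System.out.println("index " + entry._1() + " -> value " + entry._2());
}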
/**
 * A utility method to generate a class classification model summary.
 *
 * @param sparkContext Spark context used to parallelize the sampled results
 * @param testingData Test data containing the feature vectors
 * @param predictionsAndLabels Predictions and actual labels
 * @return Class classification model summary
 */
public static ClassClassificationAndRegressionModelSummary getClassClassificationModelSummary(
    JavaSparkContext sparkContext,
    JavaRDD<LabeledPoint> testingData,
    JavaPairRDD<Double, Double> predictionsAndLabels) {
  ClassClassificationAndRegressionModelSummary classClassificationModelSummary =
      new ClassClassificationAndRegressionModelSummary();

  // store predictions and actuals
  List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>();
  for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) {
    PredictedVsActual predictedVsActual = new PredictedVsActual();
    predictedVsActual.setPredicted(scoreAndLabel._1());
    predictedVsActual.setActual(scoreAndLabel._2());
    predictedVsActuals.add(predictedVsActual);
  }

  // create a list of feature values
  List<double[]> features = new ArrayList<double[]>();
  for (LabeledPoint labeledPoint : testingData.collect()) {
    if (labeledPoint != null && labeledPoint.features() != null) {
      double[] rowFeatures = labeledPoint.features().toArray();
      features.add(rowFeatures);
    }
  }

  // create a list of feature values with predicted vs. actuals
  List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>();
  for (int i = 0; i < features.size(); i++) {
    TestResultDataPoint testResultDataPoint = new TestResultDataPoint();
    testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i));
    testResultDataPoint.setFeatureValues(features.get(i));
    testResultDataPoints.add(testResultDataPoint);
  }

  // convert the List to a JavaRDD
  JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD =
      sparkContext.parallelize(testResultDataPoints);

  // collect the RDD as a sampled list
  List<TestResultDataPoint> testResultDataPointsSample;
  if (testResultDataPointsJavaRDD.count()
      > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) {
    testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample(
        true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize());
  } else {
    testResultDataPointsSample = testResultDataPointsJavaRDD.collect();
  }
  classClassificationModelSummary.setTestResultDataPointsSample(testResultDataPointsSample);
  classClassificationModelSummary.setPredictedVsActuals(predictedVsActuals);

  // calculate test error: the fraction of predictions that do not match the actual label
  double error = 1.0 * predictionsAndLabels
      .filter(
          new Function<Tuple2<Double, Double>, Boolean>() {
            private static final long serialVersionUID = -3063364114286182333L;

            @Override
            public Boolean call(Tuple2<Double, Double> pl) {
              return !pl._1().equals(pl._2());
            }
          })
      .count()
      / predictionsAndLabels.count();
  classClassificationModelSummary.setError(error);

  return classClassificationModelSummary;
}
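// A hypothetical sketch of how the predictionsAndLabels argument is typically built before
// calling the summary method above: score each test point with a trained MLlib model and pair
// the prediction with the known label. The trainingData variable, the choice of NaiveBayes,
// and the surrounding sparkContext/testingData variables are assumptions for illustration;
// any MLlib classifier exposing predict(Vector) would fit the same pattern.
final NaiveBayesModel model = NaiveBayes.train(trainingData.rdd());
JavaPairRDD<Double, Double> predictionsAndLabels = testingData.mapToPair(
    new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint point) {
        // Pair the model's prediction with the actual label for this test point.
        return new Tuple2<Double, Double>(model.predict(point.features()), point.label());
      }
    });
ClassClassificationAndRegressionModelSummary summary =
    getClassClassificationModelSummary(sparkContext, testingData, predictionsAndLabels);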
public static void main(String[] args) {
  // Path to the results (the saved model)
  String pathResults = "results";
  String pathToCategories = "values.txt";
  String pathToWords = "words.txt";
  File file = new File(pathToWords);

  HashMap<Double, String> categoriesDict = new HashMap<>();
  HashMap<String, String> resultado = new HashMap<>();

  FileInputStream fis = null;
  try {
    fis = new FileInputStream(pathToCategories);
    // Construct BufferedReader from InputStreamReader
    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
    String line = null;
    while ((line = br.readLine()) != null) {
      String[] words = line.split(" ");
      categoriesDict.put(Double.valueOf(words[0]), words[1]);
    }
    br.close();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }

  // Path where the category files are located
  String pathCategories = "src/main/resources/categoriestest/";

  // Basic application configuration
  SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");
  // Create the context
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), pathResults);
  HashMap<String, String> dictionary = loadDictionary();

  JavaRDD<String> fileWords = null;
  if (file.exists()) {
    JavaRDD<String> input = jsc.textFile(pathToWords);
    fileWords = input.flatMap(
        new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String s) throws Exception {
            return Arrays.asList(s.split(" "));
          }
        });
  } else {
    System.out.println("Error: the words file does not exist");
    System.exit(-1);
  }
  ArrayList<String> aFileWords = (ArrayList<String>) fileWords.collect();

  // Read each file in the directory that contains the categories
  File dir = new File(pathCategories);
  for (File f : dir.listFiles()) {
    JavaRDD<String> input = jsc.textFile(f.getPath());
    JavaRDD<String> words = input.flatMap(
        new FlatMapFunction<String, String>() {
          @Override
          public Iterable<String> call(String s) throws Exception {
            return Arrays.asList(s.split(" "));
          }
        });
    JavaPairRDD<String, Double> wordCount = Reducer.parseWords(words, dictionary);
    List<Tuple2<String, Double>> total = wordCount.collect();
    List<Tuple2<String, Double>> elementsRemoved = new ArrayList<>();
    for (Tuple2<String, Double> t : total) {
      if (!t._1.equals("")) {
        elementsRemoved.add(new Tuple2<>(t._1, t._2 / wordCount.count()));
      }
    }
    ArrayList<Tuple2<String, Double>> freqFinal = new ArrayList<>();
    for (String s : aFileWords) {
      boolean found = false;
      for (Tuple2<String, Double> t : elementsRemoved) {
        if (t._1.equals(s)) {
          found = true;
          freqFinal.add(t);
          break;
        }
      }
      if (!found) {
        freqFinal.add(new Tuple2<String, Double>(s, 0.0));
      }
    }
    double[] v = new double[freqFinal.size()];
    for (int i = 0; i < freqFinal.size(); i++) {
      Tuple2<String, Double> t = freqFinal.get(i);
      v[i] = t._2;
    }
    org.apache.spark.mllib.linalg.Vector vector = Vectors.dense(v);
    double d = model.predict(vector);
    System.out.println(categoriesDict.get(d));
    resultado.put(f.getName(), categoriesDict.get(d));
  }

  jsc.stop();

  try {
    Thread.sleep(2000);
  } catch (InterruptedException e) {
    e.printStackTrace();
  }

  for (String key : resultado.keySet()) {
    System.out.println(key + " - " + resultado.get(key));
  }
}
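// A hypothetical sketch of the loadDictionary() helper used above. The file name and its
// two-column "key value" layout are assumptions for illustration; the real project may load
// the dictionary differently. It only needs to return the word mapping that
// Reducer.parseWords expects.
private static HashMap<String, String> loadDictionary() {
  HashMap<String, String> dictionary = new HashMap<>();
  try (BufferedReader br = new BufferedReader(new FileReader("dictionary.txt"))) {
    String line;
    while ((line = br.readLine()) != null) {
      String[] parts = line.split(" ");
      if (parts.length >= 2) {
        // First column is the raw word, second column is the normalized form.
        dictionary.put(parts[0], parts[1]);
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return dictionary;
}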