public static void main(String[] args) {
  if (args.length == 0) {
    System.err.println("Usage: JavaTC <host> [<slices>]");
    System.exit(1);
  }

  JavaSparkContext sc = new JavaSparkContext(
      args[0], "JavaTC", System.getenv("SPARK_HOME"),
      JavaSparkContext.jarOfClass(JavaTC.class));
  Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
  JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
      new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
          return new Tuple2<Integer, Integer>(e._2(), e._1());
        }
      });

  long oldCount = 0;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  System.exit(0);
}
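// The transitive-closure main method above relies on a generateGraph() helper and a
// ProjectFn singleton that are not shown in this listing. The sketch below is one
// plausible shape for them (the random seed, edge count and vertex count are
// assumptions, not the original values); it uses java.util collections, scala.Tuple2
// and org.apache.spark.api.java.function.PairFunction. generateGraph() builds a random
// edge list, and ProjectFn turns a joined (y, (z, x)) pair into the new (x, z) path.
static List<Tuple2<Integer, Integer>> generateGraph() {
  Random rand = new Random(42);
  int numEdges = 200;
  int numVertices = 100;
  Set<Tuple2<Integer, Integer>> edges = new HashSet<Tuple2<Integer, Integer>>(numEdges);
  while (edges.size() < numEdges) {
    int from = rand.nextInt(numVertices);
    int to = rand.nextInt(numVertices);
    if (from != to) {
      edges.add(new Tuple2<Integer, Integer>(from, to));
    }
  }
  return new ArrayList<Tuple2<Integer, Integer>>(edges);
}

static final class ProjectFn
    implements PairFunction<Tuple2<Integer, Tuple2<Integer, Integer>>, Integer, Integer> {
  static final ProjectFn INSTANCE = new ProjectFn();

  @Override
  public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, Integer>> triple) {
    // Input is (y, (z, x)); project to the new path (x, z).
    return new Tuple2<Integer, Integer>(triple._2()._2(), triple._2()._1());
  }
}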
@Override
public void publishAdditionalModelData(
    JavaSparkContext sparkContext,
    PMML pmml,
    JavaRDD<String> newData,
    JavaRDD<String> pastData,
    Path modelParentPath,
    QueueProducer<String, String> modelUpdateQueue) {

  JavaRDD<String> allData = pastData == null ? newData : newData.union(pastData);

  log.info("Sending user / X data as model updates");
  String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<Integer, double[]> userRDD =
      fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString)));

  if (noKnownItems) {
    userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateQueue));
  } else {
    log.info("Sending known item data with model updates");
    JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true);
    userRDD
        .join(knownItems)
        .foreach(new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateQueue));
  }

  log.info("Sending item / Y data as model updates");
  String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<Integer, double[]> productRDD =
      fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString)));

  // For now, there is no use in sending known users for each item
  // if (noKnownItems) {
  productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateQueue));
  // } else {
  //   log.info("Sending known user data with model updates");
  //   JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false);
  //   productRDD.join(knownUsers).foreach(
  //       new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateQueue));
  // }
}
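// The fromRDD(...) calls above assume a small adapter that wraps the Scala RDD returned by
// readFeaturesRDD into a JavaPairRDD. A minimal sketch, assuming readFeaturesRDD returns an
// org.apache.spark.rdd.RDD<Tuple2<Integer, double[]>>; the helper name matches the usage
// above, but the body is a guess rather than the original implementation.
private static JavaPairRDD<Integer, double[]> fromRDD(RDD<Tuple2<Integer, double[]>> rdd) {
  return JavaPairRDD.fromRDD(
      rdd,
      scala.reflect.ClassTag$.MODULE$.apply(Integer.class),
      scala.reflect.ClassTag$.MODULE$.apply(double[].class));
}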
public static void main(String[] args) throws IOException {
  Parameters param = new Parameters();
  long initTime = System.currentTimeMillis();

  SparkConf conf = new SparkConf().setAppName("StarJoin");
  // Kryo must be configured on the SparkConf before the context is created,
  // otherwise the settings have no effect.
  if (param.useKryo) {
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
    conf.set("spark.kryoserializer.buffer.mb", param.buffer);
  }
  JavaSparkContext sc = new JavaSparkContext(conf);

  MyBloomFilter.BloomFilter<String> BFS =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsS, param.hashes);
  MyBloomFilter.BloomFilter<String> BFD =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsD, param.hashes);
  MyBloomFilter.BloomFilter<String> BFC =
      new MyBloomFilter.BloomFilter<String>(1.0, param.bitsC, param.hashes);

  JavaPairRDD<String, String> supps = sc.textFile(param.suppPath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[3]);
            }
          });

  List<Tuple2<String, String>> s = supps.collect();
  for (int i = 0; i < s.size(); i++) {
    BFS.add(s.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

  JavaPairRDD<String, String> custs = sc.textFile(param.custPath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[3]);
            }
          });

  List<Tuple2<String, String>> c = custs.collect();
  for (int i = 0; i < c.size(); i++) {
    BFC.add(c.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

  JavaPairRDD<String, String> dates = sc.textFile(param.datePath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[6].equals("Dec1997");
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[4]);
            }
          });

  List<Tuple2<String, String>> d = dates.collect();
  for (int i = 0; i < d.size(); i++) {
    BFD.add(d.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

  JavaPairRDD<String, String[]> lines = sc.textFile(param.linePath)
      .map(
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          })
      .filter(
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return varC.value().contains(s[2].getBytes())
                  && varS.value().contains(s[4].getBytes())
                  && varD.value().contains(s[5].getBytes());
            }
          })
      .mapToPair(
          new PairFunction<String[], String, String[]>() {
            public Tuple2<String, String[]> call(String[] s) {
              String[] v = {s[2], s[5], s[12]};
              return new Tuple2<String, String[]>(s[4], v);
            }
          });

  JavaPairRDD<String, String[]> result = lines
      .join(supps)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
            public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
              String[] v = {s._2._1[1], s._2._1[2], s._2._2};
              return new Tuple2<String, String[]>(s._2._1[0], v);
            }
          });

  result = result
      .join(custs)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
            public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
              String[] v = {s._2._1[1], s._2._1[2], s._2._2};
              return new Tuple2<String, String[]>(s._2._1[0], v);
            }
          });

  JavaPairRDD<String, Long> final_result = result
      .join(dates)
      .mapToPair(
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
            public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
              return new Tuple2<String, Long>(
                  s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
            }
          })
      .reduceByKey(
          new Function2<Long, Long, Long>() {
            public Long call(Long i1, Long i2) {
              return i1 + i2;
            }
          });

  JavaPairRDD<String, String> sub_result = final_result.mapToPair(
      new PairFunction<Tuple2<String, Long>, String, String>() {
        public Tuple2<String, String> call(Tuple2<String, Long> line) {
          return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
        }
      });

  final_result = sub_result
      .sortByKey(new Q3Comparator())
      .mapToPair(
          new PairFunction<Tuple2<String, String>, String, Long>() {
            public Tuple2<String, Long> call(Tuple2<String, String> line) {
              String[] s = line._1.split(",");
              return new Tuple2<String, Long>(
                  s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
            }
          });

  Configuration HDFSconf = new Configuration();
  FileSystem fs = FileSystem.get(HDFSconf);
  fs.delete(new Path(param.output), true);

  final_result.saveAsTextFile(param.output);

  long finalTime = System.currentTimeMillis();
  System.out.print("Total time (ms): ");
  System.out.println(finalTime - initTime);
  sc.close();
}
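// The star-join above also references a Parameters holder and a Q3Comparator that are not part
// of this listing. The sketch below shows one plausible shape for them; the field set is taken
// from the usage above, but how the values are populated, and the ordering used by the
// comparator (revenue descending, which is what SSB Q3 asks for), are assumptions.
static class Parameters implements Serializable {
  // Input locations and tuning knobs; in the original these are presumably read
  // from a config file or the command line.
  String suppPath, custPath, datePath, linePath, output;
  boolean useKryo;
  String buffer;            // Kryo serializer buffer size in MB
  int hashes;               // number of hash functions per Bloom filter
  int bitsS, bitsD, bitsC;  // Bloom filter sizes for supplier, date, customer keys
}

static class Q3Comparator implements Comparator<String>, Serializable {
  @Override
  public int compare(String a, String b) {
    // Keys look like "c_city,s_city,d_yearmonth,revenue"; order by revenue, highest first.
    long revA = Long.parseLong(a.substring(a.lastIndexOf(',') + 1));
    long revB = Long.parseLong(b.substring(b.lastIndexOf(',') + 1));
    return Long.compare(revB, revA);
  }
}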
/**
 * Produce, train and decode the Hmm. Must be done after adding themes and before calculating
 * any strengths.
 *
 * @param sc the spark context
 * @param piThreshold threshold on pi
 * @param aaThreshold threshold on a
 * @param maxIterations max number of iterations
 */
public void analyse(
    JavaSparkContext sc, double piThreshold, double aaThreshold, int maxIterations) {
  int numberHiddenStates = (int) (numberOfThemes + 1);
  int numberObservableOutputSymbols = (int) numberOfWords;

  // Setting up the initial state probability distribution.
  double[] pi = new double[numberHiddenStates];
  double initialStateDistribution = 1.0 / numberHiddenStates;
  for (int i = 0; i < numberHiddenStates; i++) {
    pi[i] = initialStateDistribution;
  }

  // Setting up the state transition probability distribution
  // (a square matrix over the hidden states).
  double[][] stateTransitionProbabilityDistribution =
      new double[numberHiddenStates][numberHiddenStates];
  double halfInitialStateDistribution = initialStateDistribution / 2.0;
  for (int i = 0; i < numberHiddenStates; i++) {
    for (int j = 0; j < numberHiddenStates; j++) {
      if (i == 0 && j == 0) {
        // .5 chance to stay in the background model
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else if (i == 0) {
        // equal chance to go from the background model to any other state
        stateTransitionProbabilityDistribution[i][j] = halfInitialStateDistribution;
      } else if (j == 0) {
        // .5 chance to return to the background model
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else if (i == j) {
        // .5 chance to stay in the same state
        stateTransitionProbabilityDistribution[i][j] = 0.5;
      } else {
        // no chance to jump between two different theme states
        stateTransitionProbabilityDistribution[i][j] = 0.0;
      }
    }
  }

  if (outputProbabilityDistribution == null) {
    System.out.println(
        "error : you need to specify the themes via addAllThemesFromRDD"
            + " before analyzing the sequence!");
    return;
  }

  // Setting the output probability distribution: state 0 is the background model.
  outputProbabilityDistribution[0] = bgAsArray;

  // Setting up and training the hmm.
  hmm = new Hmm2(
      numberHiddenStates,
      numberObservableOutputSymbols,
      pi,
      stateTransitionProbabilityDistribution,
      outputProbabilityDistribution);

  JavaPairRDD<Tuple2<Long, Long>, Long> wordStreamZippedWithIndex = wordStream.zipWithIndex();
  JavaRDD<Tuple2<Integer, Integer>> observedSequenceRdd = wordStreamZippedWithIndex.map(
      new Function<Tuple2<Tuple2<Long, Long>, Long>, Tuple2<Integer, Integer>>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<Integer, Integer> call(Tuple2<Tuple2<Long, Long>, Long> wordEntry)
            throws Exception {
          return new Tuple2<Integer, Integer>(
              wordEntry._2.intValue(), wordEntry._1._1.intValue());
        }
      });

  System.out.println("observedSequenceRdd length : " + observedSequenceRdd.count());
  System.out.println(
      "observedSequenceRdd : "
          + Arrays.toString(Arrays.copyOf(observedSequenceRdd.collect().toArray(), 50)));

  hmm.rawTrain(sc, observedSequenceRdd, piThreshold, aaThreshold, maxIterations);
  mostLikelySequenceThemeShifts = hmm.decode(sc, observedSequenceRdd);

  JavaPairRDD<Integer, Tuple2<Long, Long>> wordStreamZippedWithIndexReversed =
      wordStreamZippedWithIndex.mapToPair(
          new PairFunction<Tuple2<Tuple2<Long, Long>, Long>, Integer, Tuple2<Long, Long>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Integer, Tuple2<Long, Long>> call(
                Tuple2<Tuple2<Long, Long>, Long> arg0) throws Exception {
              return new Tuple2<Integer, Tuple2<Long, Long>>(arg0._2.intValue(), arg0._1);
            }
          });

  JavaPairRDD<Integer, Tuple2<Integer, Tuple2<Long, Long>>> zippedDecodedSequence =
      mostLikelySequenceThemeShifts.join(wordStreamZippedWithIndexReversed);

  JavaRDD<Tuple2<Long, Integer>> nonZeroMostLikelyByTimestamp = zippedDecodedSequence.flatMap(
      new FlatMapFunction<
          Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>>, Tuple2<Long, Integer>>() {

        @Override
        public Iterable<Tuple2<Long, Integer>> call(
            Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>> arg0) throws Exception {
          ArrayList<Tuple2<Long, Integer>> list = new ArrayList<Tuple2<Long, Integer>>(1);
          if (arg0._2._1 != 0) {
            list.add(new Tuple2<Long, Integer>(arg0._2._2._2, arg0._2._1));
          }
          return list;
        }
      });

  JavaPairRDD<Long, Iterable<Tuple2<Long, Integer>>> groupedRdd =
      nonZeroMostLikelyByTimestamp.groupBy(
          new Function<Tuple2<Long, Integer>, Long>() {
            @Override
            public Long call(Tuple2<Long, Integer> arg0) throws Exception {
              return arg0._1;
            }
          });

  List<Long> timestampsList = groupedRdd.keys().collect();
  Collections.sort(timestampsList);

  JavaPairRDD<Long, Map<Integer, Integer>> resultByTimestamp = groupedRdd.mapValues(
      new Function<Iterable<Tuple2<Long, Integer>>, Map<Integer, Integer>>() {
        @Override
        public Map<Integer, Integer> call(Iterable<Tuple2<Long, Integer>> arg0)
            throws Exception {
          Map<Integer, Integer> countMap = new HashMap<Integer, Integer>();
          for (Tuple2<Long, Integer> tuple : arg0) {
            if (countMap.containsKey(tuple._2)) {
              countMap.put(tuple._2, countMap.get(tuple._2) + 1);
            } else {
              countMap.put(tuple._2, 1);
            }
          }
          return countMap;
        }
      });

  List<Tuple2<Long, Map<Integer, Integer>>> collectedResults = resultByTimestamp.collect();
  Collections.sort(
      collectedResults,
      new Comparator<Tuple2<Long, Map<Integer, Integer>>>() {
        @Override
        public int compare(
            Tuple2<Long, Map<Integer, Integer>> index1,
            Tuple2<Long, Map<Integer, Integer>> index2) {
          return index1._1.compareTo(index2._1);
        }
      });

  // Printing in the appropriate format for the csv file.
  for (int themeIndex = 1; themeIndex < numberOfThemes; themeIndex++) {
    System.out.println("Theme " + (themeIndex - 1));
    Iterator<Tuple2<Long, Map<Integer, Integer>>> resultsIterator = collectedResults.iterator();
    Iterator<Long> timestampsIterator = timestampsList.iterator();
    Tuple2<Long, Map<Integer, Integer>> currentTuple = resultsIterator.next();
    while (timestampsIterator.hasNext()) {
      long timestamp = timestampsIterator.next();
      if (currentTuple != null && timestamp == currentTuple._1) {
        int strength =
            currentTuple._2().get(themeIndex) == null ? 0 : currentTuple._2().get(themeIndex);
        System.out.println(timestamp + "," + strength);
        if (resultsIterator.hasNext()) {
          currentTuple = resultsIterator.next();
        } else {
          currentTuple = null;
        }
      } else {
        System.out.println(timestamp + ",0");
      }
    }
  }

  // Old way to do it.
  int timeDuration = collectedResults.size();
  int[][] themesStrengths = new int[(int) numberOfThemes][timeDuration];
  for (int timeIndex = 0; timeIndex < timeDuration; timeIndex++) {
    Tuple2<Long, Map<Integer, Integer>> tuple = collectedResults.get(timeIndex);
    for (Entry<Integer, Integer> entry : tuple._2.entrySet()) {
      themesStrengths[entry.getKey() - 1][timeIndex] = entry.getValue();
    }
  }

  System.out.println("");
  for (int i = 0; i < numberOfThemes; i++) {
    System.out.print("themeStrength_" + i + " = [");
    for (int j = 0; j < timeDuration; j++) {
      System.out.print(" " + themesStrengths[i][j]);
    }
    System.out.println("];");
  }
}
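// The printing loop above emits each theme's "timestamp,strength" series to stdout in CSV
// form. A small hypothetical helper like the one below could write one such series straight
// to a file instead; the method name, parameters and one-file-per-theme layout are
// assumptions, not part of the original class. Requires java.io.File, java.io.FileWriter,
// java.io.PrintWriter and java.io.IOException.
private static void writeThemeStrengthCsv(
    File outputFile, List<Long> timestamps, Map<Long, Integer> strengthByTimestamp)
    throws IOException {
  PrintWriter writer = new PrintWriter(new FileWriter(outputFile));
  try {
    writer.println("timestamp,strength");
    for (Long timestamp : timestamps) {
      Integer strength = strengthByTimestamp.get(timestamp);
      // Timestamps with no decoded theme occurrence get a strength of 0,
      // mirroring the "timestamp,0" branch of the printing loop above.
      writer.println(timestamp + "," + (strength == null ? 0 : strength));
    }
  } finally {
    writer.close();
  }
}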