public static void main(String[] args) throws IOException {
    Parameters param = new Parameters();
    long initTime = System.currentTimeMillis();

    // Kryo must be configured on the SparkConf before the context is created,
    // otherwise the settings are silently ignored.
    SparkConf conf = new SparkConf().setAppName("StarJoin");
    if (param.useKryo) {
      conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
      conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
      conf.set("spark.kryoserializer.buffer.mb", param.buffer);
    }
    JavaSparkContext sc = new JavaSparkContext(conf);

    // One Bloom filter per dimension table (supplier, date, customer).
    MyBloomFilter.BloomFilter<String> BFS =
        new MyBloomFilter.BloomFilter<String>(1.0, param.bitsS, param.hashes);
    MyBloomFilter.BloomFilter<String> BFD =
        new MyBloomFilter.BloomFilter<String>(1.0, param.bitsD, param.hashes);
    MyBloomFilter.BloomFilter<String> BFC =
        new MyBloomFilter.BloomFilter<String>(1.0, param.bitsC, param.hashes);

    // Supplier dimension: keep suppliers from the two cities and map to (s_suppkey, s_city).
    JavaPairRDD<String, String> supps =
        sc.textFile(param.suppPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> s = supps.collect();
    for (int i = 0; i < s.size(); i++) {
      BFS.add(s.get(i)._1);
    }
    final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

    // Customer dimension: same city filter, mapped to (c_custkey, c_city).
    JavaPairRDD<String, String> custs =
        sc.textFile(param.custPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> c = custs.collect();
    for (int i = 0; i < c.size(); i++) {
      BFC.add(c.get(i)._1);
    }
    final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

    // Date dimension: keep December 1997 and map to (d_datekey, d_year).
    JavaPairRDD<String, String> dates =
        sc.textFile(param.datePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[6].equals("Dec1997");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[4]);
                  }
                });

    List<Tuple2<String, String>> d = dates.collect();
    for (int i = 0; i < d.size(); i++) {
      BFD.add(d.get(i)._1);
    }
    final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

    // Fact table: drop rows whose foreign keys are not in the broadcast Bloom filters,
    // then map to (lo_suppkey, {lo_custkey, lo_orderdate, lo_revenue}).
    JavaPairRDD<String, String[]> lines =
        sc.textFile(param.linePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return varC.value().contains(s[2].getBytes())
                        && varS.value().contains(s[4].getBytes())
                        && varD.value().contains(s[5].getBytes());
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String[]>() {
                  public Tuple2<String, String[]> call(String[] s) {
                    String[] v = {s[2], s[5], s[12]};
                    return new Tuple2<String, String[]>(s[4], v);
                  }
                });

    // Join with the supplier dimension, re-keying by customer key.
    JavaPairRDD<String, String[]> result =
        lines
            .join(supps)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    // Join with the customer dimension, re-keying by date key.
    result =
        result
            .join(custs)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    // Join with the date dimension, group by (c_city, s_city, d_year) and sum the revenue.
    JavaPairRDD<String, Long> final_result =
        result
            .join(dates)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                    return new Tuple2<String, Long>(
                        s._2._1[2] + "," + s._2._1[1] + "," + s._2._2,
                        Long.parseLong(s._2._1[0]));
                  }
                })
            .reduceByKey(
                new Function2<Long, Long, Long>() {
                  public Long call(Long i1, Long i2) {
                    return i1 + i2;
                  }
                });

    // Fold the revenue into the key so the rows can be ordered with Q3Comparator,
    // then split the composite key back into (group, revenue).
    JavaPairRDD<String, String> sub_result =
        final_result.mapToPair(
            new PairFunction<Tuple2<String, Long>, String, String>() {
              public Tuple2<String, String> call(Tuple2<String, Long> line) {
                return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
              }
            });

    final_result =
        sub_result
            .sortByKey(new Q3Comparator())
            .mapToPair(
                new PairFunction<Tuple2<String, String>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, String> line) {
                    String[] s = line._1.split(",");
                    return new Tuple2<String, Long>(
                        s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                  }
                });

    // Remove a previous output directory, if any, before writing the result.
    Configuration HDFSconf = new Configuration();
    FileSystem fs = FileSystem.get(HDFSconf);
    fs.delete(new Path(param.output), true);

    final_result.saveAsTextFile(param.output);

    long finalTime = System.currentTimeMillis();
    System.out.print("Total time (ms): ");
    System.out.println(finalTime - initTime);

    sc.close();
  }
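  // Q3Comparator is referenced above but not defined in this file. The following is a
  // minimal sketch, assuming the composite keys have the form "c_city,s_city,d_year,revenue"
  // and are ordered by year ascending and revenue descending, as in SSB Q3; the actual
  // implementation may differ.
  static class Q3Comparator implements java.util.Comparator<String>, java.io.Serializable {
    public int compare(String a, String b) {
      String[] fa = a.split(",");
      String[] fb = b.split(",");
      // Compare the year (third field) ascending.
      int byYear = fa[2].compareTo(fb[2]);
      if (byYear != 0) {
        return byYear;
      }
      // Then compare the revenue (fourth field) descending.
      return Long.compare(Long.parseLong(fb[3]), Long.parseLong(fa[3]));
    }
  }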
public static void main(String[] args) throws FileNotFoundException {
    if (args.length < 2) {
      System.out.println(
          "We require input file path, output file path and number of partitions argument to proceed further.");
      System.out.println(
          "Usage: java FarthestPair <input file path> <output file path> <noOfPartitions>");
      System.exit(0);
    }

    String inputFile = args[0];
    SparkConf conf = new SparkConf().setAppName("Group6-FarthestPair");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read the input file as an RDD of lines
    JavaRDD<String> inputData = sc.textFile(inputFile);

    // Map each line of the file to a Coordinate object
    JavaRDD<Coordinate> coordinates = inputData.map(parseData);

    // Map to (Coordinate, Boolean) tuples so the points can be sorted by key
    JavaPairRDD<Coordinate, Boolean> pointTupleRDD =
        coordinates.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedPointTupleRDD =
        pointTupleRDD.sortByKey(new CoordinateComparator());

    // Map back to an RDD of Coordinates
    JavaRDD<Coordinate> finalSortedPointRDD =
        sortedPointTupleRDD.map(new TupleToCoordinateMapFunction());

    // Compute a convex hull on each partition
    JavaRDD<Coordinate> localHull = finalSortedPointRDD.mapPartitions(new hull());

    // Coalesce to a single partition so the global convex hull can be computed over
    // all Coordinate objects obtained from the individual partitions
    JavaRDD<Coordinate> calculatedHull = localHull.coalesce(1).cache();

    // Compute the global convex hull
    JavaRDD<Coordinate> globalHull = calculatedHull.mapPartitions(new hull()).distinct();

    // Candidate pairs: the cartesian product of the hull points with themselves
    JavaPairRDD<Coordinate, Coordinate> allCoordinateTuples = globalHull.cartesian(globalHull);
    System.out.println("Total cart: " + allCoordinateTuples.count());

    // Build Pair objects from the candidate tuples (element-wise version, used only
    // for the partition-count debug output below)
    JavaRDD<Pair> pairsRDD =
        allCoordinateTuples.map(
            new Function<Tuple2<Coordinate, Coordinate>, Pair>() {
              public Pair call(Tuple2<Coordinate, Coordinate> tuple) throws Exception {
                return new Pair(tuple._1(), tuple._2());
              }
            });

    // Build Pair objects from the candidate tuples, one partition at a time
    JavaRDD<Pair> pairs =
        allCoordinateTuples.mapPartitions(
            new FlatMapFunction<Iterator<Tuple2<Coordinate, Coordinate>>, Pair>() {
              private static final long serialVersionUID = 1L;

              public Iterable<Pair> call(Iterator<Tuple2<Coordinate, Coordinate>> tuples)
                  throws Exception {
                List<Pair> pairsFromTuples = new ArrayList<Pair>();
                while (tuples.hasNext()) {
                  Tuple2<Coordinate, Coordinate> tuple = tuples.next();
                  pairsFromTuples.add(new Pair(tuple._1(), tuple._2()));
                }
                return pairsFromTuples;
              }
            });

    // Debug output: emit one element per partition to count the partitions of each RDD
    JavaRDD<Integer> x =
        pairsRDD.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {
              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> ones = new ArrayList<Integer>();
                ones.add(1);
                return ones;
              }
            });
    System.out.println("Num of partitions: " + x.collect());

    JavaRDD<Integer> y =
        pairs.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {
              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> ones = new ArrayList<Integer>();
                ones.add(1);
                return ones;
              }
            });
    System.out.println("Num of partitions charan: " + y.collect());

    // Keep the pair with the greatest distance between its two points
    Pair farthestPair =
        pairs.reduce(
            new Function2<Pair, Pair, Pair>() {
              private static final long serialVersionUID = 1L;

              public Pair call(Pair a, Pair b) throws Exception {
                return (a.distanceLength > b.distanceLength ? a : b);
              }
            });

    Coordinate farthestPointA = farthestPair.A;
    Coordinate farthestPointB = farthestPair.B;

    List<Coordinate> farthestPoints = new ArrayList<Coordinate>();
    farthestPoints.add(farthestPointA);
    farthestPoints.add(farthestPointB);

    JavaRDD<Coordinate> farthestRDD = sc.parallelize(farthestPoints);

    // Map to (Coordinate, Boolean) tuples so the two points can be sorted by key
    JavaPairRDD<Coordinate, Boolean> coordinateTupleRDD =
        farthestRDD.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedCoordinateTupleRDD =
        coordinateTupleRDD.sortByKey(new CoordinateComparator());

    // Map back to an RDD of Coordinates
    JavaRDD<Coordinate> finalSortedCoordinateRDD =
        sortedCoordinateTupleRDD.map(new TupleToCoordinateMapFunction());

    // Format and write the result; it must be sorted. Do not add an additional
    // clean-up step that deletes the newly generated output.
    JavaRDD<String> outputData = finalSortedCoordinateRDD.map(parseOutputData);
    outputData.saveAsTextFile(args[1]);

    sc.close();
  }
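  // The reduce step above assumes that Pair exposes its two points and a precomputed
  // distanceLength. The following is a minimal sketch under that assumption; it further
  // assumes Coordinate exposes numeric x/y fields, which may not match the real classes.
  static class Pair implements java.io.Serializable {
    Coordinate A;
    Coordinate B;
    double distanceLength;

    Pair(Coordinate a, Coordinate b) {
      this.A = a;
      this.B = b;
      // Euclidean distance between the two coordinates.
      double dx = a.x - b.x;
      double dy = a.y - b.y;
      this.distanceLength = Math.sqrt(dx * dx + dy * dy);
    }
  }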
public static void main(String[] args) {
    // STEP 1: validate the arguments
    if (args.length < 1) {
      log.fatal("Syntax Error: there must be one argument (a file name or a directory)");
      throw new RuntimeException();
    }

    // STEP 2: create a SparkConf object
    SparkConf sparkConf = new SparkConf().setAppName("Trending Topic");

    // STEP 3: create a Java Spark context
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    // STEP 4: read lines of files
    JavaRDD<String> lines = sparkContext.textFile(args[0]);

    // STEP 5: split the tweet text (third tab-separated field) into words
    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t")[2].split(" "));
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String string) {
                return new Tuple2<>(string, 1);
              }
            });

    // STEP 6: count the occurrences of each word
    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
              }
            });

    // The tuples must be swapped because we can only sort by key, not by value
    JavaPairRDD<Integer, String> swapped =
        counts.mapToPair(
            new PairFunction<Tuple2<String, Integer>, Integer, String>() {
              @Override
              public Tuple2<Integer, String> call(Tuple2<String, Integer> tupla) throws Exception {
                return tupla.swap();
              }
            });

    // STEP 7: sort the results by key, descending, so the most frequent words come first
    List<Tuple2<Integer, String>> output = swapped.sortByKey(false).collect();

    // The exercise asks us to remove words that add no meaning. They could be kept in a
    // file and read from there; here a small hard-coded list is used instead. Note that
    // the filtered tuples are copied into a new list: the list returned by collect()
    // must not be modified while it is being iterated.
    List<String> excluyentes = new LinkedList<>();
    excluyentes.add("rt");
    excluyentes.add("http");
    excluyentes.add("https");
    excluyentes.add("www");
    List<Tuple2<Integer, String>> filtered = new LinkedList<>();
    for (Tuple2<Integer, String> t : output) {
      if (!excluyentes.contains(t._2)) {
        filtered.add(t);
      }
    }

    // STEP 8: print the top 10 results
    for (int i = 0; i < 10 && i < filtered.size(); i++) {
      Tuple2<Integer, String> tuple = filtered.get(i);
      System.out.println(tuple._2() + ": " + tuple._1());
    }

    // STEP 9: stop the spark context
    sparkContext.stop();
  }
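  // The comment in the method above notes that the excluded words could be read from a
  // file instead of being hard-coded. A minimal sketch of such a helper is given below;
  // the method name and the one-word-per-line file format are assumptions, and it relies
  // on java.io.BufferedReader/FileReader and java.util.LinkedList being imported.
  private static List<String> loadExcludedWords(String path) throws IOException {
    List<String> words = new LinkedList<>();
    try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
      String word;
      while ((word = reader.readLine()) != null) {
        // Normalize to lower case so the comparison with tweet words is case-insensitive.
        words.add(word.trim().toLowerCase());
      }
    }
    return words;
  }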