public static void main(String[] args) {
  String logFile;
  if (args.length != 0) {
    logFile = args[0];
  } else {
    logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md";
  }
  final SparkConf conf = new SparkConf().setAppName("Simple Application");
  final JavaSparkContext sc = new JavaSparkContext(conf);
  final JavaRDD<String> logData = sc.textFile(logFile).cache();
  final String[] check = getFilterSet();
  System.out.println("Start: " + new Date());
  for (int i = 0; i < check.length; i++) {
    final int post = i;
    long count =
        logData
            .filter(
                new Function<String, Boolean>() {
                  public Boolean call(String s) {
                    return s.contains(check[post]);
                  }
                })
            .count();
    System.out.println("Lines with " + check[i] + ": " + count);
  }
  System.out.println("End: " + new Date());
  sc.close();
}
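// Hedged alternative sketch (not from the original source): count every keyword in a
// single Spark job using flatMap + countByValue instead of one filter/count job per
// keyword. Assumes the same getFilterSet() helper as above and the Spark 2.x flatMap
// API (Iterator-returning); on Spark 1.x, return the list itself instead of an iterator.
static void countKeywordsInOnePass(JavaRDD<String> logData) {
  final String[] check = getFilterSet();
  java.util.Map<String, Long> counts =
      logData
          .flatMap(
              line -> {
                java.util.List<String> hits = new java.util.ArrayList<>();
                for (String c : check) {
                  if (line.contains(c)) {
                    hits.add(c);
                  }
                }
                return hits.iterator();
              })
          .countByValue();
  for (java.util.Map.Entry<String, Long> e : counts.entrySet()) {
    System.out.println("Lines with " + e.getKey() + ": " + e.getValue());
  }
}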
@AfterClass
public static void tearDown() {
  if (javaSparkContext != null) {
    javaSparkContext.close();
    javaSparkContext = null;
  }
}
@After
public void tearDown() throws Exception {
  if (sc != null) {
    // close() delegates to stop(), so calling both is harmless but redundant.
    sc.stop();
    sc.close();
  }
}
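// Hedged sketch (not from the original tests): because JavaSparkContext implements
// java.io.Closeable, a test can also scope the context with try-with-resources so it is
// stopped even when the body throws. SomeJob.run is a hypothetical method under test.
@Test
public void runsJobAndClosesContext() {
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("test");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    SomeJob.run(sc); // hypothetical code under test
  } // sc.close() (and therefore sc.stop()) runs automatically here
}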
public static boolean SpatialRangeQuery(
    String InputLocation1, String InputLocation2, String OutputLocation) {
  SparkConf sparkConfiguration = new SparkConf().setAppName("Group22-RangeQuery");
  JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration);
  boolean result = getRangeQuery(InputLocation1, InputLocation2, OutputLocation, sparkContext);
  sparkContext.close();
  return result;
}
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Distinct"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Integer> nums = sc.parallelize( Arrays.asList(1, 2, 3, 4, 5, 1, 3, 2, 2, 1, 3, 4, 5, 5, 4, 3, 1, 2, 3, 2, 6, 8, 0)); JavaRDD<Integer> distinct = nums.distinct(); System.out.println(StringUtils.join(distinct.collect(), ",")); sc.close(); }
public static void main(String[] args) throws FileNotFoundException {
  String path = args[0];

  // This is the default two-line setup for Spark programs in Java.
  // spark.executor.memory can only use up to the maximum Java heap space set by -Xmx.
  SparkConf conf =
      new SparkConf()
          .setMaster("local[" + NUM_THREADS + "]")
          .setAppName(Demo.class.getSimpleName())
          .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  JavaSparkContext sc = new JavaSparkContext(conf);

  long start = System.nanoTime();

  // If you need both the coordinates and the sequences, use this section of code:
  // read the sequence file and map it to <pdbId.chainId, SimplePolymerChain> pairs.
  List<Tuple2<String, SimplePolymerChain>> chains =
      sc.sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
          .sample(false, 0.01, 123)
          .mapToPair(new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId, SimplePolymerChain> pairs
          .filter(t -> t._2.isProtein())
          .collect();

  for (Tuple2<String, SimplePolymerChain> t : chains) {
    System.out.println(t._1 + ": " + t._2);
  }

  // If you need just the coordinates, use this section of code:
  // read the sequence file and map it to <pdbId.chainId, C-alpha coordinate> pairs.
  List<Tuple2<String, Point3d[]>> coordinates =
      sc.sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
          .sample(false, 0.01, 123)
          .mapToPair(new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId, protein sequence> pairs
          .filter(t -> t._2.isProtein())
          .mapToPair(t -> new Tuple2<String, Point3d[]>(t._1, t._2.getCoordinates()))
          .collect();

  for (Tuple2<String, Point3d[]> t : coordinates) {
    System.out.println(t._1 + ": " + Arrays.toString(t._2));
  }

  sc.close();
  System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
}
/**
 * @param args Path of the Hadoop sequence file
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
  String path = args[0];
  JavaSparkContext sc = getSparkContext();

  // sc is an existing JavaSparkContext.
  SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
  sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");

  long start = System.nanoTime();

  // Read the sequence file and map it to rows.
  JavaRDD<Row> rowRDD =
      sc.sequenceFile(path, Text.class, Text.class)
          // .sample(false, 0.01, 123)
          .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
          .groupByKey()
          .map(new HadoopToParqRow())
          .cache();

  // Create data fields of features for the DataFrame.
  List<StructField> fields = new ArrayList<StructField>();
  fields.add(DataTypes.createStructField("index", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("chainId1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("chainId2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Rnum1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Rnum2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Ins1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("Ins2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("res1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("res2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("atom1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("atom2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("element1", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("element2", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
  fields.add(
      DataTypes.createStructField("pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
  StructType schema = DataTypes.createStructType(fields);

  // Apply the schema to the RDD and write a partitioned Parquet file.
  DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
  dataFrame
      .coalesce(1)
      .write()
      .mode(SaveMode.Overwrite)
      .partitionBy("index")
      .parquet("/Users/hina/Data/ExampleFiles/seq.parquet");

  sc.close();
  System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
}
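// Hedged follow-up sketch (not part of the original program): reading the Parquet output
// back with the same SQLContext. The path and column names mirror the schema built above;
// with spark.sql.parquet.filterPushdown enabled, the distance predicate can be pushed down
// to the Parquet reader. The threshold value 5 is illustrative only.
DataFrame contacts =
    sqlContext
        .read()
        .parquet("/Users/hina/Data/ExampleFiles/seq.parquet")
        .filter("distance < 5") // column defined in the schema above
        .select("chainId1", "chainId2", "res1", "res2", "distance");
contacts.show(20);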
private static void trainAdaBoost() {
  int imgSize = 24;
  int T = 300;
  // String root = "/home/hadoop/ProgramDatas/MLStudy/FaceDection/";
  String root = "E:/TestDatas/MLStudy/FaceDection/";
  String dataFile = root + "train_data.txt";
  String modelFile = root + "adaboost_model.txt";
  String sparkAppName = "Viola-Jones Train";
  String sparkMaster = "spark://localhost:7077";
  int sparkCores = 60;
  String sparkJars = "/home/hadoop/violajones.jar";

  checkMemoryInfo();

  System.out.println("initing feature templates...");
  List<FeatureTemplate> templates = FeatureTemplate.initFeaTemplates();

  System.out.println("initing features...");
  List<HaarLikeFeature> features = HaarLikeFeature.initFeatures(imgSize, imgSize, templates);

  System.out.println("loading train datas...");
  List<IntegralImage> trainDatas = FileUtils.loadTrainDatas(dataFile, imgSize, imgSize);
  Collections.shuffle(trainDatas);

  SparkConf sparkConf =
      new SparkConf()
          .setMaster(sparkMaster)
          .setAppName(sparkAppName)
          .set("spark.executor.memory", "2g");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  sc.addJar(sparkJars);
  sc.setLogLevel("WARN");

  System.out.println("training adaboost...");
  AdaBoost adaBoost = new AdaBoost(trainDatas, features);
  Map<HaarLikeFeature, Double> classifiers = adaBoost.train(sc, sparkCores, T);

  System.out.println("exporting model...");
  List<String> model = new ArrayList<String>();
  for (Entry<HaarLikeFeature, Double> item : classifiers.entrySet()) {
    model.add(item.getKey().toStringWithWeight(item.getValue()));
  }
  FileUtils.exportFile(modelFile, model);

  System.out.println("viola jones training success!");
  sc.close();
}
private static void trainCascadeAdaBoost() {
  int imgSize = 30;
  double eachDR = 0.99, eachFAR = 0.5, finalFAR = 0;
  String root = "/home/hadoop/ProgramDatas/MLStudy/FaceDection/";
  // String root = "E:/TestDatas/MLStudy/FaceDection/";
  String dataFile = root + "train_data_2.txt";
  String modelFile = root + "CascadeAdaboost_model.txt";
  String misclassificationFile = root + "mis_classifications.txt";
  String sparkAppName = "Viola-Jones Train";
  String sparkMaster = "spark://localhost:7077";
  int sparkCores = 60;
  String sparkJars = "/home/hadoop/violajones.jar";

  checkMemoryInfo();

  System.out.println("initing feature templates...");
  List<FeatureTemplate> templates = FeatureTemplate.initFeaTemplates();

  System.out.println("initing features...");
  List<HaarLikeFeature> features = HaarLikeFeature.initFeatures(imgSize, imgSize, templates);

  System.out.println("loading train datas...");
  Map<Integer, List<IntegralImage>> trainDatas =
      FileUtils.loadTrainDatasSeparate(dataFile, imgSize, imgSize);
  List<IntegralImage> posDatas = trainDatas.get(1);
  List<IntegralImage> negDatas = trainDatas.get(0);

  SparkConf sparkConf =
      new SparkConf()
          .setMaster(sparkMaster)
          .setAppName(sparkAppName)
          .set("spark.executor.memory", "3g");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  sc.addJar(sparkJars);
  sc.setLogLevel("WARN");

  System.out.println("training cascade adaboost...");
  CascadeAdaBoost cascade = new CascadeAdaBoost(posDatas, negDatas, features);
  CascadeClassifier classifier =
      cascade.train(sc, sparkCores, eachDR, eachFAR, finalFAR, modelFile, misclassificationFile);

  System.out.println("exporting model...");
  FileUtils.exportFile(modelFile, classifier.exportModel());
  sc.close();
}
public static void main(String[] args) {
  // Create a Spark context and a streaming context.
  SparkConf conf = new SparkConf().setAppName("Activity").set("spark.eventLog.enabled", "true");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext jssc = new JavaStreamingContext(sc, STREAM_INTERVAL);

  String TOPIC = "activityevent";
  String zkQuorum = "localhost:2181";
  String group = "1";
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  topicMap.put(TOPIC, 1);

  JavaPairReceiverInputDStream<String, String> messages =
      KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
  // messages.print();

  JavaDStream<String> activitydatastream =
      messages.map(
          new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
              return tuple2._2();
            }
          });

  final Long teamWindowDurationMs = Durations.minutes(1).milliseconds();
  JavaDStream<Activity> ActivityEntryDStream = activitydatastream.map(Activity::parseFromLine);

  JavaPairDStream<WithTimestamp<String>, Double> ActivityWindowDStream =
      ActivityEntryDStream.mapToPair(
              windows ->
                  new Tuple2<>(
                      WithTimestamp.create(
                          windows.getActivity(),
                          // Apply a fixed window by rounding the timestamp down to the
                          // nearest multiple of the window size.
                          (convertMillsecs(windows.getTimestamp()) / teamWindowDurationMs)
                              * teamWindowDurationMs),
                      windows.getXaxis()))
          .reduceByKey(SUM_REDUCER);

  ActivityWindowDStream.print();

  jssc.start();
  jssc.awaitTermination();
  // jssc.close();
  sc.stop();
  sc.close();
}
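// Hedged sketch of the SUM_REDUCER helper referenced above (its definition is not shown in
// the original snippet); a minimal assumption is that it simply adds the two Double values:
private static final Function2<Double, Double, Double> SUM_REDUCER = (a, b) -> a + b;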
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Example_2_2").setMaster("local[2]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.parallelize(Arrays.asList("hello world", "hi")); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { public Iterable<String> call(String line) { return Arrays.asList(line.split(" ")); } }); String first = words.first(); System.out.println(first); sc.close(); }
public static void main(String[] args) { SparkConf conf = new SparkConf().setMaster("local").setAppName("My App"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt"); @SuppressWarnings("serial") JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1() + "-> " + tuple._2()); } sc.close(); }
public static void main(String[] args) {
  SparkConf sparkconf =
      new SparkConf()
          .setAppName("Simple Application")
          .setMaster("spark://1.245.77.10:7077")
          .set(
              "spark.driver.extraClassPath",
              "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
          .set(
              "spark.executor.extraClassPath",
              "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
          .set("fs.default.name", "file:///");
  JavaSparkContext sc = new JavaSparkContext(sparkconf);

  Configuration hadoopConfig = sc.hadoopConfiguration();
  hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
  hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

  // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
  // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

  /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
  System.out.println("Start counting parallelize...");
  long values = matrdd2.count();
  System.out.println("Value count of parallelize is " + values);*/

  JavaPairRDD<Long, Double> matrdd =
      sc.newAPIHadoopFile(
          "e:/tmp/vecRow03_x256.mat",
          JMATFileInputFormat.class,
          Long.class,
          Double.class,
          hadoopConfig);

  System.out.println("Start job...");
  long values = matrdd.count();
  System.out.println("Value count of hadoop is " + values);

  sc.stop();
  sc.close();
}
public static void main(String[] args) {
  String master = args.length > 0 ? args[0] : "local[4]";
  JavaSparkContext sc = new JavaSparkContext(master, SparkSumByFold.class.getSimpleName());
  JavaRDD<Integer> oddNos = sc.parallelize(Arrays.asList(1, 3, 5, 3, 7, 7, 9, 1));
  JavaRDD<Integer> evenNos = sc.parallelize(Arrays.asList(2, 4, 2, 2, 2, 6, 0, 4));
  JavaRDD<Integer> allNos = oddNos.union(evenNos);

  System.out.println("printing all nos.");
  // print all the nos.
  allNos.foreach(
      new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
          System.out.println(t);
        }
      });
  sc.close();
}
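// Hedged sketch (not shown in the original snippet): the fold-based sum that the class name
// SparkSumByFold suggests, computed on the same union RDD before the context is closed.
int sum = allNos.fold(0, (a, b) -> a + b);
System.out.println("sum of all nos: " + sum);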
public static void main(String[] args) throws FileNotFoundException { if (args.length <= 0) { System.out.println( "We require input file path, output file path and number of partitions argument to proceed further."); System.out.println( "Usage: java FarthestPair <input file path> <output file path> <noOfPartitions>"); System.exit(0); } String inputFile = args[0]; SparkConf conf = new SparkConf().setAppName("Group6-FarthestPair"); JavaSparkContext sc = new JavaSparkContext(conf); // Read file as RDD JavaRDD<String> inputData = sc.textFile(inputFile); // JavaRDD<Coordinate> coordinates = inputData.mapPartitions(parseData); // Map each String in the file as a coordinate object JavaRDD<Coordinate> coordinates = inputData.map(parseData); // .repartition(noOfPartitions); // Map to a tuple to sort the Points JavaPairRDD<Coordinate, Boolean> pointTupleRDD = coordinates.mapToPair(new CoordinatePairFunction()); // Sort the points JavaPairRDD<Coordinate, Boolean> sortedPointTupleRDD = pointTupleRDD.sortByKey(new CoordinateComparator()); // Map to points RDD JavaRDD<Coordinate> finalSortedPointRDD = sortedPointTupleRDD.map(new TupleToCoordinateMapFunction()); // Convert sorted collection to RDD // Perform Convex hull operation on individual partition JavaRDD<Coordinate> localHull = finalSortedPointRDD.mapPartitions(new hull()); // Repartition to 1 partition in order to apply 'convex hull' on all the Coordinate objects // obtained from individual partitions JavaRDD<Coordinate> calculatedHull = localHull.coalesce(1).cache(); // Perform Convex hull operation JavaRDD<Coordinate> globalHull = calculatedHull.mapPartitions(new hull()).distinct(); JavaPairRDD<Coordinate, Coordinate> allCoordinateTuples = globalHull.cartesian(globalHull); System.out.println("Total cart: " + allCoordinateTuples.collect().size()); JavaRDD<Pair> pairsRDD = allCoordinateTuples.map( new Function<Tuple2<Coordinate, Coordinate>, Pair>() { public Pair call(Tuple2<Coordinate, Coordinate> tuple) throws Exception { // TODO Auto-generated method stub Coordinate pointA = tuple._1(); Coordinate pointB = tuple._2(); Pair a = new Pair(pointA, pointB); return a; } }); JavaRDD<Pair> pairs = allCoordinateTuples.mapPartitions( new FlatMapFunction<Iterator<Tuple2<Coordinate, Coordinate>>, Pair>() { /** */ private static final long serialVersionUID = 1L; public Iterable<Pair> call(Iterator<Tuple2<Coordinate, Coordinate>> tuples) throws Exception { // TODO Auto-generated method stub List<Pair> pairsFromTuples = new ArrayList<Pair>(); // Pair singlePair = new Pair(); Tuple2<Coordinate, Coordinate> tuple; while (tuples.hasNext()) { tuple = tuples.next(); // singlePair.A = tuples.next()._1; // singlePair.B = tuples.next()._2; Pair singlePair = new Pair(tuple._1(), tuple._2()); pairsFromTuples.add(singlePair); } return pairsFromTuples; } }); JavaRDD<Integer> x = pairsRDD.mapPartitions( new FlatMapFunction<Iterator<Pair>, Integer>() { public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception { // TODO Auto-generated method stub ArrayList<Integer> x = new ArrayList<Integer>(); x.add(1); return x; } }); System.out.println("Num of partitions: " + x.collect()); JavaRDD<Integer> y = pairs.mapPartitions( new FlatMapFunction<Iterator<Pair>, Integer>() { public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception { // TODO Auto-generated method stub ArrayList<Integer> x = new ArrayList<Integer>(); x.add(1); return x; } }); System.out.println("Num of partitions charan: " + y.collect()); Pair minDistPair = pairs.reduce( new Function2<Pair, Pair, 
Pair>() { /** */ private static final long serialVersionUID = 1L; public Pair call(Pair a, Pair b) throws Exception { // TODO Auto-generated method stub return (a.distanceLength > b.distanceLength ? a : b); } }); // System.out.println(minDistPair); Coordinate closestpointA = minDistPair.A; Coordinate closestpointB = minDistPair.B; List<Coordinate> closestPoints = new ArrayList<Coordinate>(); closestPoints.add(closestpointA); closestPoints.add(closestpointB); JavaRDD<Coordinate> closestRDD = sc.parallelize(closestPoints); // Map to a tuple to sort the Points JavaPairRDD<Coordinate, Boolean> coordinateTupleRDD = closestRDD.mapToPair(new CoordinatePairFunction()); // Sort the points JavaPairRDD<Coordinate, Boolean> sortedCoordinateTupleRDD = coordinateTupleRDD.sortByKey(new CoordinateComparator()); // Map to points RDD JavaRDD<Coordinate> finalSortedCoordinateRDD = sortedCoordinateTupleRDD.map(new TupleToCoordinateMapFunction()); JavaRDD<String> outputData = finalSortedCoordinateRDD.map(parseOutputData); // closestRDD.saveAsTextFile(outputfilepath); outputData.saveAsTextFile(args[1]); // Output your result, you need to sort your result!!! // And,Don't add a additional clean up step delete the new generated // file... sc.close(); }
public static void main(String[] args) throws IOException {
  Parameters param = new Parameters();
  long initTime = System.currentTimeMillis();

  SparkConf conf = new SparkConf().setAppName("StarJoin");
  // Kryo must be configured before the context is created; settings applied afterwards are ignored.
  if (param.useKryo) {
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
    conf.set("spark.kryoserializer.buffer.mb", param.buffer);
  }
  JavaSparkContext sc = new JavaSparkContext(conf);

  MyBloomFilter.BloomFilter<String> BFS =
      new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
  MyBloomFilter.BloomFilter<String> BFD =
      new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
  MyBloomFilter.BloomFilter<String> BFC =
      new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);

  JavaPairRDD<String, String> supps =
      sc.textFile(param.suppPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });

  List<Tuple2<String, String>> s = supps.collect();
  for (int i = 0; i < s.size(); i++) {
    BFS.add(s.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

  JavaPairRDD<String, String> custs =
      sc.textFile(param.custPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });

  List<Tuple2<String, String>> c = custs.collect();
  for (int i = 0; i < c.size(); i++) {
    BFC.add(c.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

  JavaPairRDD<String, String> dates =
      sc.textFile(param.datePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[6].equals("Dec1997");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[4]);
                }
              });

  List<Tuple2<String, String>> d = dates.collect();
  for (int i = 0; i < d.size(); i++) {
    BFD.add(d.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

  JavaPairRDD<String, String[]> lines =
      sc.textFile(param.linePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return varC.value().contains(s[2].getBytes())
                      && varS.value().contains(s[4].getBytes())
                      && varD.value().contains(s[5].getBytes());
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String[]>() {
                public Tuple2<String, String[]> call(String[] s) {
                  String[] v = {s[2], s[5], s[12]};
                  return new Tuple2<String, String[]>(s[4], v);
                }
              });

  JavaPairRDD<String, String[]> result =
      lines
          .join(supps)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });

  result =
      result
          .join(custs)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });

  JavaPairRDD<String, Long> final_result =
      result
          .join(dates)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                  return new Tuple2<String, Long>(
                      s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                }
              })
          .reduceByKey(
              new Function2<Long, Long, Long>() {
                public Long call(Long i1, Long i2) {
                  return i1 + i2;
                }
              });

  JavaPairRDD<String, String> sub_result =
      final_result.mapToPair(
          new PairFunction<Tuple2<String, Long>, String, String>() {
            public Tuple2<String, String> call(Tuple2<String, Long> line) {
              return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
            }
          });

  final_result =
      sub_result
          .sortByKey(new Q3Comparator())
          .mapToPair(
              new PairFunction<Tuple2<String, String>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, String> line) {
                  String[] s = line._1.split(",");
                  return new Tuple2<String, Long>(
                      s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                }
              });

  Configuration HDFSconf = new Configuration();
  FileSystem fs = FileSystem.get(HDFSconf);
  fs.delete(new Path(param.output), true);

  final_result.saveAsTextFile(param.output);

  long finalTime = System.currentTimeMillis();
  System.out.print("Total time (ms): ");
  System.out.println(finalTime - initTime);
  sc.close();
}
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("SpatialJoinQuery Application"); JavaSparkContext sc = new JavaSparkContext(conf); // Read the input csv file holding set of polygons in a rdd of string // objects JavaRDD<String> firstInputPoints = sc.textFile("hdfs://192.168.139.149:54310/harsh/spatialJoinFirstInput.csv"); // Map the above rdd of strings to a rdd of rectangles for the first // input // Repeat the above process but now for initializing the rdd for query // window JavaRDD<String> secondInputPoints = sc.textFile("hdfs://192.168.139.149:54310/harsh/spatialJoinSecondInput.csv"); JavaRDD<Tuple2<Integer, ArrayList<Integer>>> joinQueryRDD = new JavaRDD<Tuple2<Integer, ArrayList<Integer>>>(null, null); if (args[0].equalsIgnoreCase("rectangle")) { System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"); System.out.println("inside>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"); final JavaRDD<Rectangle> firstInputRDD = firstInputPoints.map(mapInputStringToRectRDD()); System.out.println(firstInputRDD.collect()); // Map the query window to RDD object final JavaRDD<Rectangle> secondInputRDD = secondInputPoints.map(mapInputStringToRectRDD()); // broadcast the second set of rectangles to each of the workers final Broadcast<List<Rectangle>> firstInput = sc.broadcast(firstInputRDD.collect()); // map the id of first input to the multiple id’s of the second // input if // they contain the // first rectangle. joinQueryRDD = secondInputRDD.map( new Function<Rectangle, Tuple2<Integer, ArrayList<Integer>>>() { public Tuple2<Integer, ArrayList<Integer>> call(Rectangle rectangle) throws Exception { // Get the list of rectangles from the second RDD input. List<Rectangle> firstInputCollection = firstInput.value(); ArrayList<Integer> secondInputIds = new ArrayList<Integer>(); // Iterate the second input and check for the second set // of rectangle id’s // that hold the rectangle from first set obtained from // the mapped RDD for (Rectangle firstRects : firstInputCollection) { if (rectangle.isRectangleinsideQueryWindow(firstRects)) { secondInputIds.add(firstRects.getRectangleId()); } } // Create a new tuple of the mapped values and return // back the mapped // transformation. Tuple2<Integer, ArrayList<Integer>> resultList = new Tuple2<Integer, ArrayList<Integer>>( rectangle.getRectangleId(), secondInputIds); return resultList; } }); } else if (args[0].equalsIgnoreCase("point")) { final JavaRDD<Point> firstInputRDD = firstInputPoints.map(SpatialRangeQuery.mapInputStringToPointRDD()); // broadcast the second set of rectangles to each of the workers final Broadcast<List<Point>> firstInput = sc.broadcast(firstInputRDD.collect()); // Map the query window to RDD object final JavaRDD<Rectangle> secondInputRDD = secondInputPoints.map(mapInputStringToRectRDD()); joinQueryRDD = secondInputRDD.map( new Function<Rectangle, Tuple2<Integer, ArrayList<Integer>>>() { public Tuple2<Integer, ArrayList<Integer>> call(Rectangle rectangle) throws Exception { // Get the list of rectangles from the second RDD input. 
List<Point> firstInputCollection = firstInput.getValue(); ArrayList<Integer> secondInputIds = new ArrayList<Integer>(); // Iterate the second input and check for the second set // of rectangle id’s // that hold the rectangle from first set obtained from // the mapped RDD for (Point point : firstInputCollection) { if (point.isPointinsideQueryWindow(rectangle)) { secondInputIds.add(point.getPointID()); } } // Create a new tuple of the mapped values and return // back the mapped // transformation. Tuple2<Integer, ArrayList<Integer>> resultList = new Tuple2<Integer, ArrayList<Integer>>( rectangle.getRectangleId(), secondInputIds); return resultList; } }); } JavaRDD<String> result = joinQueryRDD.map( new Function<Tuple2<Integer, ArrayList<Integer>>, String>() { public String call(Tuple2<Integer, ArrayList<Integer>> inputPoint) { Integer containingRect = inputPoint._1(); ArrayList<Integer> containedRects = inputPoint._2(); StringBuffer intermediateBuffer = new StringBuffer(); intermediateBuffer.append(containingRect); for (Integer rects : containedRects) { intermediateBuffer.append(", " + rects); } return intermediateBuffer.toString(); } }); result.coalesce(1).saveAsTextFile("hdfs://192.168.139.149:54310/harsh/jQueryResult.csv"); sc.close(); }