Example #1
  public static void main(String[] args) {
    String logFile;

    if (args.length != 0) logFile = args[0];
    else logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md";

    final SparkConf conf = new SparkConf().setAppName("Simple Application");

    final JavaSparkContext sc = new JavaSparkContext(conf);
    final JavaRDD<String> logData = sc.textFile(logFile).cache();

    final String[] check = getFilterSet();

    System.out.println("Start: " + new Date());
    for (int i = 0; i < check.length; i++) {
      final int post = i;
      long count =
          logData
              .filter(
                  new Function<String, Boolean>() {
                    public Boolean call(String s) {
                      return s.contains(check[post]);
                    }
                  })
              .count();

      System.out.println("Lines with " + check[i] + ": " + count);
    }
    System.out.println("End: " + new Date());

    sc.close();
  }
  @AfterClass
  public static void tearDown() {
    if (javaSparkContext != null) {
      javaSparkContext.close();
      javaSparkContext = null;
    }
  }

  @After
  public void tearDown() throws Exception {
    if (sc != null) {
      sc.stop();
      sc.close();
    }
  }
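
  // The two tearDown() methods above assume a JavaSparkContext field created by a matching
  // setup method. A minimal JUnit 4 sketch of that setup is shown below; the field name,
  // master URL and application name are assumptions for illustration, not taken from the
  // original test classes.
  private static JavaSparkContext javaSparkContext;

  @BeforeClass
  public static void setUp() {
    // A local master with two threads keeps the unit tests self-contained.
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("unit-tests");
    javaSparkContext = new JavaSparkContext(conf);
  }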
  public static boolean SpatialRangeQuery(
      String InputLocation1, String InputLocation2, String OutputLocation) {

    SparkConf sparkConfiguration = new SparkConf().setAppName("Group22-RangeQuery");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration);
    boolean result = getRangeQuery(InputLocation1, InputLocation2, OutputLocation, sparkContext);
    sparkContext.close();
    return result;
  }
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Distinct");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<Integer> nums =
        sc.parallelize(
            Arrays.asList(1, 2, 3, 4, 5, 1, 3, 2, 2, 1, 3, 4, 5, 5, 4, 3, 1, 2, 3, 2, 6, 8, 0));
    JavaRDD<Integer> distinct = nums.distinct();

    System.out.println(StringUtils.join(distinct.collect(), ","));

    sc.close();
  }
  public static void main(String[] args) throws FileNotFoundException {
    String path = args[0];

    // This is the standard two-line setup for Spark programs in Java.
    // spark.executor.memory cannot exceed the maximum Java heap size set with -Xmx.
    SparkConf conf =
        new SparkConf()
            .setMaster("local[" + NUM_THREADS + "]")
            .setAppName(Demo.class.getSimpleName())
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    // if you need both the coordinates and the sequences, use this section of code
    // read sequence file and map to PdbId.chainId, SimplePolymerChain pairs
    List<Tuple2<String, SimplePolymerChain>> chains =
        sc.sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
            .sample(false, 0.01, 123)
            .mapToPair(
                new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId,
                                                 // SimplePolymerChain> pairs
            .filter(t -> t._2.isProtein())
            .collect();

    for (Tuple2<String, SimplePolymerChain> t : chains) {
      System.out.println(t._1 + ": " + t._2);
    }

    // if you need just the coordinates, use this section of code
    // read sequence file and map to PdbId.chainId, C-alpha coordinate pairs
    List<Tuple2<String, Point3d[]>> coordinates =
        sc.sequenceFile(path, Text.class, ArrayWritable.class, NUM_THREADS * NUM_TASKS_PER_THREAD)
            .sample(false, 0.01, 123)
            .mapToPair(
                new HadoopToSimpleChainMapper()) // convert input to <pdbId.chainId, protein
                                                 // sequence> pairs
            .filter(t -> t._2.isProtein())
            .mapToPair(t -> new Tuple2<String, Point3d[]>(t._1, t._2.getCoordinates()))
            .collect();

    for (Tuple2<String, Point3d[]> t : coordinates) {
      System.out.println(t._1 + ": " + Arrays.toString(t._2));
    }

    sc.close();

    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
  }
  /**
   * @param args Path of the hadoop sequence file
   * @throws FileNotFoundException
   */
  public static void main(String[] args) throws FileNotFoundException {
    String path = args[0];
    JavaSparkContext sc = getSparkContext();
    // sc is an existing JavaSparkContext.
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy");
    sqlContext.setConf("spark.sql.parquet.filterPushdown", "true");
    long start = System.nanoTime();
    // read sequence file and map
    JavaRDD<Row> rowRDD =
        sc.sequenceFile(path, Text.class, Text.class)
            // .sample(false, 0.01, 123)
            .mapToPair(t -> new Tuple2<String, String>(t._1.toString(), t._2.toString()))
            .groupByKey()
            .map(new HadoopToParqRow())
            .cache();

    List<StructField> fields =
        new ArrayList<StructField>(); // create data fields of features for the DataFrame
    fields.add(DataTypes.createStructField("index", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("chainId2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Rnum2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("Ins2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("res2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("atom2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element1", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("element2", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("distance", DataTypes.IntegerType, false));
    fields.add(
        DataTypes.createStructField(
            "pdbId", DataTypes.createArrayType(DataTypes.StringType), false));
    StructType schema = DataTypes.createStructType(fields);

    // Apply the schema to the RDD.
    DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, schema);
    dataFrame
        .coalesce(1)
        .write()
        .mode(SaveMode.Overwrite)
        .partitionBy("index")
        .parquet("/Users/hina/Data/ExampleFiles/seq.parquet");
    sc.close();
    System.out.println("Time: " + (System.nanoTime() - start) / 1E9 + " sec.");
  }
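
  // Sketch (not in the original source): reading back the Parquet written above with the
  // Spark 1.x DataFrame API. Because spark.sql.parquet.filterPushdown is enabled above, the
  // filter below can be pushed into the Parquet reader; the distance threshold is an
  // arbitrary value chosen only for illustration.
  private static void readContactsBack(SQLContext sqlContext) {
    DataFrame contacts = sqlContext.read().parquet("/Users/hina/Data/ExampleFiles/seq.parquet");
    DataFrame close = contacts.filter("distance < 8").select("chainId1", "chainId2", "distance");
    close.show(20);
  }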
Example #8
  private static void trainAdaBoost() {
    int imgSize = 24;
    int T = 300;
    // String root = "/home/hadoop/ProgramDatas/MLStudy/FaceDection/";
    String root = "E:/TestDatas/MLStudy/FaceDection/";
    String dataFile = root + "train_data.txt";
    String modelFile = root + "adaboost_model.txt";
    String sparkAppName = "Viola-Jones Train";
    String sparkMaster = "spark://localhost:7077";
    int sparkCores = 60;
    String sparkJars = "/home/hadoop/violajones.jar";

    checkMemoryInfo();

    System.out.println("initing feature templates...");
    List<FeatureTemplate> templates = FeatureTemplate.initFeaTemplates();

    System.out.println("initing features...");
    List<HaarLikeFeature> features = HaarLikeFeature.initFeatures(imgSize, imgSize, templates);

    System.out.println("loading train datas...");
    List<IntegralImage> trainDatas = FileUtils.loadTrainDatas(dataFile, imgSize, imgSize);
    Collections.shuffle(trainDatas);

    SparkConf sparkConf =
        new SparkConf()
            .setMaster(sparkMaster)
            .setAppName(sparkAppName)
            .set("spark.executor.memory", "2g");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    sc.addJar(sparkJars);
    sc.setLogLevel("WARN");

    System.out.println("training adaboost...");
    AdaBoost adaBoost = new AdaBoost(trainDatas, features);
    Map<HaarLikeFeature, Double> classifiers = adaBoost.train(sc, sparkCores, T);

    System.out.println("exporting model...");
    List<String> model = new ArrayList<String>();
    for (Entry<HaarLikeFeature, Double> item : classifiers.entrySet()) {
      model.add(item.getKey().toStringWithWeight(item.getValue()));
    }
    FileUtils.exportFile(modelFile, model);

    System.out.println("viola jones training success!");
    sc.close();
  }
Example #9
  private static void trainCascadeAdaBoost() {
    int imgSize = 30;
    double eachDR = 0.99, eachFAR = 0.5, finalFAR = 0;

    String root = "/home/hadoop/ProgramDatas/MLStudy/FaceDection/";
    // String root = "E:/TestDatas/MLStudy/FaceDection/";
    String dataFile = root + "train_data_2.txt";
    String modelFile = root + "CascadeAdaboost_model.txt";
    String misclassificationFile = root + "mis_classifications.txt";
    String sparkAppName = "Viola-Jones Train";
    String sparkMaster = "spark://localhost:7077";
    int sparkCores = 60;
    String sparkJars = "/home/hadoop/violajones.jar";

    checkMemoryInfo();

    System.out.println("initing feature templates...");
    List<FeatureTemplate> templates = FeatureTemplate.initFeaTemplates();

    System.out.println("initing features...");
    List<HaarLikeFeature> features = HaarLikeFeature.initFeatures(imgSize, imgSize, templates);

    System.out.println("loading train datas...");
    Map<Integer, List<IntegralImage>> trainDatas =
        FileUtils.loadTrainDatasSeparate(dataFile, imgSize, imgSize);
    List<IntegralImage> posDatas = trainDatas.get(1);
    List<IntegralImage> negDatas = trainDatas.get(0);

    SparkConf sparkConf =
        new SparkConf()
            .setMaster(sparkMaster)
            .setAppName(sparkAppName)
            .set("spark.executor.memory", "3g");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    sc.addJar(sparkJars);
    sc.setLogLevel("WARN");

    System.out.println("training cascade adaboost...");
    CascadeAdaBoost cascade = new CascadeAdaBoost(posDatas, negDatas, features);
    CascadeClassifier classifier =
        cascade.train(sc, sparkCores, eachDR, eachFAR, finalFAR, modelFile, misclassificationFile);

    System.out.println("exporting model...");
    FileUtils.exportFile(modelFile, classifier.exportModel());

    sc.close();
  }
  public static void main(String[] args) {
    // Create a Spark Context.
    SparkConf conf = new SparkConf().setAppName("Activity").set("spark.eventLog.enabled", "true");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, STREAM_INTERVAL);
    String TOPIC = "activityevent";
    String zkQuorum = "localhost:2181";
    String group = "1";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put(TOPIC, 1);

    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
    // messages.print();
    JavaDStream<String> activitydatastream =
        messages.map(
            new Function<Tuple2<String, String>, String>() {
              @Override
              public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
              }
            });

    final Long teamWindowDurationMs = Durations.minutes(1).milliseconds();
    JavaDStream<Activity> ActivityEntryDStream = activitydatastream.map(Activity::parseFromLine);
    JavaPairDStream<WithTimestamp<String>, Double> ActivityWindowDStream =
        ActivityEntryDStream.mapToPair(
                windows ->
                    new Tuple2<>(
                        WithTimestamp.create(
                            windows.getActivity(),
                            // Apply Fixed Window by rounding the timestamp down to the nearest
                            // multiple of the window size
                            (convertMillsecs(windows.getTimestamp()) / teamWindowDurationMs)
                                * teamWindowDurationMs),
                        windows.getXaxis()))
            .reduceByKey(SUM_REDUCER);

    ActivityWindowDStream.print();

    jssc.start();
    jssc.awaitTermination();
    // jssc.close();
    sc.stop();
    sc.close();
  }
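
  // Sketch (not in the original source) of the fixed-window rounding used in the pipeline
  // above: integer division truncates, so dividing by the window length and multiplying back
  // snaps a timestamp down to the start of its one-minute window. The sample values below
  // are arbitrary illustrations.
  private static long windowStart(long timestampMs, long windowDurationMs) {
    return (timestampMs / windowDurationMs) * windowDurationMs;
  }
  // Example: windowStart(95_000L, 60_000L) returns 60_000L, i.e. second 95 falls into the
  // window that starts at second 60.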
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Example_2_2").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines = sc.parallelize(Arrays.asList("hello world", "hi"));
    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              public Iterable<String> call(String line) {
                return Arrays.asList(line.split(" "));
              }
            });
    String first = words.first();
    System.out.println(first);

    sc.close();
  }
Example #12
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("My App");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt");

    @SuppressWarnings("serial")
    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) {
                return Arrays.asList(s.split(" "));
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();

    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1() + "-> " + tuple._2());
    }
    sc.close();
  }
Example #13
  public static void main(String[] args) {
    SparkConf sparkconf =
        new SparkConf()
            .setAppName("Simple Application")
            .setMaster("spark://1.245.77.10:7077")
            .set(
                "spark.driver.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set(
                "spark.executor.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set("fs.default.name", "file:///");
    JavaSparkContext sc = new JavaSparkContext(sparkconf);
    Configuration hadoopConfig = sc.hadoopConfiguration();
    hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

    /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
    System.out.println("Start counting parallelize...");
    long values = matrdd2.count();
    System.out.println("Value count of parallelize is " + values);*/

    JavaPairRDD<Long, Double> matrdd =
        sc.newAPIHadoopFile(
            "e:/tmp/vecRow03_x256.mat",
            JMATFileInputFormat.class,
            Long.class,
            Double.class,
            hadoopConfig);
    System.out.println("Start job...");
    long values = matrdd.count();
    System.out.println("Value count of hadoop is " + values);

    sc.stop();
    sc.close();
  }
  public static void main(String[] args) {
    String master = args.length > 0 ? args[0] : "local[4]";

    JavaSparkContext sc = new JavaSparkContext(master, SparkSumByFold.class.getSimpleName());

    JavaRDD<Integer> oddNos = sc.parallelize(Arrays.asList(1, 3, 5, 3, 7, 7, 9, 1));
    JavaRDD<Integer> evenNos = sc.parallelize(Arrays.asList(2, 4, 2, 2, 2, 6, 0, 4));

    JavaRDD<Integer> allNos = oddNos.union(evenNos);

    System.out.println("printing all nos.");

    // Print all the numbers.
    allNos.foreach(
        new VoidFunction<Integer>() {

          @Override
          public void call(Integer t) throws Exception {
            System.out.println(t);
          }
        });

    sc.close();
  }
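
  // The class name above (SparkSumByFold) suggests summing the union with fold(), but the
  // example only prints the elements. A minimal sketch of that fold, assuming the same
  // allNos RDD, is shown below.
  private static int sumByFold(JavaRDD<Integer> allNos) {
    return allNos.fold(
        0,
        new Function2<Integer, Integer, Integer>() {
          @Override
          public Integer call(Integer a, Integer b) {
            return a + b;
          }
        });
  }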
  public static void main(String[] args) throws FileNotFoundException {

    if (args.length < 3) {
      System.out.println(
          "We require an input file path, an output file path and a number of partitions to proceed.");
      System.out.println(
          "Usage: java FarthestPair <input file path> <output file path> <noOfPartitions>");
      System.exit(1);
    }

    String inputFile = args[0];

    SparkConf conf = new SparkConf().setAppName("Group6-FarthestPair");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read file as RDD
    JavaRDD<String> inputData = sc.textFile(inputFile);
    // JavaRDD<Coordinate> coordinates = inputData.mapPartitions(parseData);

    // Map each String in the file as a coordinate object
    JavaRDD<Coordinate> coordinates = inputData.map(parseData); // .repartition(noOfPartitions);

    // Map to a tuple to sort the Points
    JavaPairRDD<Coordinate, Boolean> pointTupleRDD =
        coordinates.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedPointTupleRDD =
        pointTupleRDD.sortByKey(new CoordinateComparator());

    // Map to points RDD
    JavaRDD<Coordinate> finalSortedPointRDD =
        sortedPointTupleRDD.map(new TupleToCoordinateMapFunction());

    // Convert sorted collection to RDD

    // Perform Convex hull operation on individual partition
    JavaRDD<Coordinate> localHull = finalSortedPointRDD.mapPartitions(new hull());

    // Repartition to 1 partition in order to apply 'convex hull' on all the Coordinate objects
    // obtained from individual partitions
    JavaRDD<Coordinate> calculatedHull = localHull.coalesce(1).cache();

    // Perform Convex hull operation
    JavaRDD<Coordinate> globalHull = calculatedHull.mapPartitions(new hull()).distinct();

    JavaPairRDD<Coordinate, Coordinate> allCoordinateTuples = globalHull.cartesian(globalHull);
    System.out.println("Total cart: " + allCoordinateTuples.collect().size());

    JavaRDD<Pair> pairsRDD =
        allCoordinateTuples.map(
            new Function<Tuple2<Coordinate, Coordinate>, Pair>() {

              public Pair call(Tuple2<Coordinate, Coordinate> tuple) throws Exception {
                Coordinate pointA = tuple._1();
                Coordinate pointB = tuple._2();
                Pair a = new Pair(pointA, pointB);
                return a;
              }
            });

    JavaRDD<Pair> pairs =
        allCoordinateTuples.mapPartitions(
            new FlatMapFunction<Iterator<Tuple2<Coordinate, Coordinate>>, Pair>() {

              /** */
              private static final long serialVersionUID = 1L;

              public Iterable<Pair> call(Iterator<Tuple2<Coordinate, Coordinate>> tuples)
                  throws Exception {
                List<Pair> pairsFromTuples = new ArrayList<Pair>();
                // Pair singlePair = new Pair();
                Tuple2<Coordinate, Coordinate> tuple;
                while (tuples.hasNext()) {
                  tuple = tuples.next();

                  // singlePair.A = tuples.next()._1;
                  // singlePair.B = tuples.next()._2;
                  Pair singlePair = new Pair(tuple._1(), tuple._2());
                  pairsFromTuples.add(singlePair);
                }
                return pairsFromTuples;
              }
            });

    JavaRDD<Integer> x =
        pairsRDD.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {

              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> x = new ArrayList<Integer>();
                x.add(1);
                return x;
              }
            });

    System.out.println("Num of partitions: " + x.collect());

    JavaRDD<Integer> y =
        pairs.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {

              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> x = new ArrayList<Integer>();
                x.add(1);
                return x;
              }
            });

    System.out.println("Num of partitions charan: " + y.collect());

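    // Note: despite the name "minDistPair", the reduce below keeps the pair with the larger
    // distanceLength, i.e. the farthest pair, which is what this program computes.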
    Pair minDistPair =
        pairs.reduce(
            new Function2<Pair, Pair, Pair>() {
              /** */
              private static final long serialVersionUID = 1L;

              public Pair call(Pair a, Pair b) throws Exception {
                return (a.distanceLength > b.distanceLength ? a : b);
              }
            });

    // System.out.println(minDistPair);

    Coordinate closestpointA = minDistPair.A;
    Coordinate closestpointB = minDistPair.B;

    List<Coordinate> closestPoints = new ArrayList<Coordinate>();
    closestPoints.add(closestpointA);
    closestPoints.add(closestpointB);

    JavaRDD<Coordinate> closestRDD = sc.parallelize(closestPoints);

    // Map to a tuple to sort the Points
    JavaPairRDD<Coordinate, Boolean> coordinateTupleRDD =
        closestRDD.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedCoordinateTupleRDD =
        coordinateTupleRDD.sortByKey(new CoordinateComparator());

    // Map to points RDD
    JavaRDD<Coordinate> finalSortedCoordinateRDD =
        sortedCoordinateTupleRDD.map(new TupleToCoordinateMapFunction());

    JavaRDD<String> outputData = finalSortedCoordinateRDD.map(parseOutputData);
    // closestRDD.saveAsTextFile(outputfilepath);
    outputData.saveAsTextFile(args[1]);

    // Output your result; you need to sort your result!
    // And don't add an additional clean-up step that deletes the newly generated file...
    sc.close();
  }
  public static void main(String[] args) throws IOException {
    Parameters param = new Parameters();
    long initTime = System.currentTimeMillis();

    SparkConf conf = new SparkConf().setAppName("StarJoin");
    JavaSparkContext sc = new JavaSparkContext(conf);

    if (param.useKryo) {
      conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
      conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
      conf.set("spark.kryoserializer.buffer.mb", param.buffer);
    }

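    // One Bloom filter per dimension table (supplier, date, customer); they are broadcast
    // below so the fact-table scan of param.linePath can discard rows that cannot join
    // before the actual joins run.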
    MyBloomFilter.BloomFilter<String> BFS =
        new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
    MyBloomFilter.BloomFilter<String> BFD =
        new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
    MyBloomFilter.BloomFilter<String> BFC =
        new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);

    JavaPairRDD<String, String> supps =
        sc.textFile(param.suppPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> s = supps.collect();
    for (int i = 0; i < s.size(); i++) {
      BFS.add(s.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

    JavaPairRDD<String, String> custs =
        sc.textFile(param.custPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> c = custs.collect();
    for (int i = 0; i < c.size(); i++) {
      BFC.add(c.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

    JavaPairRDD<String, String> dates =
        sc.textFile(param.datePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[6].equals("Dec1997");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[4]);
                  }
                });

    List<Tuple2<String, String>> d = dates.collect();
    for (int i = 0; i < d.size(); i++) {
      BFD.add(d.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

    JavaPairRDD<String, String[]> lines =
        sc.textFile(param.linePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return varC.value().contains(s[2].getBytes())
                        && varS.value().contains(s[4].getBytes())
                        && varD.value().contains(s[5].getBytes());
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String[]>() {
                  public Tuple2<String, String[]> call(String[] s) {
                    String[] v = {s[2], s[5], s[12]};
                    return new Tuple2<String, String[]>(s[4], v);
                  }
                });

    JavaPairRDD<String, String[]> result =
        lines
            .join(supps)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    result =
        result
            .join(custs)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    JavaPairRDD<String, Long> final_result =
        result
            .join(dates)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                    return new Tuple2<String, Long>(
                        s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                  }
                })
            .reduceByKey(
                new Function2<Long, Long, Long>() {
                  public Long call(Long i1, Long i2) {
                    return i1 + i2;
                  }
                });

    JavaPairRDD<String, String> sub_result =
        final_result.mapToPair(
            new PairFunction<Tuple2<String, Long>, String, String>() {
              public Tuple2<String, String> call(Tuple2<String, Long> line) {
                return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
              }
            });

    final_result =
        sub_result
            .sortByKey(new Q3Comparator())
            .mapToPair(
                new PairFunction<Tuple2<String, String>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, String> line) {
                    String[] s = line._1.split(",");
                    return new Tuple2<String, Long>(
                        s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                  }
                });

    Configuration HDFSconf = new Configuration();
    FileSystem fs = FileSystem.get(HDFSconf);
    fs.delete(new Path(param.output), true);

    final_result.saveAsTextFile(param.output);

    long finalTime = System.currentTimeMillis();
    System.out.print("Tempo total(ms): ");
    System.out.println(finalTime - initTime);

    sc.close();
  }
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("SpatialJoinQuery Application");

    JavaSparkContext sc = new JavaSparkContext(conf);
    // Read the input CSV file holding the set of polygons into an RDD of strings.
    JavaRDD<String> firstInputPoints =
        sc.textFile("hdfs://192.168.139.149:54310/harsh/spatialJoinFirstInput.csv");

    // Map the above RDD of strings to an RDD of rectangles for the first input.

    // Repeat the above process, now initializing the RDD for the query window.
    JavaRDD<String> secondInputPoints =
        sc.textFile("hdfs://192.168.139.149:54310/harsh/spatialJoinSecondInput.csv");

    // Initialized in one of the two query-type branches below.
    JavaRDD<Tuple2<Integer, ArrayList<Integer>>> joinQueryRDD = null;

    if (args[0].equalsIgnoreCase("rectangle")) {

      System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
      System.out.println("inside>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");

      final JavaRDD<Rectangle> firstInputRDD = firstInputPoints.map(mapInputStringToRectRDD());
      System.out.println(firstInputRDD.collect());

      // Map the query window to RDD object
      final JavaRDD<Rectangle> secondInputRDD = secondInputPoints.map(mapInputStringToRectRDD());

      // broadcast the second set of rectangles to each of the workers
      final Broadcast<List<Rectangle>> firstInput = sc.broadcast(firstInputRDD.collect());
      // Map the id of each query-window rectangle to the ids of the first-input
      // rectangles that satisfy the containment check against it.
      joinQueryRDD =
          secondInputRDD.map(
              new Function<Rectangle, Tuple2<Integer, ArrayList<Integer>>>() {
                public Tuple2<Integer, ArrayList<Integer>> call(Rectangle rectangle)
                    throws Exception {
                  // Get the broadcast list of rectangles from the first input.
                  List<Rectangle> firstInputCollection = firstInput.value();
                  ArrayList<Integer> secondInputIds = new ArrayList<Integer>();
                  // Iterate over the broadcast first input and collect the ids of the
                  // rectangles that pass the containment check against this query window.
                  for (Rectangle firstRects : firstInputCollection) {
                    if (rectangle.isRectangleinsideQueryWindow(firstRects)) {
                      secondInputIds.add(firstRects.getRectangleId());
                    }
                  }
                  // Create a tuple of the mapped values and return it.
                  Tuple2<Integer, ArrayList<Integer>> resultList =
                      new Tuple2<Integer, ArrayList<Integer>>(
                          rectangle.getRectangleId(), secondInputIds);
                  return resultList;
                }
              });

    } else if (args[0].equalsIgnoreCase("point")) {

      final JavaRDD<Point> firstInputRDD =
          firstInputPoints.map(SpatialRangeQuery.mapInputStringToPointRDD());

      // broadcast the second set of rectangles to each of the workers
      final Broadcast<List<Point>> firstInput = sc.broadcast(firstInputRDD.collect());

      // Map the query window to RDD object
      final JavaRDD<Rectangle> secondInputRDD = secondInputPoints.map(mapInputStringToRectRDD());

      joinQueryRDD =
          secondInputRDD.map(
              new Function<Rectangle, Tuple2<Integer, ArrayList<Integer>>>() {
                public Tuple2<Integer, ArrayList<Integer>> call(Rectangle rectangle)
                    throws Exception {
                  // Get the broadcast list of points from the first input.
                  List<Point> firstInputCollection = firstInput.value();
                  ArrayList<Integer> secondInputIds = new ArrayList<Integer>();
                  // Iterate over the broadcast points and collect the ids of the points
                  // that fall inside this query window.
                  for (Point point : firstInputCollection) {
                    if (point.isPointinsideQueryWindow(rectangle)) {
                      secondInputIds.add(point.getPointID());
                    }
                  }
                  // Create a tuple of the mapped values and return it.
                  Tuple2<Integer, ArrayList<Integer>> resultList =
                      new Tuple2<Integer, ArrayList<Integer>>(
                          rectangle.getRectangleId(), secondInputIds);
                  return resultList;
                }
              });
    }

    JavaRDD<String> result =
        joinQueryRDD.map(
            new Function<Tuple2<Integer, ArrayList<Integer>>, String>() {
              public String call(Tuple2<Integer, ArrayList<Integer>> inputPoint) {

                Integer containingRect = inputPoint._1();
                ArrayList<Integer> containedRects = inputPoint._2();

                StringBuffer intermediateBuffer = new StringBuffer();

                intermediateBuffer.append(containingRect);

                for (Integer rects : containedRects) {
                  intermediateBuffer.append(", " + rects);
                }

                return intermediateBuffer.toString();
              }
            });

    result.coalesce(1).saveAsTextFile("hdfs://192.168.139.149:54310/harsh/jQueryResult.csv");

    sc.close();
  }