public static void main(String[] args) {
   SparkConf conf = new SparkConf().setAppName("KeyValueTest").setMaster("local");
   JavaSparkContext jsc = new JavaSparkContext(conf);
   JavaRDD<String> lines = jsc.textFile("/home/piyushm/samplejson.json");
   List<Person> persons = lines.mapPartitions(new ParseJson()).collect();
   JavaRDD<Person> personJavaRDD = jsc.parallelize(persons);
   JavaRDD<String> csvFileContent = jsc.textFile("/opt/sample.csv");
   System.out.println(csvFileContent.map(new ParseLine()).collect());
   System.out.println(persons);
   System.out.println(personJavaRDD.mapPartitions(new WriteJson()).collect());
   jsc.stop();
 }
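The first example above relies on helper classes (ParseJson, WriteJson, ParseLine) and a Person bean that are not shown. As a hedged illustration only, here is a minimal sketch of what Person and ParseJson might look like, assuming Jackson for the JSON mapping and the Spark 1.x mapPartitions signature; every name and field below is an assumption, not the original author's code.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.function.FlatMapFunction;

import com.fasterxml.jackson.databind.ObjectMapper;

// Hypothetical Person bean matching the records in samplejson.json.
class Person implements java.io.Serializable {
  private String name;
  private int age;

  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public int getAge() { return age; }
  public void setAge(int age) { this.age = age; }
}

// Hypothetical ParseJson: one ObjectMapper per partition, which is the usual
// reason to prefer mapPartitions over map for JSON parsing.
class ParseJson implements FlatMapFunction<Iterator<String>, Person> {
  @Override
  public Iterable<Person> call(Iterator<String> lines) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    List<Person> people = new ArrayList<Person>();
    while (lines.hasNext()) {
      people.add(mapper.readValue(lines.next(), Person.class));
    }
    return people;
  }
}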
Example #2
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: KMeansMP <input_file> <results>");
      System.exit(1);
    }
    String inputFile = args[0];
    String results_path = args[1];
    JavaPairRDD<Integer, Iterable<String>> results;
    int k = 4;
    int iterations = 100;
    int runs = 1;
    long seed = 0;
    final KMeansModel model;

    SparkConf sparkConf = new SparkConf().setAppName("KMeans MP");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<String> lines = sc.textFile(inputFile);

    JavaRDD<Vector> points = lines.map(new ParsePoint());
    JavaRDD<String> titles = lines.map(new ParseTitle());

    model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), 0);

    results = titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey();

    results.saveAsTextFile(results_path);

    sc.stop();
  }
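Example #2 assumes ParsePoint, ParseTitle, and ClusterCars helpers that are not included. The sketch below is a hedged guess under the assumption that each input line is a car title followed by comma-separated numeric features; the format and names are illustrative, not the original code.

import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

import scala.Tuple2;

// Hypothetical ParseTitle: the first comma-separated field is the car title.
class ParseTitle implements Function<String, String> {
  @Override
  public String call(String line) {
    return line.split(",")[0];
  }
}

// Hypothetical ParsePoint: the remaining fields are numeric features.
class ParsePoint implements Function<String, Vector> {
  @Override
  public Vector call(String line) {
    String[] tokens = line.split(",");
    double[] values = new double[tokens.length - 1];
    for (int i = 1; i < tokens.length; i++) {
      values[i - 1] = Double.parseDouble(tokens[i]);
    }
    return Vectors.dense(values);
  }
}

// Hypothetical ClusterCars: key each title by the cluster its features fall in.
class ClusterCars implements PairFunction<Tuple2<String, Vector>, Integer, String> {
  private final KMeansModel model;

  ClusterCars(KMeansModel model) {
    this.model = model;
  }

  @Override
  public Tuple2<Integer, String> call(Tuple2<String, Vector> titleAndPoint) {
    return new Tuple2<>(model.predict(titleAndPoint._2()), titleAndPoint._1());
  }
}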
Example #3
  public static void main(String[] args) {
    String logFile;

    if (args.length != 0) logFile = args[0];
    else logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md";

    final SparkConf conf = new SparkConf().setAppName("Simple Application");

    final JavaSparkContext sc = new JavaSparkContext(conf);
    final JavaRDD<String> logData = sc.textFile(logFile).cache();

    final String[] check = getFilterSet();

    System.out.println("Start: " + new Date());
    for (int i = 0; i < check.length; i++) {
      final int post = i;
      long count =
          logData
              .filter(
                  new Function<String, Boolean>() {
                    public Boolean call(String s) {
                      return s.contains(check[post]);
                    }
                  })
              .count();

      System.out.println("Lines with " + check[i] + ": " + count);
    }
    System.out.println("End: " + new Date());

    sc.close();
  }
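Example #3 calls getFilterSet(), which is not shown; the loop only needs an array of substrings to count. A hypothetical stand-in:

// Hypothetical stand-in for getFilterSet(): any array of search terms works.
private static String[] getFilterSet() {
  return new String[] {"Spark", "Hadoop", "Python", "Scala", "Java"};
}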
Example #4
 @SuppressWarnings("serial")
 @Override
 public SortedCounts<String> execute(final JavaSparkContext spark) {
   final JavaRDD<String> textFile = spark.textFile(inputFile);
   final JavaRDD<String> words =
       textFile.flatMap(
           new FlatMapFunction<String, String>() {
             @Override
             public Iterable<String> call(final String rawJSON) throws TwitterException {
               final Status tweet = TwitterObjectFactory.createStatus(rawJSON);
               String text = tweet.getText();
               return Arrays.asList(text.split(" "));
             }
           });
   final JavaPairRDD<String, Integer> pairs =
       words.mapToPair(
           new PairFunction<String, String, Integer>() {
             @Override
             public Tuple2<String, Integer> call(final String s) {
               return new Tuple2<String, Integer>(s.toLowerCase(), 1);
             }
           });
   final JavaPairRDD<String, Integer> counts =
       pairs.reduceByKey(
           new Function2<Integer, Integer, Integer>() {
             @Override
             public Integer call(final Integer a, final Integer b) {
               return a + b;
             }
           });
   return SortedCounts.create(counts);
 }
  public static void main(String[] args) {
    // Create a java spark context
    SparkConf conf = new SparkConf().setAppName("Accumulators");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Create an accumulator to keep track of number of blank lines in callSigns.txt
    final Accumulator<Integer> blankLines = sc.accumulator(0);

    JavaRDD<String> input = sc.textFile("src/main/resources/callSigns.txt");

    JavaRDD<String> callSigns =
        input.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                if (s.equals("")) {
                  blankLines.add(1);
                }
                return Arrays.asList(s.split(" "));
              }
            });

    callSigns.saveAsTextFile("Chapter5-Output");
    System.out.println("Number of blank lines present in text file : " + blankLines);
  }
Example #6
  public static void main(String[] args) {
    String logFile = "YOUR_SPARK_HOME/README.md"; // Should be some file on your system
    SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> logData = sc.textFile(logFile).cache();

    long numAs =
        logData
            .filter(
                new Function<String, Boolean>() {
                  public Boolean call(String s) {
                    return s.contains("a");
                  }
                })
            .count();

    long numBs =
        logData
            .filter(
                new Function<String, Boolean>() {
                  public Boolean call(String s) {
                    return s.contains("b");
                  }
                })
            .count();

    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
  }
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    JavaRDD<String> dataSet =
        (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
        dataSet.mapToPair(
            new PairFunction<String, Tuple3<String, String, String>, Stats>() {
              @Override
              public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                return new Tuple2<Tuple3<String, String, String>, Stats>(
                    extractKey(s), extractStats(s));
              }
            });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
        extracted.reduceByKey(
            new Function2<Stats, Stats, Stats>() {
              @Override
              public Stats call(Stats stats, Stats stats2) {
                return stats.merge(stats2);
              }
            });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?, ?> t : output) {
      System.out.println(t._1() + "\t" + t._2());
    }
    jsc.stop();
  }
Example #8
  public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(javaSparkContext);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each line to a Java Bean.
    JavaRDD<Person> people =
        javaSparkContext
            .textFile("people.txt")
            .map(
                (line) -> {
                  String[] parts = line.split(",");
                  Person person = new Person();
                  person.setName(parts[0]);
                  person.setAge(Integer.parseInt(parts[1].trim()));
                  return person;
                });

    // Apply a schema to an RDD of Java Beans and register it as a table.
    DataFrame dataFrame = sqlContext.createDataFrame(people, Person.class);
    dataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // Get the underlying RDD from the DataFrame and collect the resulting names.
    List<String> teenagersName =
        teenagers.toJavaRDD().map((row) -> "Name : " + row.getString(0)).collect();

    teenagersName.forEach(
        (name) -> {
          System.out.println(name);
        });
  }
  public static void main(String[] args) {
    String master = args[0];
    String appName = args[1];
    String path = args[2];

    SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines =
        sc.textFile(path)
            .filter(
                new Function<String, Boolean>() {
                  @Override
                  public Boolean call(String s) throws Exception {
                    return !s.isEmpty() && !s.contains("Total");
                  }
                });

    JavaRDD<String> usOnly =
        lines.filter(
            new Function<String, Boolean>() {
              @Override
              public Boolean call(String s) throws Exception {
                return s.contains("United States");
              }
            });

    JavaPairRDD<String, Integer> yearAndMedals =
        usOnly.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) throws Exception {
                String[] fields = s.split(",");
                return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8]));
              }
            });

    JavaPairRDD<String, Integer> reduced =
        yearAndMedals.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer accumulator, Integer currentValue) throws Exception {
                return accumulator + currentValue;
              }
            });

    JavaPairRDD<String, Integer> result =
        reduced.filter(
            new Function<Tuple2<String, Integer>, Boolean>() {
              @Override
              public Boolean call(Tuple2<String, Integer> tuple) throws Exception {
                return tuple._2 < 200;
              }
            });

    System.out.println();
    System.out.println(result.collect());
  }
Example #10
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: Main <file>");
      System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaPairRDD<String, Double> dayArrivalDelayPair =
        lines.flatMapToPair(
            line -> {
              String[] splitLine = line.split(SPLIT_PATTERN);
              String key = splitLine.length == 0 ? "" : splitLine[0];
              Double value = splitLine.length < 2 ? 0.0 : Double.valueOf(splitLine[1]);
              return Arrays.asList(new Tuple2<>(key, value));
            });

    JavaPairRDD<String, AverageWrapper> dayAverageWrapper =
        dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1));

    JavaPairRDD<String, AverageWrapper> daysValueCount =
        dayAverageWrapper.reduceByKey(
            (aw1, aw2) ->
                new AverageWrapper(
                    aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount()));

    Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap();
    List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>();
    listResults.addAll(resultMap.entrySet());
    Collections.sort(
        listResults,
        (entry1, entry2) ->
            Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue()));

    for (Map.Entry<String, AverageWrapper> entry : listResults) {
      System.out.printf(
          "%s -> (%f, %d)\n",
          entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount());
    }

    //        JavaPairRDD<String, Double> resultRDD =
    //                daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() /
    // averageWrapper.getCount());
    //
    //        Map<String, Double> results = resultRDD.collectAsMap();

    //        List<Map.Entry<String, Double>> listResults = new ArrayList<>();
    //        listResults.addAll(results.entrySet());
    //        Collections.sort(listResults, (entry1, entry2) ->
    // entry1.getValue().compareTo(entry2.getValue()));
    //
    //        for (Map.Entry<String, Double> entry : listResults) {
    //            System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue());
    //        }
  }
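AverageWrapper is assumed by Example #10 but not shown. A minimal, hypothetical version that matches the constructor and the getValue()/getCount() calls above: a small serializable holder for a running sum and count, so averages can be computed after the reduceByKey.

import java.io.Serializable;

// Hypothetical AverageWrapper: running sum of values plus a record count.
class AverageWrapper implements Serializable {
  private final double value;
  private final int count;

  AverageWrapper(double value, int count) {
    this.value = value;
    this.count = count;
  }

  public double getValue() {
    return value;
  }

  public int getCount() {
    return count;
  }
}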
Example #11
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite");

    JavaRDD<String> lines = sc.textFile("log.txt");
    JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")));
    JavaPairRDD<String, Integer> counts =
        words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y);

    counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2()));
  }
  /** Load the data from the json file and return an RDD of Tweet */
  public JavaRDD<Tweet> loadData() {
    // create spark configuration and spark context
    SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("local[*]");

    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(line -> Parse.parseJsonToTweet(line));

    return tweets;
  }
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: NaiveBayesExample <training_data> <test_data>");
      System.exit(1);
    }
    String training_data_path = args[0];
    // https://class.coursera.org/cloudapplications-001/forum/thread?thread_id=1387
    // String test_data_path = args[0];
    String test_data_path = args[1];

    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<LabeledPoint> train = sc.textFile(training_data_path).map(new DataToPoint());
    // JavaRDD<LabeledPoint> test = sc.textFile(training_data_path).map(new DataToPoint());
    JavaRDD<LabeledPoint> test = sc.textFile(test_data_path).map(new DataToPoint());

    final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0);

    JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(
            new PairFunction<LabeledPoint, Double, Double>() {
              public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
              }
            });

    double accuracy =
        predictionAndLabel
                .filter(
                    new Function<Tuple2<Double, Double>, Boolean>() {
                      public Boolean call(Tuple2<Double, Double> pl) {
                        return pl._1().equals(pl._2());
                      }
                    })
                .count()
            / (double) test.count();

    System.out.println(accuracy);

    sc.stop();
  }
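The Naive Bayes example assumes a DataToPoint function that is not shown. A hedged sketch that parses a "label,feature1,feature2,..." line into an MLlib LabeledPoint; the input layout is an assumption.

import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;

// Hypothetical DataToPoint: first field is the label, the rest are features.
class DataToPoint implements Function<String, LabeledPoint> {
  @Override
  public LabeledPoint call(String line) {
    String[] parts = line.split(",");
    double[] features = new double[parts.length - 1];
    for (int i = 1; i < parts.length; i++) {
      features[i - 1] = Double.parseDouble(parts[i]);
    }
    return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(features));
  }
}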
  public static void main(String[] args) {

    // Handle invalid arguments..
    if (args.length < 2) {
      System.out.println("Usage: ConvexHull arg1 arg2");
      System.out.println("arg1: input dataset A file path [points]");
      System.out.println("arg2: output file name and path");
      System.exit(1);
    }

    // Creating and setting sparkconf
    SparkConf sparkConf = new SparkConf().setAppName("Group3-edu.asu.cse512.ConvexHull");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Adding external jars
    // sc.addJar("lib/jts-1.13.jar");

    JavaRDD<String> lines = sc.textFile(args[0]);
    // Using mapPartitions function to find convex hull points in distributed environment
    JavaRDD<Coordinate> hullPointsRDD = lines.mapPartitions(new ConvexH());
    List<Coordinate> hullPointsList = hullPointsRDD.collect();
    Coordinate[] inputArray = new Coordinate[hullPointsList.size()];
    int j = 0;
    for (Coordinate c : hullPointsList) {
      inputArray[j] = c;
      j++;
    }
    // Finding convex hull points on the final subset of points retrieved from distributed
    // environment
    GeometryFactory geoFactory1 = new GeometryFactory();
    MultiPoint mPoint1 = geoFactory1.createMultiPoint(inputArray);
    Geometry geo1 = mPoint1.convexHull();
    Coordinate[] convexHullResult = geo1.getCoordinates();
    int length = convexHullResult.length;
    Coordinate[] convexHullFinalResult = Arrays.copyOf(convexHullResult, length - 1);
    Arrays.sort(convexHullFinalResult);

    // Converting the list of coordinates into Coordinate RDD
    JavaRDD<Coordinate> convexHullResultRDD =
        sc.parallelize(Arrays.asList(convexHullFinalResult), 1);
    JavaRDD<String> convexHullResultString =
        convexHullResultRDD
            .repartition(1)
            .map(
                new Function<Coordinate, String>() {
                  public String call(Coordinate hullPoint) throws Exception {

                    return hullPoint.x + "," + hullPoint.y;
                  }
                });
    // Save the String RDD into text file. Using repartition(1) to preserve the order of coordinates
    convexHullResultString.repartition(1).saveAsTextFile(args[1]);
  }
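The convex-hull driver above assumes a ConvexH mapPartitions function. A hypothetical sketch using the same JTS library (jts-1.13) as the driver code, with an assumed "x,y" input format and the Spark 1.x Iterable-returning signature: each partition parses its lines and returns only its local hull points, so the driver merges far fewer points.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.function.FlatMapFunction;

import com.vividsolutions.jts.geom.Coordinate;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.geom.GeometryFactory;
import com.vividsolutions.jts.geom.MultiPoint;

// Hypothetical ConvexH: per-partition convex hull over "x,y" lines.
class ConvexH implements FlatMapFunction<Iterator<String>, Coordinate> {
  @Override
  public Iterable<Coordinate> call(Iterator<String> lines) throws Exception {
    List<Coordinate> points = new ArrayList<>();
    while (lines.hasNext()) {
      String line = lines.next().trim();
      if (line.isEmpty()) {
        continue;
      }
      String[] xy = line.split(",");
      points.add(new Coordinate(Double.parseDouble(xy[0]), Double.parseDouble(xy[1])));
    }
    GeometryFactory factory = new GeometryFactory();
    MultiPoint multiPoint = factory.createMultiPoint(points.toArray(new Coordinate[0]));
    Geometry hull = multiPoint.convexHull();
    return Arrays.asList(hull.getCoordinates());
  }
}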
Example #15
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "test-java-client");
    //        JavaSparkContext sc = new JavaSparkContext("spark://192.168.181.23:7077",
    // "test-java-client");
    JavaRDD<String> textFile =
        sc.textFile("/home/congsl/apps/storm/dockerfile-repository/nginx/Dockerfile");
    Map<String, Integer> result =
        textFile
            .flatMap(
                new FlatMapFunction<String, Object>() {
                  @Override
                  public Iterable<Object> call(String s) throws Exception {
                    System.out.println(s);
                    return Arrays.asList(s.split(" "));
                  }
                })
            .map(
                new Function<Object, Map<String, Integer>>() {
                  @Override
                  public Map<String, Integer> call(Object v1) throws Exception {
                    System.out.println(v1);
                    Map<String, Integer> map = new HashMap<String, Integer>();
                    map.put(v1.toString(), 1);
                    return map;
                  }
                })
            .reduce(
                new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() {
                  @Override
                  public Map<String, Integer> call(Map<String, Integer> v1, Map<String, Integer> v2)
                      throws Exception {
                    System.out.println("v1:" + v1);
                    System.out.println("v2:" + v2);
                    for (String key : v2.keySet()) {
                      if (v1.get(key) == null) {
                        v1.put(key, v2.get(key));
                      } else {
                        v1.put(key, v1.get(key) + v2.get(key));
                      }
                    }
                    return v1;
                  }
                });

    System.out.println(result);
    System.out.println(textFile.count());
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame df =
        sqlContext.jdbc(
            "jdbc:mysql://localhost:3306/activiti?user=root&password=admin", "ACT_ID_INFO");
    df.show();
    //        JavaSQLContext sqlContext = new JavaSQLContext(sparkContext);
  }
Example #16
 public static void main(String[] args) {
   String file = "";
   SparkConf conf = new SparkConf().setAppName("app");
   JavaSparkContext sc = new JavaSparkContext(conf);
   JavaRDD<String> lines = sc.textFile(file);
   JavaRDD<String> errors =
       lines.filter(
           new Function<String, Boolean>() {
             public Boolean call(String x) {
               return x.contains("error");
             }
           });
 }
  /**
   * This method will read a file into a JavaRDD.
   *
   * @param fileLocation path of the file to read
   * @param headerRowSkippingCriteria substring identifying header rows to skip; may be null
   * @return the file contents as a JavaRDD of lines
   */
  @SuppressWarnings("serial")
  private static JavaRDD<String> readData(
      String fileLocation, final String headerRowSkippingCriteria) {

    JavaRDD<String> lines = null;
    if (headerRowSkippingCriteria == null) {
      lines = sc.textFile(fileLocation);
    } else {
      lines =
          sc.textFile(fileLocation)
              .filter(
                  new Function<String, Boolean>() {
                    public Boolean call(String line) {
                      if (line.contains(headerRowSkippingCriteria)) {
                        System.out.println(line);
                        return false;
                      }
                      return true;
                    }
                  });
    }
    return lines;
  }
Example #18
  public static JavaRDD<String> getJavaRDD(JavaSparkContext sparkContext) {

    System.out.println("Converting" + sparkContext.version() + sparkContext.appName());

    JavaRDD<String> testJRDD = null;
    try {
      testJRDD =
          sparkContext.textFile(
              "/Users/shawnkyzer/Documents/aleph2_analytic_services_R/hs_err_pid2930.log");
    } catch (Exception e) {
      System.out.println(e.fillInStackTrace());
    }
    System.out.println("Converting");
    return testJRDD;
  }
Example #19
  public static void main(String[] args) {

    JavaSparkContext sc = new JavaSparkContext();
    Configuration conf = sc.hadoopConfiguration();
    conf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem");
    conf.set("fs.swift.service.test.auth.url", "http://163.17.136.246:5000/v2.0/tokens");
    conf.set("fs.swift.service.test.auth.endpoint.prefix", "endpoints");
    conf.set("fs.swift.service.test.http.port", "8080");
    conf.set("fs.swift.service.test.region", "RegionOne");
    conf.set("fs.swift.service.test.public", "true");
    conf.set("fs.swift.service.test.tenant", "big-data");
    conf.set("fs.swift.service.test.username", "k753357");
    conf.set("fs.swift.service.test.password", "k753357");
    JavaRDD<String> rawRDD = sc.textFile(args[0]);
    rawRDD.saveAsTextFile("swift://testfile.test/file/");
  }
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: JavaWordCount <master> <file>");
      System.exit(1);
    }

    JavaSparkContext ctx =
        new JavaSparkContext(
            args[0],
            "JavaWordCount",
            System.getenv("SPARK_HOME"),
            JavaSparkContext.jarOfClass(JavaWordCount.class));
    JavaRDD<String> lines = ctx.textFile(args[1], 1);

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) {
                return Arrays.asList(SPACE.split(s));
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1 + ": " + tuple._2);
    }
    System.exit(0);
  }
Example #21
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      System.err.println("Usage: JavaWordCount <input_file> <output_file>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String s) {
                return Arrays.asList(SPACE.split(s)).iterator();
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    /*
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?,?> tuple : output) {
      System.out.println(tuple._1() + ": " + tuple._2());
    }
    */
    counts.saveAsTextFile(args[1]);
    ctx.stop();
  }
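Both word-count mains above reference a SPACE constant declared at class level; in Spark's bundled JavaWordCount it is a precompiled regex, along the lines of:

import java.util.regex.Pattern;

// Shared split pattern used by the flatMap step in the word-count examples.
private static final Pattern SPACE = Pattern.compile(" ");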
Example #22
 private static RDD<Tuple2<Integer, double[]>> readFeaturesRDD(
     JavaSparkContext sparkContext, Path path) {
   log.info("Loading features RDD from {}", path);
   JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
   return featureLines
       .map(
           new Function<String, Tuple2<Integer, double[]>>() {
             @Override
             public Tuple2<Integer, double[]> call(String line) throws IOException {
               List<?> update = MAPPER.readValue(line, List.class);
               Integer key = Integer.valueOf(update.get(0).toString());
               double[] vector = MAPPER.convertValue(update.get(1), double[].class);
               return new Tuple2<>(key, vector);
             }
           })
       .rdd();
 }
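readFeaturesRDD assumes a shared Jackson ObjectMapper named MAPPER and an SLF4J log field declared elsewhere in the class. A minimal sketch of the assumed declarations; the enclosing class name here is illustrative only.

import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical fields assumed by readFeaturesRDD above.
private static final Logger log = LoggerFactory.getLogger(FeatureLoading.class);
private static final ObjectMapper MAPPER = new ObjectMapper();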
  public static void main(String[] args) throws Exception {

    Schema schema =
        new Schema.Builder()
            .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
            .addColumnInteger("Species")
            .build();

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DataVec Example");

    JavaSparkContext sc = new JavaSparkContext(conf);

    String directory =
        new ClassPathResource("IrisData/iris.txt")
            .getFile()
            .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..."
    JavaRDD<String> stringData = sc.textFile(directory);

    // We first need to parse this comma-delimited (CSV) format; we can do this using
    // CSVRecordReader:
    RecordReader rr = new CSVRecordReader();
    JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));

    int maxHistogramBuckets = 10;
    DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets);

    System.out.println(dataAnalysis);

    // We can get statistics on a per-column basis:
    DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length");
    double minValue = da.getMin();
    double maxValue = da.getMax();
    double mean = da.getMean();

    HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html"));

    // To write to HDFS instead:
    // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
    // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc);
  }
Example #24
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("My App");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt");

    @SuppressWarnings("serial")
    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) {
                return Arrays.asList(s.split(" "));
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    @SuppressWarnings("serial")
    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    List<Tuple2<String, Integer>> output = counts.collect();

    for (Tuple2<?, ?> tuple : output) {
      System.out.println(tuple._1() + "-> " + tuple._2());
    }
    sc.close();
  }
Example #25
  public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")));

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count
    // them
    JavaPairRDD<String, Integer> counts =
        words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y);

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
  }
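A hypothetical entry point for wordCountJava8, reading the input path from the command line (the usage string and class name are assumptions):

public static void main(String[] args) {
  if (args.length < 1) {
    System.err.println("Usage: WordCount <input_file>");
    System.exit(1);
  }
  wordCountJava8(args[0]);
}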
  private DataFrame artistsAsDataFrame() {
    String input = TestUtils.sampleArtistsDat();
    JavaRDD<String> data = sc.textFile(input);

    StructType schema =
        DataTypes.createStructType(
            new StructField[] {
              DataTypes.createStructField("id", DataTypes.IntegerType, false),
              DataTypes.createStructField("name", DataTypes.StringType, false),
              DataTypes.createStructField("url", DataTypes.StringType, true),
              DataTypes.createStructField("pictures", DataTypes.StringType, true),
              DataTypes.createStructField("time", DataTypes.TimestampType, true)
            });

    JavaRDD<Row> rowData =
        data.map(
                new Function<String, String[]>() {
                  @Override
                  public String[] call(String line) throws Exception {
                    return line.split("\t");
                  }
                })
            .map(
                new Function<String[], Row>() {
                  @Override
                  public Row call(String[] r) throws Exception {
                    return RowFactory.create(
                        Integer.parseInt(r[0]),
                        r[1],
                        r[2],
                        r[3],
                        new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
                  }
                });

    return sqc.createDataFrame(rowData, schema);
  }
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]");
    conf.set("es.index.auto.create", "true");
    JavaSparkContext context = new JavaSparkContext(conf);

    JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv");

    JavaRDD<Crime> dataSplits =
        textFile.map(
            line -> {
              CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180);
              Crime c = new Crime();
              CSVRecord record = parser.getRecords().get(0);
              c.setId(record.get(0));
              c.setCaseNumber(record.get(1));
              c.setEventDate(record.get(2));
              c.setBlock(record.get(3));
              c.setIucr(record.get(4));
              c.setPrimaryType(record.get(5));
              c.setDescription(record.get(6));
              c.setLocation(record.get(7));
              c.setArrest(Boolean.parseBoolean(record.get(8)));
              c.setDomestic(Boolean.parseBoolean(record.get(9)));
              String lat = record.get(10);
              String lon = record.get(11);
              Map<String, Double> geoLocation = new HashMap<>();
              geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat));
              geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon));
              c.setGeoLocation(geoLocation);
              return c;
            });

    SQLContext sqlContext = new SQLContext(context);
    DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class);

    JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection");
  }
  public static void main(String[] args) throws FileNotFoundException {

    if (args.length < 2) {
      System.out.println(
          "We require input file path, output file path and number of partitions argument to proceed further.");
      System.out.println(
          "Usage: java FarthestPair <input file path> <output file path> <noOfPartitions>");
      System.exit(0);
    }

    String inputFile = args[0];

    SparkConf conf = new SparkConf().setAppName("Group6-FarthestPair");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read file as RDD
    JavaRDD<String> inputData = sc.textFile(inputFile);
    // JavaRDD<Coordinate> coordinates = inputData.mapPartitions(parseData);

    // Map each String in the file as a coordinate object
    JavaRDD<Coordinate> coordinates = inputData.map(parseData); // .repartition(noOfPartitions);

    // Map to a tuple to sort the Points
    JavaPairRDD<Coordinate, Boolean> pointTupleRDD =
        coordinates.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedPointTupleRDD =
        pointTupleRDD.sortByKey(new CoordinateComparator());

    // Map to points RDD
    JavaRDD<Coordinate> finalSortedPointRDD =
        sortedPointTupleRDD.map(new TupleToCoordinateMapFunction());

    // Convert sorted collection to RDD

    // Perform Convex hull operation on individual partition
    JavaRDD<Coordinate> localHull = finalSortedPointRDD.mapPartitions(new hull());

    // Repartition to 1 partition in order to apply 'convex hull' on all the Coordinate objects
    // obtained from individual partitions
    JavaRDD<Coordinate> calculatedHull = localHull.coalesce(1).cache();

    // Perform Convex hull operation
    JavaRDD<Coordinate> globalHull = calculatedHull.mapPartitions(new hull()).distinct();

    JavaPairRDD<Coordinate, Coordinate> allCoordinateTuples = globalHull.cartesian(globalHull);
    System.out.println("Total cart: " + allCoordinateTuples.collect().size());

    JavaRDD<Pair> pairsRDD =
        allCoordinateTuples.map(
            new Function<Tuple2<Coordinate, Coordinate>, Pair>() {

              public Pair call(Tuple2<Coordinate, Coordinate> tuple) throws Exception {
                Coordinate pointA = tuple._1();
                Coordinate pointB = tuple._2();
                Pair a = new Pair(pointA, pointB);
                return a;
              }
            });

    JavaRDD<Pair> pairs =
        allCoordinateTuples.mapPartitions(
            new FlatMapFunction<Iterator<Tuple2<Coordinate, Coordinate>>, Pair>() {

              /** */
              private static final long serialVersionUID = 1L;

              public Iterable<Pair> call(Iterator<Tuple2<Coordinate, Coordinate>> tuples)
                  throws Exception {
                List<Pair> pairsFromTuples = new ArrayList<Pair>();
                // Pair singlePair = new Pair();
                Tuple2<Coordinate, Coordinate> tuple;
                while (tuples.hasNext()) {
                  tuple = tuples.next();

                  // singlePair.A = tuples.next()._1;
                  // singlePair.B = tuples.next()._2;
                  Pair singlePair = new Pair(tuple._1(), tuple._2());
                  pairsFromTuples.add(singlePair);
                }
                return pairsFromTuples;
              }
            });

    JavaRDD<Integer> x =
        pairsRDD.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {

              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> x = new ArrayList<Integer>();
                x.add(1);
                return x;
              }
            });

    System.out.println("Num of partitions: " + x.collect());

    JavaRDD<Integer> y =
        pairs.mapPartitions(
            new FlatMapFunction<Iterator<Pair>, Integer>() {

              public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception {
                ArrayList<Integer> x = new ArrayList<Integer>();
                x.add(1);
                return x;
              }
            });

    System.out.println("Num of partitions charan: " + y.collect());

    Pair farthestPair =
        pairs.reduce(
            new Function2<Pair, Pair, Pair>() {
              private static final long serialVersionUID = 1L;

              public Pair call(Pair a, Pair b) throws Exception {
                // Keep whichever pair spans the greater distance.
                return (a.distanceLength > b.distanceLength ? a : b);
              }
            });

    // System.out.println(farthestPair);

    Coordinate farthestPointA = farthestPair.A;
    Coordinate farthestPointB = farthestPair.B;

    List<Coordinate> farthestPoints = new ArrayList<Coordinate>();
    farthestPoints.add(farthestPointA);
    farthestPoints.add(farthestPointB);

    JavaRDD<Coordinate> farthestRDD = sc.parallelize(farthestPoints);

    // Map to a tuple to sort the Points
    JavaPairRDD<Coordinate, Boolean> coordinateTupleRDD =
        farthestRDD.mapToPair(new CoordinatePairFunction());

    // Sort the points
    JavaPairRDD<Coordinate, Boolean> sortedCoordinateTupleRDD =
        coordinateTupleRDD.sortByKey(new CoordinateComparator());

    // Map to points RDD
    JavaRDD<Coordinate> finalSortedCoordinateRDD =
        sortedCoordinateTupleRDD.map(new TupleToCoordinateMapFunction());

    JavaRDD<String> outputData = finalSortedCoordinateRDD.map(parseOutputData);
    // farthestRDD.saveAsTextFile(outputfilepath);
    outputData.saveAsTextFile(args[1]);

    // Output your result; you need to sort your result!
    // Don't add an additional clean-up step that deletes the newly generated file.
    sc.close();
  }
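The FarthestPair job above assumes a Pair class with public A/B coordinates and a precomputed distanceLength, plus parseData/parseOutputData functions that are not shown. A hedged sketch of Pair matching the field accesses above:

import java.io.Serializable;

import com.vividsolutions.jts.geom.Coordinate;

// Hypothetical Pair: two coordinates and the distance between them, which the
// reduce step compares via distanceLength.
class Pair implements Serializable {
  public Coordinate A;
  public Coordinate B;
  public double distanceLength;

  public Pair(Coordinate a, Coordinate b) {
    this.A = a;
    this.B = b;
    this.distanceLength = a.distance(b);
  }
}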
 @Override
 public JavaRDD<String> decompress(String inputFile) throws IOException {
   return sparkContext.textFile(String.format("%s/*", inputFile));
 }
Example #30
  public static void main(String[] args) {
    String fileUri = "mnist_pca_lda_train.csv";
    String testFileUri = "mnist_pca_lda_test.csv";
    SparkConf conf = new SparkConf().setAppName("Logit Sample").set("spark.executor.memory", "4g");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> rawTrainData = sc.textFile(fileUri).cache();

    JavaRDD<String> rawTestData = sc.textFile(testFileUri).cache();

    /**
     * *********** Training portion **************************
     * 1. Parse the lines into LabeledPoint<label, features>.
     * 2. Loop over all 45 pairwise combinations of labels.
     * 3. Filter the RDD for the given pair of labels.
     * 4. Transform the entries into 0 and 1.
     * 5. Run the logit model for every filtered RDD.
     */
    long startTime = System.currentTimeMillis();

    /** Creating LabeledPoints from the input. */
    JavaRDD<LabeledPoint> labledPointsRDD =
        rawTrainData.map(
            new Function<String, LabeledPoint>() {

              public LabeledPoint call(String line) throws Exception {
                return formLabeledPoint(line);
              }

              private LabeledPoint formLabeledPoint(String line) {
                String[] tokens = line.split(",");
                double[] result = new double[tokens.length - 19];
                for (int i = 1; i < tokens.length - 18; i++) {
                  result[i - 1] = Double.valueOf(tokens[i]);
                }
                double label = Double.valueOf(tokens[0]);
                LabeledPoint resultPoint = new LabeledPoint(label, new DenseVector(result));

                return resultPoint;
              }
            });

    LogisticRegressionModel[] model = new LogisticRegressionModel[45];

    System.out.println(" Training iteration starts...");

    int k = 0;
    for (double i = 0; i <= 9; i++) {
      for (double j = 1; j <= 9; j++) {
        if (j > i) {
          System.out.printf("Training for:%f\t%f....Starting..\n", i, j);
          final double label1 = i;
          final double label2 = j;
          System.out.printf("Train: label1:%f\tlabel2:%f\n", label1, label2);
          /** Filtering for labels i and j */
          JavaRDD<LabeledPoint> filteredRDD =
              labledPointsRDD.filter(
                  new Function<LabeledPoint, Boolean>() {

                    public Boolean call(LabeledPoint point) throws Exception {
                      boolean result = false;
                      if (point.label() == label1 || point.label() == label2) result = true;
                      return result;
                    }
                  });

          printKeyValStat(filteredRDD);

          /** Mapping input into binary */
          JavaRDD<LabeledPoint> transformedRDD =
              filteredRDD.map(
                  new Function<LabeledPoint, LabeledPoint>() {

                    public LabeledPoint call(LabeledPoint point) throws Exception {
                      double newLabel = point.label() == label1 ? 0 : 1;
                      LabeledPoint result = new LabeledPoint(newLabel, point.features());

                      return result;
                    }
                  });

          printKeyValStat(transformedRDD);

          model[k] = new LogisticRegressionWithLBFGS().setNumClasses(2).run(transformedRDD.rdd());
          k++;
        }
      }
    }

    System.out.println(" Training iteration finished...");

    long endTime = System.currentTimeMillis();

    System.out.println(" Model training time: " + (endTime - startTime));

    /** ************ End of training portion **************** */
    startTime = System.currentTimeMillis();

    /**
     * *********** Scoring/Testing portion *******************
     * 1. Parse the lines into LabeledPoint<label, features>.
     * 2. Loop over all 45 pairwise combinations of labels.
     * 3. Filter the RDD for the given pair of labels.
     * 4. Transform the entries into 0 and 1.
     * 5. Run the prediction and record predictions and labels.
     * 6. Print the accuracy metrics.
     */

    /** Creating LabeledPoints from the input. */
    JavaRDD<LabeledPoint> labledTestPointsRDD =
        rawTestData.map(
            new Function<String, LabeledPoint>() {

              public LabeledPoint call(String line) throws Exception {
                return formLabeledPoint(line);
              }

              private LabeledPoint formLabeledPoint(String line) {
                String[] tokens = line.split(",");
                double[] result = new double[tokens.length - 19];
                for (int i = 1; i < tokens.length - 18; i++) {
                  result[i - 1] = Double.valueOf(tokens[i]);
                }
                double label = Double.valueOf(tokens[0]);
                LabeledPoint resultPoint = new LabeledPoint(label, new DenseVector(result));

                return resultPoint;
              }
            });

    k = 0;
    double sumAUC = 0;
    for (double i = 0; i <= 9; i++) {
      for (double j = 1; j <= 9; j++) {
        if (j > i) {
          final double label1 = i;
          final double label2 = j;
          System.out.printf("Testing for:%f\t%f....Starting..\n", i, j);
          /** Filtering for labels i and j */
          JavaRDD<LabeledPoint> filteredRDD =
              labledTestPointsRDD.filter(
                  new Function<LabeledPoint, Boolean>() {

                    public Boolean call(LabeledPoint point) throws Exception {
                      boolean result = false;
                      if (point.label() == label1 || point.label() == label2) result = true;
                      return result;
                    }
                  });

          printKeyValStat(filteredRDD);

          /** Mapping input into binary */
          JavaRDD<LabeledPoint> transformedRDD =
              filteredRDD.map(
                  new Function<LabeledPoint, LabeledPoint>() {

                    public LabeledPoint call(LabeledPoint point) throws Exception {
                      double newLabel = point.label() == label1 ? 0 : 1;
                      LabeledPoint result = new LabeledPoint(newLabel, point.features());

                      return result;
                    }
                  });

          printKeyValStat(transformedRDD);

          final LogisticRegressionModel currentModel = model[k];
          k++;

          JavaRDD<Tuple2<Object, Object>> predictionAndLabels =
              transformedRDD.map(
                  new Function<LabeledPoint, Tuple2<Object, Object>>() {

                    private static final long serialVersionUID = 1L;

                    public Tuple2<Object, Object> call(LabeledPoint inputPoint) throws Exception {
                      Double prediction = currentModel.predict(inputPoint.features());
                      return new Tuple2<Object, Object>(prediction, inputPoint.label());
                    }
                  });

          predictionAndLabels.saveAsTextFile("predictions/predictions_" + label1 + "_vs_" + label2);

          /*MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
          double precision = metrics.precision();*/

          BinaryClassificationMetrics metrics =
              new BinaryClassificationMetrics(predictionAndLabels.rdd());
          double auROC = metrics.areaUnderROC();

          sumAUC += auROC;

          System.out.printf("AUC for %f, %f is %f \n", i, j, auROC);
        }
      }
    }

    System.out.printf("Average AUC value :: %f \n", (sumAUC / 45));

    endTime = System.currentTimeMillis();

    System.out.println(" Model scoring time.. :" + (endTime - startTime));

    /** ***************** End of scoring/testing portion *************** */
  }
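Example #30 calls printKeyValStat, which is referenced but not shown. A hypothetical version that logs how many points each label contributes to the current (filtered or binarized) RDD; it keeps the anonymous-class style of the surrounding example.

import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.regression.LabeledPoint;

// Hypothetical printKeyValStat: count points per label in the given RDD.
private static void printKeyValStat(JavaRDD<LabeledPoint> points) {
  Map<Double, Long> countsByLabel =
      points
          .map(
              new Function<LabeledPoint, Double>() {
                public Double call(LabeledPoint point) {
                  return point.label();
                }
              })
          .countByValue();
  System.out.println("Points per label: " + countsByLabel);
}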