public static void main(String args[]) { SparkConf conf = new SparkConf().setAppName("KeyValueTest").setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(conf); JavaRDD<String> lines = jsc.textFile("/home/piyushm/samplejson.json"); List<Person> persons = lines.mapPartitions(new ParseJson()).collect(); JavaRDD<Person> personJavaRDD = jsc.parallelize(persons); JavaRDD<String> csvFileContent = jsc.textFile("/opt/sample.csv"); System.out.println(csvFileContent.map(new ParseLine()).collect()); System.out.println(persons); System.out.println(personJavaRDD.mapPartitions(new WriteJson()).collect()); jsc.stop(); }
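// The ParseJson, ParseLine and WriteJson helpers used above are not shown; a minimal sketch of
// what they might look like, assuming Spark 1.x's Iterable-based function interfaces, a plain
// serializable Person bean, and Jackson on the classpath (class names and field handling here are
// illustrative, not the original implementation). Assumed imports: java.util.*,
// org.apache.spark.api.java.function.*, com.fasterxml.jackson.databind.ObjectMapper.
class ParseJson implements FlatMapFunction<Iterator<String>, Person> {
  public Iterable<Person> call(Iterator<String> lines) throws Exception {
    ObjectMapper mapper = new ObjectMapper(); // one mapper per partition, reused across lines
    List<Person> people = new ArrayList<Person>();
    while (lines.hasNext()) {
      people.add(mapper.readValue(lines.next(), Person.class));
    }
    return people;
  }
}
class WriteJson implements FlatMapFunction<Iterator<Person>, String> {
  public Iterable<String> call(Iterator<Person> people) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    List<String> text = new ArrayList<String>();
    while (people.hasNext()) {
      text.add(mapper.writeValueAsString(people.next()));
    }
    return text;
  }
}
class ParseLine implements Function<String, String[]> {
  public String[] call(String line) {
    return line.split(","); // naive split; a real CSV parser would handle quoting
  }
}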
public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: KMeansMP <input_file> <results>"); System.exit(1); } String inputFile = args[0]; String results_path = args[1]; JavaPairRDD<Integer, Iterable<String>> results; int k = 4; int iterations = 100; int runs = 1; long seed = 0; final KMeansModel model; SparkConf sparkConf = new SparkConf().setAppName("KMeans MP"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<String> lines = sc.textFile(inputFile); JavaRDD<Vector> points = lines.map(new ParsePoint()); JavaRDD<String> titles = lines.map(new ParseTitle()); model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), 0); results = titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey(); results.saveAsTextFile(results_path); sc.stop(); }
public static void main(String[] args) { String logFile; if (args.length != 0) logFile = args[0]; else logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md"; final SparkConf conf = new SparkConf().setAppName("Simple Application"); final JavaSparkContext sc = new JavaSparkContext(conf); final JavaRDD<String> logData = sc.textFile(logFile).cache(); final String[] check = getFilterSet(); System.out.println("Start: " + new Date()); for (int i = 0; i < check.length; i++) { final int post = i; long count = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains(check[post]); } }) .count(); System.out.println("Lines with " + check[i] + ": " + count); } System.out.println("End: " + new Date()); sc.close(); }
@SuppressWarnings("serial") @Override public SortedCounts<String> execute(final JavaSparkContext spark) { final JavaRDD<String> textFile = spark.textFile(inputFile); final JavaRDD<String> words = textFile.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(final String rawJSON) throws TwitterException { final Status tweet = TwitterObjectFactory.createStatus(rawJSON); String text = tweet.getText(); return Arrays.asList(text.split(" ")); } }); final JavaPairRDD<String, Integer> pairs = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(final String s) { return new Tuple2<String, Integer>(s.toLowerCase(), 1); } }); final JavaPairRDD<String, Integer> counts = pairs.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(final Integer a, final Integer b) { return a + b; } }); return SortedCounts.create(counts); }
public static void main(String[] args) { // Create a java spark context SparkConf conf = new SparkConf().setAppName("Accumulators"); JavaSparkContext sc = new JavaSparkContext(conf); // Create an accumulator to keep track of number of blank lines in callSigns.txt final Accumulator<Integer> blankLines = sc.accumulator(0); JavaRDD<String> input = sc.textFile("src/main/resources/callSigns.txt"); JavaRDD<String> callSigns = input.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) throws Exception { if (s.equals("")) { blankLines.add(1); } return Arrays.asList(s.split(" ")); } }); callSigns.saveAsTextFile("Chapter5-Output"); System.out.println("Number of blank lines present in text file : " + blankLines); }
public static void main(String[] args) { String logFile = "YOUR_SPARK_HOME/README.md"; // Should be some file on your system SparkConf conf = new SparkConf().setAppName("Simple Application"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> logData = sc.textFile(logFile).cache(); long numAs = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("a"); } }) .count(); long numBs = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("b"); } }) .count(); System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs); }
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs); JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair( new PairFunction<String, Tuple3<String, String, String>, Stats>() { @Override public Tuple2<Tuple3<String, String, String>, Stats> call(String s) { return new Tuple2<Tuple3<String, String, String>, Stats>( extractKey(s), extractStats(s)); } }); JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey( new Function2<Stats, Stats, Stats>() { @Override public Stats call(Stats stats, Stats stats2) { return stats.merge(stats2); } }); List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect(); for (Tuple2<?, ?> t : output) { System.out.println(t._1() + "\t" + t._2()); } jsc.stop(); }
public static void main(String[] args) throws Exception { SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL"); JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); SQLContext sqlContext = new SQLContext(javaSparkContext); System.out.println("=== Data source: RDD ==="); // Load a text file and convert each line to a Java Bean. JavaRDD<Person> people = javaSparkContext .textFile("people.txt") .map( (line) -> { String[] parts = line.split(","); Person person = new Person(); person.setName(parts[0]); person.setAge(parts[1]); return person; }); // Apply a schema to an RDD of Java Beans and register it as a table. DataFrame dataFrame = sqlContext.createDataFrame(people, Person.class); dataFrame.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // 通过DataFrame 获取对应的RDD collection 获取对应的结果内容 List<String> teenagersName = teenagers.toJavaRDD().map((row) -> "Name : " + row.getString(0)).collect(); teenagersName.forEach( (name) -> { System.out.println(name); }); }
public static void main(String[] args) { String master = args[0]; String appName = args[1]; String path = args[2]; SparkConf conf = new SparkConf().setAppName(appName).setMaster(master); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(path) .filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return !s.isEmpty() && !s.contains("Total"); } }); JavaRDD<String> usOnly = lines.filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return s.contains("United States"); } }); JavaPairRDD<String, Integer> yearAndMedals = usOnly.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) throws Exception { String[] fields = s.split(","); return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8])); } }); JavaPairRDD<String, Integer> reduced = yearAndMedals.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer accumulator, Integer currentValue) throws Exception { return accumulator + currentValue; } }); JavaPairRDD<String, Integer> result = reduced.filter( new Function<Tuple2<String, Integer>, Boolean>() { @Override public Boolean call(Tuple2<String, Integer> tuple) throws Exception { return tuple._2 < 200; } }); System.out.println(); System.out.println(result.collect()); }
public static void main(String[] args) { if (args.length == 0) { System.err.println("Usage: Main <file>"); System.exit(1); } SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(args[0]); JavaPairRDD<String, Double> dayArrivalDelayPair = lines.flatMapToPair( line -> { String[] splitLine = line.split(SPLIT_PATTERN); String key = splitLine.length == 0 ? "" : splitLine[0]; Double value = splitLine.length < 2 ? 0.0 : Double.valueOf(splitLine[1]); return Arrays.asList(new Tuple2<>(key, value)); }); JavaPairRDD<String, AverageWrapper> dayAverageWrapper = dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1)); JavaPairRDD<String, AverageWrapper> daysValueCount = dayAverageWrapper.reduceByKey( (aw1, aw2) -> new AverageWrapper( aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount())); Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap(); List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>(); listResults.addAll(resultMap.entrySet()); Collections.sort( listResults, (entry1, entry2) -> Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue())); for (Map.Entry<String, AverageWrapper> entry : listResults) { System.out.printf( "%s -> (%f, %d)\n", entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount()); } // JavaPairRDD<String, Double> resultRDD = // daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() / // averageWrapper.getCount()); // // Map<String, Double> results = resultRDD.collectAsMap(); // List<Map.Entry<String, Double>> listResults = new ArrayList<>(); // listResults.addAll(results.entrySet()); // Collections.sort(listResults, (entry1, entry2) -> // entry1.getValue().compareTo(entry2.getValue())); // // for (Map.Entry<String, Double> entry : listResults) { // System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue()); // } }
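// AverageWrapper is not defined in this snippet; a minimal sketch, assuming it is just a
// serializable (value, count) holder used to accumulate per-day delay sums and counts:
class AverageWrapper implements java.io.Serializable {
  private final double value; // running sum of arrival delays
  private final int count;    // number of observations contributing to the sum
  AverageWrapper(double value, int count) { this.value = value; this.count = count; }
  double getValue() { return value; }
  int getCount() { return count; }
}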
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite"); JavaRDD<String> lines = sc.textFile("log.txt"); JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" "))); JavaPairRDD<String, Integer> counts = words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y); counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2())); }
/** Load the data from the json file and return an RDD of Tweet */ public JavaRDD<Tweet> loadData() { // create spark configuration and spark context SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(line -> Parse.parseJsonToTweet(line)); return tweets; }
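// Parse.parseJsonToTweet and the Tweet bean are not shown; a minimal sketch, assuming one JSON
// tweet per line and Jackson on the classpath (the Tweet fields below are illustrative guesses).
// Assumed imports: java.io.IOException, com.fasterxml.jackson.databind.ObjectMapper,
// com.fasterxml.jackson.databind.DeserializationFeature.
class Tweet implements java.io.Serializable {
  public long id;
  public String user;
  public String text;
}
class Parse {
  private static final ObjectMapper MAPPER =
      new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  static Tweet parseJsonToTweet(String line) {
    try {
      return MAPPER.readValue(line, Tweet.class);
    } catch (IOException e) {
      throw new RuntimeException("Could not parse tweet: " + line, e);
    }
  }
}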
public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: NaiveBayesExample <training_data> <test_data>"); System.exit(1); } String training_data_path = args[0]; // https://class.coursera.org/cloudapplications-001/forum/thread?thread_id=1387 // String test_data_path = args[0]; String test_data_path = args[1]; SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesExample"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<LabeledPoint> train = sc.textFile(training_data_path).map(new DataToPoint()); // JavaRDD<LabeledPoint> test = sc.textFile(training_data_path).map(new DataToPoint()); JavaRDD<LabeledPoint> test = sc.textFile(test_data_path).map(new DataToPoint()); final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0); JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair( new PairFunction<LabeledPoint, Double, Double>() { public Tuple2<Double, Double> call(LabeledPoint p) { return new Tuple2<Double, Double>(model.predict(p.features()), p.label()); } }); double accuracy = predictionAndLabel .filter( new Function<Tuple2<Double, Double>, Boolean>() { public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); } }) .count() / (double) test.count(); System.out.println(accuracy); sc.stop(); }
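// DataToPoint is not shown; a minimal sketch, assuming each line is "label,feature1,feature2,..."
// (the delimiter and column order are assumptions). Assumed imports:
// org.apache.spark.mllib.regression.LabeledPoint, org.apache.spark.mllib.linalg.Vectors,
// org.apache.spark.api.java.function.Function.
class DataToPoint implements Function<String, LabeledPoint> {
  public LabeledPoint call(String line) {
    String[] tok = line.split(",");
    double[] features = new double[tok.length - 1];
    for (int i = 1; i < tok.length; i++) features[i - 1] = Double.parseDouble(tok[i]);
    return new LabeledPoint(Double.parseDouble(tok[0]), Vectors.dense(features));
  }
}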
public static void main(String[] args) { // Handle invalid arguments.. if (args.length < 2) { System.out.println("Usage: ConvexHull arg1 arg2"); System.out.println("arg1: input dataset A file path [points]"); System.out.println("arg2: output file name and path"); System.exit(1); } // Creating and setting sparkconf SparkConf sparkConf = new SparkConf().setAppName("Group3-edu.asu.cse512.ConvexHull"); JavaSparkContext sc = new JavaSparkContext(sparkConf); // Adding external jars // sc.addJar("lib/jts-1.13.jar"); JavaRDD<String> lines = sc.textFile(args[0]); // Using mapPartitions function to find convex hull points in distributed environment JavaRDD<Coordinate> hullPointsRDD = lines.mapPartitions(new ConvexH()); List<Coordinate> hullPointsList = hullPointsRDD.collect(); Coordinate[] inputArray = new Coordinate[hullPointsList.size()]; int j = 0; for (Coordinate c : hullPointsList) { inputArray[j] = c; j++; } // Finding convex hull points on the final subset of points retrieved from distributed // environment GeometryFactory geoFactory1 = new GeometryFactory(); MultiPoint mPoint1 = geoFactory1.createMultiPoint(inputArray); Geometry geo1 = mPoint1.convexHull(); Coordinate[] convexHullResult = geo1.getCoordinates(); int length = convexHullResult.length; Coordinate[] convexHullFinalResult = Arrays.copyOf(convexHullResult, length - 1); Arrays.sort(convexHullFinalResult); // Converting the list of coordinates into Coordinate RDD JavaRDD<Coordinate> convexHullResultRDD = sc.parallelize(Arrays.asList(convexHullFinalResult), 1); JavaRDD<String> convexHullResultString = convexHullResultRDD .repartition(1) .map( new Function<Coordinate, String>() { public String call(Coordinate hullPoint) throws Exception { return hullPoint.x + "," + hullPoint.y; } }); // Save the String RDD into text file. Using repartition(1) to preserve the order of coordinates convexHullResultString.repartition(1).saveAsTextFile(args[1]); }
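// ConvexH is not shown; a minimal sketch, assuming each input line is "x,y" and reusing the same
// JTS calls as the driver, so each partition is first reduced to its local convex hull. Assumed
// imports: java.util.*, com.vividsolutions.jts.geom.*, org.apache.spark.api.java.function.FlatMapFunction.
class ConvexH implements FlatMapFunction<Iterator<String>, Coordinate> {
  public Iterable<Coordinate> call(Iterator<String> lines) throws Exception {
    List<Coordinate> points = new ArrayList<Coordinate>();
    while (lines.hasNext()) {
      String[] xy = lines.next().split(",");
      points.add(new Coordinate(Double.parseDouble(xy[0]), Double.parseDouble(xy[1])));
    }
    GeometryFactory factory = new GeometryFactory();
    Geometry hull = factory.createMultiPoint(points.toArray(new Coordinate[0])).convexHull();
    return Arrays.asList(hull.getCoordinates());
  }
}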
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext("local", "test-java-client"); // JavaSparkContext sc = new JavaSparkContext("spark://192.168.181.23:7077", // "test-java-client"); JavaRDD<String> textFile = sc.textFile("/home/congsl/apps/storm/dockerfile-repository/nginx/Dockerfile"); Map<String, Integer> result = textFile .flatMap( new FlatMapFunction<String, Object>() { @Override public Iterable<Object> call(String s) throws Exception { System.out.println(s); return Arrays.asList(s.split(" ")); } }) .map( new Function<Object, Map<String, Integer>>() { @Override public Map<String, Integer> call(Object v1) throws Exception { System.out.println(v1); Map<String, Integer> map = new HashMap<String, Integer>(); map.put(v1.toString(), 1); return map; } }) .reduce( new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() { @Override public Map<String, Integer> call(Map<String, Integer> v1, Map<String, Integer> v2) throws Exception { System.out.println("v1:" + v1); System.out.println("v2:" + v2); for (String key : v2.keySet()) { if (v1.get(key) == null) { v1.put(key, v2.get(key)); } else { v1.put(key, v1.get(key) + v2.get(key)); } } return v1; } }); System.out.println(result); System.out.println(textFile.count()); SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); DataFrame df = sqlContext.jdbc( "jdbc:mysql://localhost:3306/activiti?user=root&password=admin", "ACT_ID_INFO"); df.show(); // JavaSQLContext sqlContext = new JavaSQLContext(sparkContext); }
public static void main(String[] args) { String file = ""; SparkConf conf = new SparkConf().setAppName("app"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(file); JavaRDD<String> errors = lines.filter( new Function<String, Boolean>() { public Boolean call(String x) { return x.contains("error"); } }); }
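// Note: nothing above triggers evaluation; the filter only runs when an action is invoked, e.g.:
// long errorCount = errors.count(); // forces the file read and the filter
// sc.stop();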
/** * Reads a file into a JavaRDD, optionally dropping header rows. * * @param fileLocation path of the file to read * @param headerRowSkippingCriteria substring that marks header rows to drop; pass null to keep every line * @return the lines of the file as a JavaRDD */ @SuppressWarnings("serial") private static JavaRDD<String> readData( String fileLocation, final String headerRowSkippingCriteria) { JavaRDD<String> lines = null; if (headerRowSkippingCriteria == null) { lines = sc.textFile(fileLocation); } else { lines = sc.textFile(fileLocation) .filter( new Function<String, Boolean>() { public Boolean call(String line) { if (line.contains(headerRowSkippingCriteria)) { System.out.println(line); return false; } return true; } }); } return lines; }
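// A possible call site for readData (the path and header token below are illustrative):
// JavaRDD<String> flights = readData("hdfs:///data/flights.csv", "DayOfWeek"); // drop header rows
// JavaRDD<String> raw = readData("hdfs:///data/flights.csv", null);            // keep every line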
public static JavaRDD<String> getJavaRDD(JavaSparkContext sparkContext) { System.out.println("Converting" + sparkContext.version() + sparkContext.appName()); JavaRDD<String> testJRDD = null; try { testJRDD = sparkContext.textFile( "/Users/shawnkyzer/Documents/aleph2_analytic_services_R/hs_err_pid2930.log"); } catch (Exception e) { System.out.println(e.fillInStackTrace()); } System.out.println("Converting"); return testJRDD; }
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext(); Configuration conf = sc.hadoopConfiguration(); conf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem"); conf.set("fs.swift.service.test.auth.url", "http://163.17.136.246:5000/v2.0/tokens"); conf.set("fs.swift.service.test.auth.endpoint.prefix", "endpoints"); conf.set("fs.swift.service.test.http.port", "8080"); conf.set("fs.swift.service.test.region", "RegionOne"); conf.set("fs.swift.service.test.public", "true"); conf.set("fs.swift.service.test.tenant", "big-data"); conf.set("fs.swift.service.test.username", "k753357"); conf.set("fs.swift.service.test.password", "k753357"); JavaRDD<String> rawRDD = sc.textFile(args[0]); rawRDD.saveAsTextFile("swift://testfile.test/file/"); }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaWordCount <master> <file>"); System.exit(1); } JavaSparkContext ctx = new JavaSparkContext( args[0], "JavaWordCount", System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaWordCount.class)); JavaRDD<String> lines = ctx.textFile(args[1], 1); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) { return Arrays.asList(SPACE.split(s)); } }); JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1 + ": " + tuple._2); } System.exit(0); }
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: JavaWordCount <input_file> <output_file>"); System.exit(1); } SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); JavaRDD<String> lines = ctx.textFile(args[0], 1); JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String s) { return Arrays.asList(SPACE.split(s)).iterator(); } }); JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); /* List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?,?> tuple : output) { System.out.println(tuple._1() + ": " + tuple._2()); } */ counts.saveAsTextFile(args[1]); ctx.stop(); }
private static RDD<Tuple2<Integer, double[]>> readFeaturesRDD( JavaSparkContext sparkContext, Path path) { log.info("Loading features RDD from {}", path); JavaRDD<String> featureLines = sparkContext.textFile(path.toString()); return featureLines .map( new Function<String, Tuple2<Integer, double[]>>() { @Override public Tuple2<Integer, double[]> call(String line) throws IOException { List<?> update = MAPPER.readValue(line, List.class); Integer key = Integer.valueOf(update.get(0).toString()); double[] vector = MAPPER.convertValue(update.get(1), double[].class); return new Tuple2<>(key, vector); } }) .rdd(); }
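// MAPPER is not defined in this snippet; a minimal sketch, assuming a shared Jackson ObjectMapper
// (com.fasterxml.jackson.databind.ObjectMapper), which supplies the readValue and convertValue calls used above:
private static final ObjectMapper MAPPER = new ObjectMapper();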
public static void main(String[] args) throws Exception { Schema schema = new Schema.Builder() .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width") .addColumnInteger("Species") .build(); SparkConf conf = new SparkConf(); conf.setMaster("local[*]"); conf.setAppName("DataVec Example"); JavaSparkContext sc = new JavaSparkContext(conf); String directory = new ClassPathResource("IrisData/iris.txt") .getFile() .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..." JavaRDD<String> stringData = sc.textFile(directory); // We first need to parse this comma-delimited (CSV) format; we can do this using // CSVRecordReader: RecordReader rr = new CSVRecordReader(); JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr)); int maxHistogramBuckets = 10; DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets); System.out.println(dataAnalysis); // We can get statistics on a per-column basis: DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length"); double minValue = da.getMin(); double maxValue = da.getMax(); double mean = da.getMean(); HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html")); // To write to HDFS instead: // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis); // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc); }
public static void main(String[] args) { SparkConf conf = new SparkConf().setMaster("local").setAppName("My App"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt"); @SuppressWarnings("serial") JavaRDD<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); @SuppressWarnings("serial") JavaPairRDD<String, Integer> counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); List<Tuple2<String, Integer>> output = counts.collect(); for (Tuple2<?, ?> tuple : output) { System.out.println(tuple._1() + "-> " + tuple._2()); } sc.close(); }
public static void wordCountJava8(String filename) { // Define a configuration to use to interact with Spark SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App"); // Create a Java version of the Spark Context from the configuration JavaSparkContext sc = new JavaSparkContext(conf); // Load the input data, which is a text file read from the command line JavaRDD<String> input = sc.textFile(filename); // Java 8 with lambdas: split the input string into words JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" "))); // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count // them JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y); // Save the word count back out to a text file, causing evaluation. counts.saveAsTextFile("output"); }
private DataFrame artistsAsDataFrame() { String input = TestUtils.sampleArtistsDat(); JavaRDD<String> data = sc.textFile(input); StructType schema = DataTypes.createStructType( new StructField[] { DataTypes.createStructField("id", DataTypes.IntegerType, false), DataTypes.createStructField("name", DataTypes.StringType, false), DataTypes.createStructField("url", DataTypes.StringType, true), DataTypes.createStructField("pictures", DataTypes.StringType, true), DataTypes.createStructField("time", DataTypes.TimestampType, true) }); JavaRDD<Row> rowData = data.map( new Function<String, String[]>() { @Override public String[] call(String line) throws Exception { return line.split("\t"); } }) .map( new Function<String[], Row>() { @Override public Row call(String[] r) throws Exception { return RowFactory.create( Integer.parseInt(r[0]), r[1], r[2], r[3], new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis())); } }); return sqc.createDataFrame(rowData, schema); }
public static void main(String args[]) { SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); conf.set("es.index.auto.create", "true"); JavaSparkContext context = new JavaSparkContext(conf); JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv"); JavaRDD<Crime> dataSplits = textFile.map( line -> { CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180); Crime c = new Crime(); CSVRecord record = parser.getRecords().get(0); c.setId(record.get(0)); c.setCaseNumber(record.get(1)); c.setEventDate(record.get(2)); c.setBlock(record.get(3)); c.setIucr(record.get(4)); c.setPrimaryType(record.get(5)); c.setDescription(record.get(6)); c.setLocation(record.get(7)); c.setArrest(Boolean.parseBoolean(record.get(8))); c.setDomestic(Boolean.parseBoolean(record.get(9))); String lat = record.get(10); String lon = record.get(11); Map<String, Double> geoLocation = new HashMap<>(); geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat)); geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon)); c.setGeoLocation(geoLocation); return c; }); SQLContext sqlContext = new SQLContext(context); DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class); JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection"); }
public static void main(String[] args) throws FileNotFoundException { if (args.length <= 0) { System.out.println( "We require input file path, output file path and number of partitions argument to proceed further."); System.out.println( "Usage: java FarthestPair <input file path> <output file path> <noOfPartitions>"); System.exit(0); } String inputFile = args[0]; SparkConf conf = new SparkConf().setAppName("Group6-FarthestPair"); JavaSparkContext sc = new JavaSparkContext(conf); // Read file as RDD JavaRDD<String> inputData = sc.textFile(inputFile); // JavaRDD<Coordinate> coordinates = inputData.mapPartitions(parseData); // Map each String in the file as a coordinate object JavaRDD<Coordinate> coordinates = inputData.map(parseData); // .repartition(noOfPartitions); // Map to a tuple to sort the Points JavaPairRDD<Coordinate, Boolean> pointTupleRDD = coordinates.mapToPair(new CoordinatePairFunction()); // Sort the points JavaPairRDD<Coordinate, Boolean> sortedPointTupleRDD = pointTupleRDD.sortByKey(new CoordinateComparator()); // Map to points RDD JavaRDD<Coordinate> finalSortedPointRDD = sortedPointTupleRDD.map(new TupleToCoordinateMapFunction()); // Convert sorted collection to RDD // Perform Convex hull operation on individual partition JavaRDD<Coordinate> localHull = finalSortedPointRDD.mapPartitions(new hull()); // Repartition to 1 partition in order to apply 'convex hull' on all the Coordinate objects // obtained from individual partitions JavaRDD<Coordinate> calculatedHull = localHull.coalesce(1).cache(); // Perform Convex hull operation JavaRDD<Coordinate> globalHull = calculatedHull.mapPartitions(new hull()).distinct(); JavaPairRDD<Coordinate, Coordinate> allCoordinateTuples = globalHull.cartesian(globalHull); System.out.println("Total cart: " + allCoordinateTuples.collect().size()); JavaRDD<Pair> pairsRDD = allCoordinateTuples.map( new Function<Tuple2<Coordinate, Coordinate>, Pair>() { public Pair call(Tuple2<Coordinate, Coordinate> tuple) throws Exception { // TODO Auto-generated method stub Coordinate pointA = tuple._1(); Coordinate pointB = tuple._2(); Pair a = new Pair(pointA, pointB); return a; } }); JavaRDD<Pair> pairs = allCoordinateTuples.mapPartitions( new FlatMapFunction<Iterator<Tuple2<Coordinate, Coordinate>>, Pair>() { /** */ private static final long serialVersionUID = 1L; public Iterable<Pair> call(Iterator<Tuple2<Coordinate, Coordinate>> tuples) throws Exception { // TODO Auto-generated method stub List<Pair> pairsFromTuples = new ArrayList<Pair>(); // Pair singlePair = new Pair(); Tuple2<Coordinate, Coordinate> tuple; while (tuples.hasNext()) { tuple = tuples.next(); // singlePair.A = tuples.next()._1; // singlePair.B = tuples.next()._2; Pair singlePair = new Pair(tuple._1(), tuple._2()); pairsFromTuples.add(singlePair); } return pairsFromTuples; } }); JavaRDD<Integer> x = pairsRDD.mapPartitions( new FlatMapFunction<Iterator<Pair>, Integer>() { public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception { // TODO Auto-generated method stub ArrayList<Integer> x = new ArrayList<Integer>(); x.add(1); return x; } }); System.out.println("Num of partitions: " + x.collect()); JavaRDD<Integer> y = pairs.mapPartitions( new FlatMapFunction<Iterator<Pair>, Integer>() { public Iterable<Integer> call(Iterator<Pair> arg0) throws Exception { // TODO Auto-generated method stub ArrayList<Integer> x = new ArrayList<Integer>(); x.add(1); return x; } }); System.out.println("Num of partitions charan: " + y.collect()); Pair minDistPair = pairs.reduce( new Function2<Pair, Pair, 
Pair>() { /** */ private static final long serialVersionUID = 1L; public Pair call(Pair a, Pair b) throws Exception { // TODO Auto-generated method stub return (a.distanceLength > b.distanceLength ? a : b); } }); // System.out.println(minDistPair); Coordinate closestpointA = minDistPair.A; Coordinate closestpointB = minDistPair.B; List<Coordinate> closestPoints = new ArrayList<Coordinate>(); closestPoints.add(closestpointA); closestPoints.add(closestpointB); JavaRDD<Coordinate> closestRDD = sc.parallelize(closestPoints); // Map to a tuple to sort the Points JavaPairRDD<Coordinate, Boolean> coordinateTupleRDD = closestRDD.mapToPair(new CoordinatePairFunction()); // Sort the points JavaPairRDD<Coordinate, Boolean> sortedCoordinateTupleRDD = coordinateTupleRDD.sortByKey(new CoordinateComparator()); // Map to points RDD JavaRDD<Coordinate> finalSortedCoordinateRDD = sortedCoordinateTupleRDD.map(new TupleToCoordinateMapFunction()); JavaRDD<String> outputData = finalSortedCoordinateRDD.map(parseOutputData); // closestRDD.saveAsTextFile(outputfilepath); outputData.saveAsTextFile(args[1]); // Output your result, you need to sort your result!!! // And,Don't add a additional clean up step delete the new generated // file... sc.close(); }
@Override public JavaRDD<String> decompress(String inputFile) throws IOException { return sparkContext.textFile(String.format("%s/*", inputFile)); }
public static void main(String[] args) { String fileUri = "mnist_pca_lda_train.csv"; String testFileUri = "mnist_pca_lda_test.csv"; SparkConf conf = new SparkConf().setAppName("Logit Sample").set("spark.executor.memory", "4g"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> rawTrainData = sc.textFile(fileUri).cache(); JavaRDD<String> rawTestData = sc.textFile(testFileUri).cache(); /** * *********** Training portion ************************** 1. Parse the lines into * LabledPoint<label, features> 2. Run through in a loop for all 45 combinations of features. 3. * Filter the RDD for the given pair of labels. 4. Transform the entries into 0 and 1. 5. Run * the logit model for every filtered RDDs. */ long startTime = System.currentTimeMillis(); /** Creating LabledPoints from the input. */ JavaRDD<LabeledPoint> labledPointsRDD = rawTrainData.map( new Function<String, LabeledPoint>() { public LabeledPoint call(String line) throws Exception { return formLabeledPoint(line); } private LabeledPoint formLabeledPoint(String line) { String[] tokens = line.split(","); double[] result = new double[tokens.length - 19]; for (int i = 1; i < tokens.length - 18; i++) { result[i - 1] = Double.valueOf(tokens[i]); } double label = Double.valueOf(tokens[0]); LabeledPoint resultPoint = new LabeledPoint(label, new DenseVector(result)); return resultPoint; } }); LogisticRegressionModel[] model = new LogisticRegressionModel[45]; System.out.println(" Training iteration starts..."); int k = 0; for (double i = 0; i <= 9; i++) { for (double j = 1; j <= 9; j++) { if (j > i) { System.out.printf("Training for:%f\t%f....Starting..\n", i, j); final double label1 = i; final double label2 = j; System.out.printf("Train: label1:%f\tlabel2:%f\n", label1, label2); /** Filtering for lables i and j */ JavaRDD<LabeledPoint> filteredRDD = labledPointsRDD.filter( new Function<LabeledPoint, Boolean>() { public Boolean call(LabeledPoint point) throws Exception { boolean result = false; if (point.label() == label1 || point.label() == label2) result = true; return result; } }); printKeyValStat(filteredRDD); /** Mapping input into binary */ JavaRDD<LabeledPoint> transformedRDD = filteredRDD.map( new Function<LabeledPoint, LabeledPoint>() { public LabeledPoint call(LabeledPoint point) throws Exception { double newLabel = point.label() == label1 ? 0 : 1; LabeledPoint result = new LabeledPoint(newLabel, point.features()); return result; } }); printKeyValStat(transformedRDD); model[k] = new LogisticRegressionWithLBFGS().setNumClasses(2).run(transformedRDD.rdd()); } } } System.out.println(" Training iteration finished..."); long endTime = System.currentTimeMillis(); System.out.println(" Model training time: " + (endTime - startTime)); /** ************ End of training portion **************** */ startTime = System.currentTimeMillis(); /** * *********** Scoring/Testing portion ******************* 1. Parse the lines into * LabledPoint<label, features> 2. Run through in a loop for all 45 combinations of features. 3. * Filter the RDD for the given pair of labels. 4. Transform the entries into 0 and 1. 5. Run * the prediction and record prediction and labels. 6. Print the accuracy metrics. */ /** Creating LabledPoints from the input. 
*/ JavaRDD<LabeledPoint> labledTestPointsRDD = rawTestData.map( new Function<String, LabeledPoint>() { public LabeledPoint call(String line) throws Exception { return formLabeledPoint(line); } private LabeledPoint formLabeledPoint(String line) { String[] tokens = line.split(","); double[] result = new double[tokens.length - 19]; for (int i = 1; i < tokens.length - 18; i++) { result[i - 1] = Double.valueOf(tokens[i]); } double label = Double.valueOf(tokens[0]); LabeledPoint resultPoint = new LabeledPoint(label, new DenseVector(result)); return resultPoint; } }); k = 0; double sumAUC = 0; for (double i = 0; i <= 9; i++) { for (double j = 1; j <= 9; j++) { if (j > i) { final double label1 = i; final double label2 = j; System.out.printf("Testing for:%f\t%f....Starting..\n", i, j); /** Filtering for labels i and j */ JavaRDD<LabeledPoint> filteredRDD = labledTestPointsRDD.filter( new Function<LabeledPoint, Boolean>() { public Boolean call(LabeledPoint point) throws Exception { boolean result = false; if (point.label() == label1 || point.label() == label2) result = true; return result; } }); printKeyValStat(filteredRDD); /** Mapping input into binary */ JavaRDD<LabeledPoint> transformedRDD = filteredRDD.map( new Function<LabeledPoint, LabeledPoint>() { public LabeledPoint call(LabeledPoint point) throws Exception { double newLabel = point.label() == label1 ? 0 : 1; LabeledPoint result = new LabeledPoint(newLabel, point.features()); return result; } }); printKeyValStat(transformedRDD); final LogisticRegressionModel currentModel = model[k]; JavaRDD<Tuple2<Object, Object>> predictionAndLabels = transformedRDD.map( new Function<LabeledPoint, Tuple2<Object, Object>>() { private static final long serialVersionUID = 1L; public Tuple2<Object, Object> call(LabeledPoint inputPoint) throws Exception { Double prediction = currentModel.predict(inputPoint.features()); return new Tuple2<Object, Object>(prediction, inputPoint.label()); } }); predictionAndLabels.saveAsTextFile("predictions/predictions_" + label1 + "_vs_" + label2); /*MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); double precision = metrics.precision();*/ BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); double auROC = metrics.areaUnderROC(); sumAUC += auROC; System.out.printf("AUC for %f, %f is %f \n", i, j, auROC); } } } System.out.printf("Average AUC value :: %f \n", (sumAUC / 45)); endTime = System.currentTimeMillis(); System.out.println(" Model scoring time.. :" + (endTime - startTime)); /** ***************** End of scoring/testing portion *************** */ }