@After public void tearDown() throws Exception { if (sc != null) { sc.close(); sc = null; } }
public static void trainModel(String filteredDataPath, String modelpath) throws IOException { String line; StringBuilder combined = new StringBuilder(); /* read and process raw data */ BufferedReader br = new BufferedReader(new FileReader(filteredDataPath)); while ((line = br.readLine()) != null) combined.append(" ").append(line); br.close(); List<String> words = Lists.newArrayList(combined.toString().trim().split(" ")); List<List<String>> localDoc = Lists.newArrayList(words, words); /* build a context object */ JavaSparkContext sc = new JavaSparkContext("local", "Word2VecSuite"); JavaRDD<List<String>> doc = sc.parallelize(localDoc); /* training settings */ Word2Vec word2vec = new Word2Vec().setVectorSize(100).setMinCount(50).setSeed(42L); /* train */ Word2VecModel model = word2vec.fit(doc); /* save model; JavaSparkContext.sc() exposes the underlying SparkContext */ model.save(sc.sc(), modelpath); System.out.println("Model has been saved in folder: " + modelpath); sc.stop(); }
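A minimal companion sketch (not part of the snippet above): loading a model saved this way and querying it through the standard org.apache.spark.mllib.feature.Word2VecModel API; the query word "day" is a placeholder.

public static void querySavedModel(String modelpath) {
  JavaSparkContext sc = new JavaSparkContext("local", "Word2VecQuery");
  // load the model from the directory written by trainModel(...)
  Word2VecModel model = Word2VecModel.load(sc.sc(), modelpath);
  // findSynonyms returns (word, cosine similarity) pairs; "day" is a placeholder query word
  Tuple2<String, Object>[] synonyms = model.findSynonyms("day", 10);
  for (Tuple2<String, Object> synonym : synonyms) {
    System.out.println(synonym._1() + " -> " + synonym._2());
  }
  sc.stop();
}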
public static void main(String[] args) { String logFile; if (args.length != 0) logFile = args[0]; else logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md"; final SparkConf conf = new SparkConf().setAppName("Simple Application"); final JavaSparkContext sc = new JavaSparkContext(conf); final JavaRDD<String> logData = sc.textFile(logFile).cache(); final String[] check = getFilterSet(); System.out.println("Start: " + new Date()); for (int i = 0; i < check.length; i++) { final int post = i; long count = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains(check[post]); } }) .count(); System.out.println("Lines with " + check[i] + ": " + count); } System.out.println("End: " + new Date()); sc.close(); }
public static void main(String[] args) throws Exception { SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL"); JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); SQLContext sqlContext = new SQLContext(javaSparkContext); System.out.println("=== Data source: RDD ==="); // Load a text file and convert each line to a Java Bean. JavaRDD<Person> people = javaSparkContext .textFile("people.txt") .map( (line) -> { String[] parts = line.split(","); Person person = new Person(); person.setName(parts[0]); person.setAge(parts[1]); return person; }); // Apply a schema to an RDD of Java Beans and register it as a table. DataFrame dataFrame = sqlContext.createDataFrame(people, Person.class); dataFrame.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); // 通过DataFrame 获取对应的RDD collection 获取对应的结果内容 List<String> teenagersName = teenagers.toJavaRDD().map((row) -> "Name : " + row.getString(0)).collect(); teenagersName.forEach( (name) -> { System.out.println(name); }); }
public static void main(String[] args) { // Create a java spark context SparkConf conf = new SparkConf().setAppName("Accumulators"); JavaSparkContext sc = new JavaSparkContext(conf); // Create an accumulator to keep track of number of blank lines in callSigns.txt final Accumulator<Integer> blankLines = sc.accumulator(0); JavaRDD<String> input = sc.textFile("src/main/resources/callSigns.txt"); JavaRDD<String> callSigns = input.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String s) throws Exception { if (s.equals("")) { blankLines.add(1); } return Arrays.asList(s.split(" ")); } }); callSigns.saveAsTextFile("Chapter5-Output"); System.out.println("Number of blank lines present in text file : " + blankLines); }
public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: KMeansMP <input_file> <results>"); System.exit(1); } String inputFile = args[0]; String results_path = args[1]; JavaPairRDD<Integer, Iterable<String>> results; int k = 4; int iterations = 100; int runs = 1; long seed = 0; final KMeansModel model; SparkConf sparkConf = new SparkConf().setAppName("KMeans MP"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<String> lines = sc.textFile(inputFile).cache(); JavaRDD<Vector> points = lines.map(new ParsePoint()); JavaRDD<String> titles = lines.map(new ParseTitle()); model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), seed); results = titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey(); results.saveAsTextFile(results_path); sc.stop(); }
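The helper classes ParsePoint, ParseTitle, and ClusterCars are referenced but not shown; a hedged sketch of what they could look like follows, assuming a CSV layout with the title in the first field and numeric features in the remaining fields.

static class ParseTitle implements Function<String, String> {
  @Override
  public String call(String line) {
    return line.split(",")[0]; // assumed: first field is the title
  }
}

static class ParsePoint implements Function<String, Vector> {
  @Override
  public Vector call(String line) {
    String[] fields = line.split(",");
    double[] values = new double[fields.length - 1];
    for (int i = 1; i < fields.length; i++) {
      values[i - 1] = Double.parseDouble(fields[i]); // assumed: remaining fields are numeric
    }
    return Vectors.dense(values);
  }
}

static class ClusterCars implements PairFunction<Tuple2<String, Vector>, Integer, String> {
  private final KMeansModel model;

  ClusterCars(KMeansModel model) {
    this.model = model;
  }

  @Override
  public Tuple2<Integer, String> call(Tuple2<String, Vector> titleAndPoint) {
    // key each title by the cluster the trained model assigns to its feature vector
    return new Tuple2<>(model.predict(titleAndPoint._2()), titleAndPoint._1());
  }
}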
public static void main(String args[]) { if (args.length == 0) { System.out.println("JavaHBaseDistributedScan {master} {tableName}"); } String master = args[0]; String tableName = args[1]; JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseDistributedScan"); jsc.addJar("SparkHBase.jar"); Configuration conf = HBaseConfiguration.create(); conf.addResource(new Path("/etc/hbase/conf/core-site.xml")); conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml")); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); Scan scan = new Scan(); scan.setCaching(100); JavaRDD<Tuple2<byte[], List<Tuple3<byte[], byte[], byte[]>>>> javaRdd = hbaseContext.hbaseRDD(tableName, scan); List<Tuple2<byte[], List<Tuple3<byte[], byte[], byte[]>>>> results = javaRdd.collect(); results.size(); }
public void run(String master) { JavaSparkContext sc = new JavaSparkContext( master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS")); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() { @Override public Iterable<AvgCount> call(Iterator<Integer> input) { AvgCount a = new AvgCount(0, 0); while (input.hasNext()) { a.total_ += input.next(); a.num_ += 1; } ArrayList<AvgCount> ret = new ArrayList<AvgCount>(); ret.add(a); return ret; } }; Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() { @Override public AvgCount call(AvgCount a, AvgCount b) { a.total_ += b.total_; a.num_ += b.num_; return a; } }; AvgCount result = rdd.mapPartitions(setup).reduce(combine); System.out.println(result.avg()); }
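The AvgCount helper is referenced but not shown; a minimal sketch consistent with the usage above (fields total_ and num_, method avg()) could look like this:

static class AvgCount implements java.io.Serializable {
  int total_;
  int num_;

  AvgCount(int total, int num) {
    total_ = total;
    num_ = num;
  }

  double avg() {
    // average of the values seen so far; guards against an empty partition
    return num_ == 0 ? 0.0 : total_ / (double) num_;
  }
}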
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs); JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair( new PairFunction<String, Tuple3<String, String, String>, Stats>() { @Override public Tuple2<Tuple3<String, String, String>, Stats> call(String s) { return new Tuple2<Tuple3<String, String, String>, Stats>( extractKey(s), extractStats(s)); } }); JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey( new Function2<Stats, Stats, Stats>() { @Override public Stats call(Stats stats, Stats stats2) { return stats.merge(stats2); } }); List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect(); for (Tuple2<?, ?> t : output) { System.out.println(t._1() + "\t" + t._2()); } jsc.stop(); }
public static void main(String args[]) { if (args.length == 0) { System.out.println("JavaHBaseBulkPutExample {master} {tableName} {columnFamily}"); } String master = args[0]; String tableName = args[1]; String columnFamily = args[2]; JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample"); List<String> list = new ArrayList<String>(); list.add("1," + columnFamily + ",a,1"); list.add("2," + columnFamily + ",a,2"); list.add("3," + columnFamily + ",a,3"); list.add("4," + columnFamily + ",a,4"); list.add("5," + columnFamily + ",a,5"); JavaRDD<String> rdd = jsc.parallelize(list); Configuration conf = HBaseConfiguration.create(); conf.addResource(new Path("/opt/hadoop-2.6.0/etc/hadoop/core-site.xml")); conf.addResource(new Path("/opt/hbase/conf/hbase-site.xml")); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.bulkPut(rdd, tableName, new PutFunction(), true); }
public SparkRuntime( SparkPipeline pipeline, JavaSparkContext sparkContext, Configuration conf, Map<PCollectionImpl<?>, Set<Target>> outputTargets, Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize, Map<PCollection<?>, StorageLevel> toCache, Map<PipelineCallable<?>, Set<Target>> allPipelineCallables) { this.pipeline = pipeline; this.sparkContext = sparkContext; this.conf = conf; this.counters = sparkContext.accumulator( Maps.<String, Map<String, Long>>newHashMap(), new CounterAccumulatorParam()); this.ctxt = new SparkRuntimeContext( sparkContext.appName(), counters, sparkContext.broadcast(WritableUtils.toByteArray(conf))); this.outputTargets = Maps.newTreeMap(DEPTH_COMPARATOR); this.outputTargets.putAll(outputTargets); this.toMaterialize = toMaterialize; this.toCache = toCache; this.allPipelineCallables = allPipelineCallables; this.activePipelineCallables = allPipelineCallables.keySet(); this.status.set(Status.READY); this.monitorThread = new Thread( new Runnable() { @Override public void run() { monitorLoop(); } }); }
public static void main(String[] args) { String logFile = "YOUR_SPARK_HOME/README.md"; // Should be some file on your system SparkConf conf = new SparkConf().setAppName("Simple Application"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> logData = sc.textFile(logFile).cache(); long numAs = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("a"); } }) .count(); long numBs = logData .filter( new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("b"); } }) .count(); System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs); }
@Test public void testJavaSparkContextFunctions() throws Exception { SparkContext mockSparkContext = mock(SparkContext.class); JavaSparkContext mockJavaSparkContext = mock(JavaSparkContext.class); when(mockJavaSparkContext.sc()).thenReturn(mockSparkContext); GemFireJavaSparkContextFunctions wrapper = javaFunctions(mockJavaSparkContext); assertTrue(mockSparkContext == wrapper.sc); }
@Test public void testJavaFunctions1() throws Exception { SparkContext sc = mock(SparkContext.class); JavaSparkContext jsc = mock(JavaSparkContext.class); when(jsc.sc()).thenReturn(sc); SparkContextJavaFunctions scjf = javaFunctions(jsc); assertThat(scjf.sparkContext, is(jsc.sc())); }
/** * Converts a MatrixBlock into a DataFrame, optionally prepending a row-index (ID) column. * * @param sqlctx SQL context used to create the DataFrame * @param mb matrix block to convert * @param containsID whether to prepend the row-index column * @param schema per-column value types (OBJECT columns are emitted as vector columns) * @return the created DataFrame * @throws DMLRuntimeException */ @SuppressWarnings("resource") private DataFrame createDataFrame( SQLContext sqlctx, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException { /* create in-memory list of rows */ List<Row> list = new ArrayList<Row>(); int off = (containsID ? 1 : 0); int clen = mb.getNumColumns() + off - colsVector + 1; for (int i = 0; i < mb.getNumRows(); i++) { Object[] row = new Object[clen]; if (containsID) row[0] = (double) (i + 1); for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) { if (schema[j2] != ValueType.OBJECT) { row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j)); } else { double[] tmp = DataConverter.convertToDoubleVector( mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock())); row[j2 + off] = new DenseVector(tmp); j += colsVector - 1; } } list.add(RowFactory.create(row)); } /* create data frame schema */ List<StructField> fields = new ArrayList<StructField>(); if (containsID) fields.add( DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true)); for (int j = 0; j < schema.length; j++) { DataType dt = null; switch (schema[j]) { case STRING: dt = DataTypes.StringType; break; case DOUBLE: dt = DataTypes.DoubleType; break; case INT: dt = DataTypes.LongType; break; case OBJECT: dt = new VectorUDT(); break; default: throw new RuntimeException("Unsupported value type."); } fields.add(DataTypes.createStructField("C" + (j + 1), dt, true)); } StructType dfSchema = DataTypes.createStructType(fields); /* create rdd and data frame */ JavaSparkContext sc = new JavaSparkContext(sqlctx.sparkContext()); JavaRDD<Row> rowRDD = sc.parallelize(list); return sqlctx.createDataFrame(rowRDD, dfSchema); }
public static void main(String[] args) { if (args.length == 0) { System.err.println("Usage: Main <file>"); System.exit(1); } SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(args[0]); JavaPairRDD<String, Double> dayArrivalDelayPair = lines.flatMapToPair( line -> { String[] splitLine = line.split(SPLIT_PATTERN); String key = splitLine.length == 0 ? "" : splitLine[0]; Double value = splitLine.length < 2 ? value = 0.0 : Double.valueOf(splitLine[1]); return Arrays.asList(new Tuple2<>(key, value)); }); JavaPairRDD<String, AverageWrapper> dayAverageWrapper = dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1)); JavaPairRDD<String, AverageWrapper> daysValueCount = dayAverageWrapper.reduceByKey( (aw1, aw2) -> new AverageWrapper( aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount())); Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap(); List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>(); listResults.addAll(resultMap.entrySet()); Collections.sort( listResults, (entry1, entry2) -> Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue())); for (Map.Entry<String, AverageWrapper> entry : listResults) { System.out.printf( "%s -> (%f, %d)\n", entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount()); } // JavaPairRDD<String, Double> resultRDD = // daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() / // averageWrapper.getCount()); // // Map<String, Double> results = resultRDD.collectAsMap(); // List<Map.Entry<String, Double>> listResults = new ArrayList<>(); // listResults.addAll(results.entrySet()); // Collections.sort(listResults, (entry1, entry2) -> // entry1.getValue().compareTo(entry2.getValue())); // // for (Map.Entry<String, Double> entry : listResults) { // System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue()); // } }
public static void main(String[] args) { String master = args[0]; String appName = args[1]; String path = args[2]; SparkConf conf = new SparkConf().setAppName(appName).setMaster(master); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(path) .filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return !s.isEmpty() && !s.contains("Total"); } }); JavaRDD<String> usOnly = lines.filter( new Function<String, Boolean>() { @Override public Boolean call(String s) throws Exception { return s.contains("United States"); } }); JavaPairRDD<String, Integer> yearAndMedals = usOnly.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) throws Exception { String[] fields = s.split(","); return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8])); } }); JavaPairRDD<String, Integer> reduced = yearAndMedals.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer accumulator, Integer currentValue) throws Exception { return accumulator + currentValue; } }); JavaPairRDD<String, Integer> result = reduced.filter( new Function<Tuple2<String, Integer>, Boolean>() { @Override public Boolean call(Tuple2<String, Integer> tuple) throws Exception { return tuple._2 < 200; } }); System.out.println(); System.out.println(result.collect()); }
public static boolean SpatialRangeQuery( String InputLocation1, String InputLocation2, String OutputLocation) { SparkConf sparkConfiguration = new SparkConf().setAppName("Group22-RangeQuery"); JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration); boolean result = getRangeQuery(InputLocation1, InputLocation2, OutputLocation, sparkContext); sparkContext.close(); return result; }
/** * Runs a data-partitioned remote parfor job as a Spark job. * * @param pfid parfor id * @param itervar iteration variable name * @param matrixvar name of the input matrix variable to be partitioned * @param program serialized program string * @param resultFile result file name * @param input input matrix object * @param ec execution context * @param dpf data partition format * @param oi output info of the partitioned input * @param tSparseCol sparse column flag passed to the Spark workers * @param enableCPCaching indicator for CP caching (config param) * @param numReducers number of reducers used to group partitions (opt param) * @return remote parfor job return, incl. result variables and task/iteration counts * @throws DMLRuntimeException * @throws DMLUnsupportedOperationException */ public static RemoteParForJobReturn runJob( long pfid, String itervar, String matrixvar, String program, String resultFile, MatrixObject input, ExecutionContext ec, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, /* config params */ boolean enableCPCaching, int numReducers) /* opt params */ throws DMLRuntimeException, DMLUnsupportedOperationException { String jobname = "ParFor-DPESP"; long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; SparkExecutionContext sec = (SparkExecutionContext) ec; JavaSparkContext sc = sec.getSparkContext(); /* prepare input parameters */ MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData(); MatrixCharacteristics mc = md.getMatrixCharacteristics(); InputInfo ii = InputInfo.BinaryBlockInputInfo; /* initialize accumulators for tasks/iterations */ Accumulator<Integer> aTasks = sc.accumulator(0); Accumulator<Integer> aIters = sc.accumulator(0); JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar); DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf); RemoteDPParForSparkWorker efun = new RemoteDPParForSparkWorker( program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters); List<Tuple2<Long, String>> out = in.flatMapToPair(dpfun) /* partition the input blocks */ .groupByKey(numReducers) /* group partition blocks */ .mapPartitionsToPair(efun) /* execute parfor tasks, incl cleanup */ .collect(); /* get output handles */ /* de-serialize results */ LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG); int numTasks = aTasks.value(); /* get accumulator value */ int numIters = aIters.value(); /* get accumulator value */ /* create output symbol table entries */ RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results); /* maintain statistics */ Statistics.incrementNoOfCompiledSPInst(); Statistics.incrementNoOfExecutedSPInst(); if (DMLScript.STATISTICS) { Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0); } return ret; }
/** Load the data from the json file and return an RDD of Tweet */ public JavaRDD<Tweet> loadData() { // create spark configuration and spark context SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(line -> Parse.parseJsonToTweet(line)); return tweets; }
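Parse.parseJsonToTweet is not shown; one plausible sketch uses Gson to bind each JSON line onto the Tweet bean (the choice of Gson and the field mapping are assumptions, not confirmed by the source):

public class Parse {
  private static final com.google.gson.Gson GSON = new com.google.gson.Gson();

  public static Tweet parseJsonToTweet(String json) {
    // maps JSON attributes onto same-named fields of the Tweet bean
    return GSON.fromJson(json, Tweet.class);
  }
}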
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite"); JavaRDD<String> lines = sc.textFile("log.txt"); JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" "))); JavaPairRDD<String, Integer> counts = words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y); counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2())); }
public static void main(String[] args) { // Handle invalid arguments.. if (args.length < 2) { System.out.println("Usage: ConvexHull arg1 arg2"); System.out.println("arg1: input dataset A file path [points]"); System.out.println("arg2: output file name and path"); System.exit(1); } // Creating and setting sparkconf SparkConf sparkConf = new SparkConf().setAppName("Group3-edu.asu.cse512.ConvexHull"); JavaSparkContext sc = new JavaSparkContext(sparkConf); // Adding external jars // sc.addJar("lib/jts-1.13.jar"); JavaRDD<String> lines = sc.textFile(args[0]); // Using mapPartitions function to find convex hull points in distributed environment JavaRDD<Coordinate> hullPointsRDD = lines.mapPartitions(new ConvexH()); List<Coordinate> hullPointsList = hullPointsRDD.collect(); Coordinate[] inputArray = new Coordinate[hullPointsList.size()]; int j = 0; for (Coordinate c : hullPointsList) { inputArray[j] = c; j++; } // Finding convex hull points on the final subset of points retrieved from distributed // environment GeometryFactory geoFactory1 = new GeometryFactory(); MultiPoint mPoint1 = geoFactory1.createMultiPoint(inputArray); Geometry geo1 = mPoint1.convexHull(); Coordinate[] convexHullResult = geo1.getCoordinates(); int length = convexHullResult.length; Coordinate[] convexHullFinalResult = Arrays.copyOf(convexHullResult, length - 1); Arrays.sort(convexHullFinalResult); // Converting the list of coordinates into Coordinate RDD JavaRDD<Coordinate> convexHullResultRDD = sc.parallelize(Arrays.asList(convexHullFinalResult), 1); JavaRDD<String> convexHullResultString = convexHullResultRDD .repartition(1) .map( new Function<Coordinate, String>() { public String call(Coordinate hullPoint) throws Exception { return hullPoint.x + "," + hullPoint.y; } }); // Save the String RDD into text file. Using repartition(1) to preserve the order of coordinates convexHullResultString.repartition(1).saveAsTextFile(args[1]); }
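The ConvexH partition function is referenced but not shown; a hedged sketch of the per-partition step, assuming one "x,y" point per input line and the same JTS classes used above:

static class ConvexH implements FlatMapFunction<Iterator<String>, Coordinate> {
  @Override
  public Iterable<Coordinate> call(Iterator<String> lines) {
    // parse the partition's points, then keep only its local convex hull points
    List<Coordinate> points = new ArrayList<Coordinate>();
    while (lines.hasNext()) {
      String[] parts = lines.next().split(",");
      points.add(new Coordinate(Double.parseDouble(parts[0]), Double.parseDouble(parts[1])));
    }
    Geometry localHull =
        new GeometryFactory().createMultiPoint(points.toArray(new Coordinate[0])).convexHull();
    return Arrays.asList(localHull.getCoordinates());
  }
}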
public static void main(String[] args) { JavaSparkContext sc = new JavaSparkContext("local", "test-java-client"); // JavaSparkContext sc = new JavaSparkContext("spark://192.168.181.23:7077", // "test-java-client"); JavaRDD<String> textFile = sc.textFile("/home/congsl/apps/storm/dockerfile-repository/nginx/Dockerfile"); Map<String, Integer> result = textFile .flatMap( new FlatMapFunction<String, Object>() { @Override public Iterable<Object> call(String s) throws Exception { System.out.println(s); return Arrays.asList(s.split(" ")); } }) .map( new Function<Object, Map<String, Integer>>() { @Override public Map<String, Integer> call(Object v1) throws Exception { System.out.println(v1); Map<String, Integer> map = new HashMap<String, Integer>(); map.put(v1.toString(), 1); return map; } }) .reduce( new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() { @Override public Map<String, Integer> call(Map<String, Integer> v1, Map<String, Integer> v2) throws Exception { System.out.println("v1:" + v1); System.out.println("v2:" + v2); for (String key : v2.keySet()) { if (v1.get(key) == null) { v1.put(key, v2.get(key)); } else { v1.put(key, v1.get(key) + v2.get(key)); } } return v1; } }); System.out.println(result); System.out.println(textFile.count()); SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); DataFrame df = sqlContext.jdbc( "jdbc:mysql://localhost:3306/activiti?user=root&password=admin", "ACT_ID_INFO"); df.show(); // JavaSQLContext sqlContext = new JavaSQLContext(sparkContext); }
public static void main(String args[]) { SparkConf conf = new SparkConf().setAppName("KeyValueTest").setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(conf); JavaRDD<String> lines = jsc.textFile("/home/piyushm/samplejson.json"); List<Person> persons = lines.mapPartitions(new ParseJson()).collect(); JavaRDD<Person> personJavaRDD = jsc.parallelize(persons); JavaRDD<String> csvFileContent = jsc.textFile("/opt/sample.csv"); System.out.println(csvFileContent.map(new ParseLine()).collect()); System.out.println(persons); System.out.println(personJavaRDD.mapPartitions(new WriteJson()).collect()); jsc.stop(); }
/** "Hello World" spark job that counts the number of each word in a word list. */ @Test public void helloWorld() { JavaSparkContext sc = subject.getContext(); Map<String, Integer> content = sc.parallelize(Arrays.asList("one", "two", "two", "three", "three", "three")) .mapToPair(new ToItemCounterPair()) .reduceByKey(new Sum()) .collectAsMap(); assertThat(content.get("one"), is(1)); assertThat(content.get("two"), is(2)); assertThat(content.get("three"), is(3)); }
public static void main(String[] args) { String file = ""; SparkConf conf = new SparkConf().setAppName("app"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(file); JavaRDD<String> errors = lines.filter( new Function<String, Boolean>() { public Boolean call(String x) { return x.contains("error"); } }); /* filter is a lazy transformation; nothing is read or filtered until an action such as errors.count() is invoked */ }
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Distinct"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<Integer> nums = sc.parallelize( Arrays.asList(1, 2, 3, 4, 5, 1, 3, 2, 2, 1, 3, 4, 5, 5, 4, 3, 1, 2, 3, 2, 6, 8, 0)); JavaRDD<Integer> distinct = nums.distinct(); System.out.println(StringUtils.join(distinct.collect(), ",")); sc.close(); }
public static void main(String[] args) throws IOException { SparkConf config = new SparkConf().setAppName("003-distributed-matrices").setMaster("local[*]"); try (JavaSparkContext sc = new JavaSparkContext(config)) { /* Create a RowMatrix */ List<Vector> vectors = new ArrayList<>(10); for (int i = 0; i < 10; i++) { vectors.add(Vectors.dense(getVectorElements())); } JavaRDD<Vector> rowsRDD = sc.parallelize(vectors, 4); RowMatrix rowMatrix = new RowMatrix(rowsRDD.rdd()); System.out.println(rowMatrix.toString()); /* Create an IndexedRowMatrix */ JavaRDD<IndexedRow> indexedRows = sc.parallelize( Arrays.asList(new IndexedRow(0, vectors.get(0)), new IndexedRow(1, vectors.get(1)))); IndexedRowMatrix indexedRowMatrix = new IndexedRowMatrix(indexedRows.rdd()); System.out.println(indexedRowMatrix); /* convert */ JavaRDD<IndexedRow> indexedRowsFromRowMatrix = rowMatrix .rows() .toJavaRDD() .zipWithIndex() .map((Tuple2<Vector, Long> t) -> new IndexedRow(t._2(), t._1())); IndexedRowMatrix indexedRowMatrixFromRowMatrix = new IndexedRowMatrix(indexedRowsFromRowMatrix.rdd()); System.out.println(indexedRowMatrixFromRowMatrix); /* Create a CoordinateMatrix from its (row, column, value) entries: * M = [ 5 0 * 0 3 * 1 4 ] (3 x 2) */ JavaRDD<MatrixEntry> matrixEntries = sc.parallelize( Arrays.asList( new MatrixEntry(0, 0, 5.), new MatrixEntry(1, 1, 3.), new MatrixEntry(2, 0, 1.), new MatrixEntry(2, 1, 4.))); CoordinateMatrix coordMatrix = new CoordinateMatrix(matrixEntries.rdd()); System.out.println(coordMatrix); printSeparator(); } }
public static void main(String[] args) throws Exception { if (args.length < 2) { throw new IllegalArgumentException( "The number of arguments is incorrect. Usage:\n" + " <configuration file (conf.xml) path> <job file (.analysis.xml) path> [properties file path]\n" + "Got: " + Arrays.toString(args)); } final SparkConf conf = new SparkConf().setAppName("DataCleaner-spark"); final JavaSparkContext sparkContext = new JavaSparkContext(conf); final URI confXmlPath = URI.create(args[0]); final URI analysisJobXmlPath = URI.create(args[1]); final URI propertiesPath; if (args.length > 2) { propertiesPath = URI.create(args[2]); } else { propertiesPath = null; } final SparkJobContext sparkJobContext = new SparkJobContext(confXmlPath, analysisJobXmlPath, propertiesPath, sparkContext); final ServiceLoader<SparkJobLifeCycleListener> listenerLoaders = ServiceLoader.load(SparkJobLifeCycleListener.class); for (SparkJobLifeCycleListener listener : listenerLoaders) { sparkJobContext.addSparkJobLifeCycleListener(listener); } final SparkAnalysisRunner sparkAnalysisRunner = new SparkAnalysisRunner(sparkContext, sparkJobContext); try { final AnalysisResultFuture result = sparkAnalysisRunner.run(); result.await(); if (sparkJobContext.isResultEnabled()) { final Resource resultResource = ResultFilePathUtils.getResultResource(sparkContext, sparkJobContext); logger.info("DataCleaner result will be written to: {}", resultResource); saveResult(result, resultResource); } else { logger.info("DataCleaner result will not be written - disabled"); } } finally { sparkContext.stop(); } }
public static void main(String[] args) { JavaSparkContext javaSparkContext = SparkConfSetup.getJavaSparkContext(); CassandraConnector connector = SparkConfSetup.getCassandraConnector(); basicCassandraSession(connector); writePeopleToCassandra(javaSparkContext); readPeopleFromCassandra(javaSparkContext); javaSparkContext.stop(); }