public static void main(String[] args) {
  if (args.length < 4) {
    System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>");
    System.exit(1);
  }
  String zkQuorum = args[0];
  String kfGrp = args[1];
  String[] topics = args[2].split(",");
  int numThreads = Integer.valueOf(args[3]);

  // Consume every topic with the same number of receiver threads.
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  for (String topic : topics) {
    topicMap.put(topic, numThreads);
  }

  SparkConf conf = new SparkConf().setAppName("PDCKafkaConsumer");
  conf.set("spark.ui.port", "4040");
  JavaStreamingContext ctx = new JavaStreamingContext(conf, new Duration(10000));

  // Receiver-based Kafka stream backed by the given ZooKeeper quorum and consumer group.
  JavaPairReceiverInputDStream<String, String> kfStream =
      KafkaUtils.createStream(ctx, zkQuorum, kfGrp, topicMap);

  // Write each batch to HDFS as text files under /phasor/pmu/pdc.
  kfStream.saveAsHadoopFiles(
      "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class);

  ctx.start();
  ctx.awaitTermination();
}
private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
  System.out.println("Creating new context");
  // final File outputFile = new File("/flume_recover");
  // if (outputFile.exists()) {
  //   outputFile.delete();
  // }
  SparkConf conf =
      new SparkConf()
          .setMaster("local[2]")
          .setAppName("Stream File")
          .set("spark.driver.allowMultipleContexts", "true");
  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.index.auto.create", "true");
  conf.set("es.nodes", "10.26.1.134:9200");
  conf.set("es.resource", "flume/test");
  conf.set("es.input.json", "true");

  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
  jssc.checkpoint(checkpointDirectory);

  // Tail new files in the input directory and convert each matching log line to JSON.
  JavaDStream<String> textFile = jssc.textFileStream(input);
  JavaDStream<String> jsonStr =
      textFile.map(
          new Function<String, String>() {
            public String call(String arg0) throws Exception {
              Matcher m = log.matcher(arg0);
              if (m.find()) {
                return transferJson(m);
              }
              return "";
            }
          });
  jsonStr.print();

  // Index every non-empty batch into Elasticsearch; the null check must come before isEmpty().
  jsonStr.foreachRDD(
      new Function<JavaRDD<String>, Void>() {
        public Void call(JavaRDD<String> arg0) throws Exception {
          if (arg0 != null && !arg0.isEmpty()) {
            JavaEsSpark.saveToEs(arg0, "flume/test");
          }
          return null;
        }
      });
  return jssc;
}
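// Minimal usage sketch, not part of the original snippet: createContext has the factory shape
// expected by JavaStreamingContext.getOrCreate (Spark 1.4+ Function0 overload assumed), which
// recovers the context from the checkpoint directory on restart instead of rebuilding it.
// The input and checkpoint paths below are hypothetical.
public static void main(String[] args) {
  final String input = "/flume_in";
  final String checkpointDirectory = "/flume_checkpoint";
  JavaStreamingContext jssc =
      JavaStreamingContext.getOrCreate(
          checkpointDirectory,
          new Function0<JavaStreamingContext>() {
            public JavaStreamingContext call() throws Exception {
              return createContext(input, checkpointDirectory);
            }
          });
  jssc.start();
  jssc.awaitTermination();
}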
public static void create(final Configuration configuration) {
  final SparkConf sparkConf = new SparkConf();
  configuration
      .getKeys()
      .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
  sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
  CONTEXT = SparkContext.getOrCreate(sparkConf);
}
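// Hypothetical usage sketch (not from the original source): Configuration here is assumed to be
// the Apache Commons Configuration type TinkerPop uses, so any implementation such as
// BaseConfiguration can seed the shared SparkContext before jobs are submitted.
public static void main(String[] args) {
  Configuration configuration = new BaseConfiguration();
  configuration.setProperty("spark.master", "local[4]");
  configuration.setProperty("spark.serializer", KryoSerializer.class.getName());
  create(configuration); // CONTEXT now holds a SparkContext built from these properties
}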
public SparkOperatorCreater(String appName) throws IOException {
  super(appName);
  properties = new Properties();
  properties.load(
      this.getClass().getClassLoader().getResourceAsStream("spark-cluster.properties"));
  SparkConf conf = new SparkConf().setMaster(this.getMaster()).setAppName(appName);
  conf.set("spark.streaming.ui.retainedBatches", "2000");
  jssc = new JavaStreamingContext(conf, Durations.milliseconds(this.getDurationsMilliseconds()));
}
private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
  if (contextOptions.getUsesProvidedSparkContext()) {
    LOG.info("Using a provided Spark Context");
    JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
    if (jsc == null || jsc.sc().isStopped()) {
      LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
      throw new RuntimeException("The provided Spark context was not created or was stopped");
    }
    return jsc;
  } else {
    LOG.info("Creating a brand new Spark Context.");
    SparkConf conf = new SparkConf();
    if (!conf.contains("spark.master")) {
      // set master if not set.
      conf.setMaster(contextOptions.getSparkMaster());
    }
    conf.setAppName(contextOptions.getAppName());
    // register immutable collections serializers because the SDK uses them.
    conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
    conf.set("spark.serializer", KryoSerializer.class.getName());
    return new JavaSparkContext(conf);
  }
}
@Override
public int run(SparkConf conf, CommandLine cli) throws Exception {
  long startMs = System.currentTimeMillis();
  conf.set("spark.ui.enabled", "false");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);
  long diffMs = (System.currentTimeMillis() - startMs);
  System.out.println(">> took " + diffMs + " ms to create SQLContext");

  // Pull a 10% sample of documents from the Solr collection via the spark-solr data source.
  Map<String, String> options = new HashMap<>();
  options.put("zkhost", "localhost:9983");
  options.put("collection", "ml20news");
  options.put("query", "content_txt:[* TO *]");
  options.put("fields", "content_txt");
  DataFrame solrData = sqlContext.read().format("solr").options(options).load();
  DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt");
  List<Row> rows = sample.collectAsList();
  System.out.println(">> loaded " + rows.size() + " docs to classify");
  StructType schema = sample.schema();

  // Score each document one at a time with the best model from a saved CrossValidator run.
  CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model");
  PipelineModel bestModel = (PipelineModel) cvModel.bestModel();
  int r = 0;
  startMs = System.currentTimeMillis();
  for (Row next : rows) {
    Row oneRow = RowFactory.create(next.getString(0));
    DataFrame oneRowDF =
        sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema);
    DataFrame scored = bestModel.transform(oneRowDF);
    Row scoredRow = scored.collect()[0];
    String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel"));
    // an actual app would save the predictedLabel
    // System.out.println(">> for row[" + r + "], model returned " + scoredRows.length
    //     + " rows, " + scoredRows[0]);
    r++;
  }
  diffMs = (System.currentTimeMillis() - startMs);
  System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs");
  return 0;
}
public static void main(String[] args) {
  SparkConf conf =
      new SparkConf()
          .setMaster("local[1]")
          .setAppName(RDDParallelizeSample.class.getSimpleName());
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // create a List of Characters
  List<Character> characterList = new ArrayList<Character>();
  characterList.addAll(
      Arrays.asList(
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q'));

  // create an RDD from an internal List using parallelize method
  JavaRDD<Character> characterRDD = sc.parallelize(characterList);
  System.out.println("list size : " + characterList.size());
  System.out.println("rdd size : " + characterRDD.count());
  System.out.println("list content : " + characterList);
  System.out.println("rdd content : " + characterRDD.collect());
}
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]");
  conf.set("es.index.auto.create", "true");
  JavaSparkContext context = new JavaSparkContext(conf);

  JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv");

  // Parse each CSV line into a Crime bean; lat/lon may be empty, so those fields stay null.
  JavaRDD<Crime> dataSplits =
      textFile.map(
          line -> {
            CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180);
            Crime c = new Crime();
            CSVRecord record = parser.getRecords().get(0);
            c.setId(record.get(0));
            c.setCaseNumber(record.get(1));
            c.setEventDate(record.get(2));
            c.setBlock(record.get(3));
            c.setIucr(record.get(4));
            c.setPrimaryType(record.get(5));
            c.setDescription(record.get(6));
            c.setLocation(record.get(7));
            c.setArrest(Boolean.parseBoolean(record.get(8)));
            c.setDomestic(Boolean.parseBoolean(record.get(9)));
            String lat = record.get(10);
            String lon = record.get(11);
            Map<String, Double> geoLocation = new HashMap<>();
            geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat));
            geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon));
            c.setGeoLocation(geoLocation);
            return c;
          });

  // Build a DataFrame by reflection on the Crime bean and index it into Elasticsearch.
  SQLContext sqlContext = new SQLContext(context);
  DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class);
  JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection");
}
/** Main method for performing the random partition based model ensembler evaluation */
public static void main(String[] args) {
  // Construction of Spark Configuration
  SparkConf sContext = new SparkConf();
  sContext.setMaster("local[4]");
  sContext.setAppName("JavaLR");
  sContext.set("spark.executor.memory", "4G");

  // Creates the spark context
  sc = new JavaSparkContext(sContext); // "local[4]", "JavaLR");

  // Load train and test data
  JavaRDD<String> trainingData =
      readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  JavaRDD<String> testdata =
      readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  // trainingData.saveAsTextFile("/Users/erangap/Documents/ML_Project/datasets/reduced.csv");
  JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
  // points.persist(StorageLevel.MEMORY_AND_DISK());
  // System.out.println(points.first().features());
  JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());
  // testPoints.persist(StorageLevel.MEMORY_AND_DISK());
  System.out.println("Total number of records -> " + points.count());

  RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
  ensembler.setNoofModels(32);
  ensembler.setThreshold(0.499999);

  // Perform the training
  DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
  Date trainStartTime = Calendar.getInstance().getTime();
  String trainStart = dateFormat.format(trainStartTime);
  ensembler.train(points);
  Date trainEndTime = Calendar.getInstance().getTime();
  String trainEnd = dateFormat.format(trainEndTime);

  // Training time calculations and console print
  long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
  System.out.println("Training Started at -> " + trainStart);
  System.out.println("Training Ended at -> " + trainEnd);
  System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");

  // Prepare data for testing
  JavaRDD<Double> testingLabels =
      testPoints
          .map(
              new Function<LabeledPoint, Double>() {
                private static final long serialVersionUID = -6597374940461185814L;

                public Double call(LabeledPoint dataPoint) throws Exception {
                  return dataPoint.label();
                }
              })
          .cache();
  List<Double> classLabels = testingLabels.collect();

  // Perform the predictions
  Date predictStartTime = Calendar.getInstance().getTime();
  String predictStart = dateFormat.format(predictStartTime);
  List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).collect();
  Date predictEndTime = Calendar.getInstance().getTime();
  String predictEnd = dateFormat.format(predictEndTime);

  // Prediction time calculations and console print
  long predictElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
  System.out.println("Prediction Started at -> " + predictStart);
  System.out.println("Prediction Ended at -> " + predictEnd);
  System.out.println("Time Taken to Predict -> " + predictElapsed + " Sec.");

  // Calculate and display the accuracy
  System.out.println("Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
  BinaryClassificationMetrics binaryClassificationMetrics =
      getBinaryClassificationMatrix(ensembler, testPoints);
  System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
}
public static void main(String[] args) throws IOException {
  Parameters param = new Parameters();
  long initTime = System.currentTimeMillis();

  SparkConf conf = new SparkConf().setAppName("StarJoin");
  // Kryo must be configured before the context is created, otherwise the settings are ignored.
  if (param.useKryo) {
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
    conf.set("spark.kryoserializer.buffer.mb", param.buffer);
  }
  JavaSparkContext sc = new JavaSparkContext(conf);

  // One Bloom filter per dimension table (supplier, date, customer) to pre-filter the fact table.
  MyBloomFilter.BloomFilter<String> BFS =
      new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
  MyBloomFilter.BloomFilter<String> BFD =
      new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
  MyBloomFilter.BloomFilter<String> BFC =
      new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);

  JavaPairRDD<String, String> supps =
      sc.textFile(param.suppPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });

  List<Tuple2<String, String>> s = supps.collect();
  for (int i = 0; i < s.size(); i++) {
    BFS.add(s.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

  JavaPairRDD<String, String> custs =
      sc.textFile(param.custPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });

  List<Tuple2<String, String>> c = custs.collect();
  for (int i = 0; i < c.size(); i++) {
    BFC.add(c.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

  JavaPairRDD<String, String> dates =
      sc.textFile(param.datePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[6].equals("Dec1997");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[4]);
                }
              });

  List<Tuple2<String, String>> d = dates.collect();
  for (int i = 0; i < d.size(); i++) {
    BFD.add(d.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

  // Scan the fact table, keeping only rows whose keys pass all three broadcast Bloom filters.
  JavaPairRDD<String, String[]> lines =
      sc.textFile(param.linePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return varC.value().contains(s[2].getBytes())
                      && varS.value().contains(s[4].getBytes())
                      && varD.value().contains(s[5].getBytes());
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String[]>() {
                public Tuple2<String, String[]> call(String[] s) {
                  String[] v = {s[2], s[5], s[12]};
                  return new Tuple2<String, String[]>(s[4], v);
                }
              });

  JavaPairRDD<String, String[]> result =
      lines
          .join(supps)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });

  result =
      result
          .join(custs)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });

  JavaPairRDD<String, Long> final_result =
      result
          .join(dates)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                  return new Tuple2<String, Long>(
                      s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                }
              })
          .reduceByKey(
              new Function2<Long, Long, Long>() {
                public Long call(Long i1, Long i2) {
                  return i1 + i2;
                }
              });

  // Fold the aggregated value into the key so the custom comparator can sort full rows.
  JavaPairRDD<String, String> sub_result =
      final_result.mapToPair(
          new PairFunction<Tuple2<String, Long>, String, String>() {
            public Tuple2<String, String> call(Tuple2<String, Long> line) {
              return new Tuple2(line._1 + "," + line._2.toString(), null);
            }
          });

  final_result =
      sub_result
          .sortByKey(new Q3Comparator())
          .mapToPair(
              new PairFunction<Tuple2<String, String>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, String> line) {
                  String[] s = line._1.split(",");
                  return new Tuple2<String, Long>(
                      s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                }
              });

  // Overwrite any previous output before saving.
  Configuration HDFSconf = new Configuration();
  FileSystem fs = FileSystem.get(HDFSconf);
  fs.delete(new Path(param.output), true);
  final_result.saveAsTextFile(param.output);

  long finalTime = System.currentTimeMillis();
  System.out.print("Total time (ms): ");
  System.out.println(finalTime - initTime);
  sc.close();
}
public void run() {
  System.setProperty("spark.hadoop.dfs.replication", "2");
  Logger.getLogger("org").setLevel(Level.OFF);
  Logger.getLogger("akka").setLevel(Level.OFF);

  SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance");
  conf.set("spark.master", PropertiesStack.getProperty("spark.master"));
  conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory"));
  conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory"));
  conf.set(
      "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize"));
  // .setAppName("WindowingKafkaWordCountWithoutFaultTolerance");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

  HashSet<String> topicsSet =
      new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic()));
  HashMap<String, String> kafkaParams = new HashMap<String, String>();
  kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers());
  kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect());
  kafkaParams.put("auto.offset.reset", "smallest");
  kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId());
  kafkaParams.put("auto.commit.enable", "false");
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  topicMap.put(PropertiesStack.getKafkaTopic(), 1);

  // Map<kafka.common.TopicAndPartition, java.lang.Long> fromOffsets = new HashMap<>();
  // fromOffsets.put(new TopicAndPartition(PropertiesStack.getKafkaTopic(), 1), 1000L);
  // JavaInputDStream<String> messages = KafkaUtils
  //     .createDirectStream(
  //         jssc,
  //         String.class,
  //         String.class,
  //         StringDecoder.class,
  //         StringDecoder.class,
  //         String.class,
  //         kafkaParams,
  //         fromOffsets,
  //         new Function<kafka.message.MessageAndMetadata<String, String>, String>() {
  //           @Override
  //           public String call(MessageAndMetadata<String, String> v1) throws Exception {
  //             return v1.message();
  //           }
  //         });

  // Create direct kafka stream with brokers and topics
  JavaPairInputDStream<String, String> messages =
      KafkaUtils.createDirectStream(
          jssc,
          String.class,
          String.class,
          StringDecoder.class,
          StringDecoder.class,
          kafkaParams,
          topicsSet);
  messages.count().print();
  // .createStream(jssc, PropertiesStack.getZookeeperConnect(),
  //     PropertiesStack.getKafkaGroupId(), topicMap);

  // Start the computation
  jssc.start();
  jssc.awaitTermination();
}
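// Hedged sketch, not from the original: the app name mentions windowing and fault tolerance, but
// the snippet neither checkpoints nor windows. A fragment like the following, placed before
// jssc.start() inside run(), would add both; the checkpoint path is hypothetical and the
// window/slide durations are arbitrary.
// jssc.checkpoint("/tmp/kafka-wordcount-checkpoint"); // metadata checkpointing for driver recovery
// messages
//     .window(Durations.seconds(60), Durations.seconds(20)) // 60s window, evaluated every 20s
//     .count()
//     .print();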