public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Example").setMaster("local[2]"); // Create a StreamingContext with a 1-second batch size from a SparkConf JavaStreamingContext jssc = new JavaStreamingContext( conf, Durations.seconds(10)); // Create a DStream from all the input on port 7777 JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777); // Filter our DStream for lines with "error" // Split up into words. final JavaDStream<String> wordDStream = lines.flatMap( new FlatMapFunction<String, String>() { public Iterable<String> call(String x) { return Arrays.asList(x.split(" ")); } }); final JavaPairDStream<String, Integer> wordPairDStream = wordDStream.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) throws Exception { return new Tuple2<String, Integer>(s, 1); } }); final JavaPairDStream<String, Integer> totalWordPairDStream = wordPairDStream.reduceByKeyAndWindow( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer a, Integer b) throws Exception { return a + b; } }, Durations.seconds(30), Durations.seconds(20)); // totalWordPairDStream.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() // { // @Override // public Void call(JavaPairRDD<String, Integer> wordPairRDD, Time time) throws // Exception { // // final List<Tuple2<String, Integer>> collect = wordPairRDD.collect(); // for (Tuple2<String, Integer> t : collect) { // System.out.println("the value is t" + t._1() + "," + t._2()); // } // // return null; // } // }); totalWordPairDStream.print(); // Start our streaming context and wait for it to "finish" jssc.start(); // Wait for the job to finish jssc.awaitTermination(); }
public static void main(String[] args) { if (args.length < 4) { System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>"); System.exit(1); } String zkQuorum = args[0]; String kfGrp = args[1]; String[] topics = args[2].split(","); int numThreads = Integer.valueOf(args[3]); Map<String, Integer> topicMap = new HashMap<String, Integer>(); for (String topic : topics) { topicMap.put(topic, numThreads); } SparkConf conf = new SparkConf().setAppName("PDCKafkaConsumer"); conf.set("spark.ui.port", "4040"); JavaStreamingContext ctx = new JavaStreamingContext(conf, new Duration(10000)); JavaPairReceiverInputDStream<String, String> kfStream = KafkaUtils.createStream(ctx, zkQuorum, kfGrp, topicMap); kfStream.saveAsHadoopFiles( "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class); ctx.start(); ctx.awaitTermination(); }
@SuppressWarnings("ConstantConditions") JavaDStream<WindowedValue<T>> getDStream() { if (dStream == null) { WindowedValue.ValueOnlyWindowedValueCoder<T> windowCoder = WindowedValue.getValueOnlyCoder(coder); // create the DStream from queue Queue<JavaRDD<WindowedValue<T>>> rddQueue = new LinkedBlockingQueue<>(); JavaRDD<WindowedValue<T>> lastRDD = null; for (Iterable<T> v : values) { Iterable<WindowedValue<T>> windowedValues = Iterables.transform(v, WindowingHelpers.<T>windowValueFunction()); JavaRDD<WindowedValue<T>> rdd = jssc.sc() .parallelize(CoderHelpers.toByteArrays(windowedValues, windowCoder)) .map(CoderHelpers.fromByteFunction(windowCoder)); rddQueue.offer(rdd); lastRDD = rdd; } // create DStream from queue, one at a time, // with last as default in case batches repeat (graceful stops for example). // if the stream is empty, avoid creating a default empty RDD. // mainly for unit test so no reason to have this configurable. dStream = lastRDD != null ? jssc.queueStream(rddQueue, true, lastRDD) : jssc.queueStream(rddQueue, true); } return dStream; }
@Override public void Start() { jssc.addStreamingListener(new PerformanceStreamingListener()); // jssc.checkpoint("/tmp/log-analyzer-streaming"); jssc.checkpoint("hdfs://master:8020/usr/warehouse/wordcount/checkpoint"); jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) { SparkConf conf = new SparkConf() .setMaster("local[4]") .setAppName("SparkStreamingPullDataFromFlume for Java"); JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(30)); // JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createStream(jsc,"master1", // 9999); flume push data to Spark Streaming JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createPollingStream( jsc, "master1", 9999); // Spark Streaming pull data from flume JavaDStream<String> words = lines.flatMap( new FlatMapFunction<SparkFlumeEvent, String>() { private static final long serialVersionUID = 1L; @Override public Iterable<String> call(SparkFlumeEvent event) throws Exception { String line = new String(event.event().getBody().array()); return Arrays.asList(line.split(" ")); } }); JavaPairDStream<String, Integer> pairs = words.mapToPair( new PairFunction<String, String, Integer>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, Integer> call(String word) throws Exception { return new Tuple2<String, Integer>(word, 1); } }); JavaPairDStream<String, Integer> wordsCount = pairs.reduceByKey( new Function2< Integer, Integer, Integer>() { // 对相同的Key,进行Value的累计(包括Local和Reducer级别同时Reduce) private static final long serialVersionUID = 1L; @Override public Integer call(Integer v1, Integer v2) throws Exception { return v1 + v2; } }); wordsCount.print(); jsc.start(); jsc.awaitTermination(); jsc.close(); }
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("facebookStream");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(15));
  JavaDStream<String> stream =
      jssc.textFileStream("/Users/sboynenpalli/Desktop/shashankOfficeMackbook/sparkinputfolder");
  stream.print(10);
  ProcessVideoStreamData processData = new ProcessVideoStreamData();
  processData.processData(stream);
  jssc.start();
  jssc.awaitTermination();
}
public static void main(String[] args) {
  // Create a Spark Context.
  SparkConf conf = new SparkConf().setAppName("Activity").set("spark.eventLog.enabled", "true");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext jssc = new JavaStreamingContext(sc, STREAM_INTERVAL);
  String TOPIC = "activityevent";
  String zkQuorum = "localhost:2181";
  String group = "1";
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  topicMap.put(TOPIC, 1);
  JavaPairReceiverInputDStream<String, String> messages =
      KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
  // messages.print();
  JavaDStream<String> activitydatastream = messages.map(new Function<Tuple2<String, String>, String>() {
    @Override
    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
  });
  final Long teamWindowDurationMs = Durations.minutes(1).milliseconds();
  JavaDStream<Activity> ActivityEntryDStream = activitydatastream.map(Activity::parseFromLine);
  JavaPairDStream<WithTimestamp<String>, Double> ActivityWindowDStream =
      ActivityEntryDStream.mapToPair(
          windows -> new Tuple2<>(
              WithTimestamp.create(
                  windows.getActivity(),
                  // Apply a fixed window by rounding the timestamp down to the nearest
                  // multiple of the window size
                  (convertMillsecs(windows.getTimestamp()) / teamWindowDurationMs) * teamWindowDurationMs),
              windows.getXaxis()))
          .reduceByKey(SUM_REDUCER);
  ActivityWindowDStream.print();
  jssc.start();
  jssc.awaitTermination();
  // jssc.close();
  sc.stop();
  sc.close();
}
private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
  System.out.println("Creating new context");
  // final File outputFile = new File("/flume_recover");
  // if (outputFile.exists()) {
  //   outputFile.delete();
  // }
  SparkConf conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Stream File")
      .set("spark.driver.allowMultipleContexts", "true");
  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.index.auto.create", "true");
  conf.set("es.nodes", "10.26.1.134:9200");
  conf.set("es.resource", "flume/test");
  conf.set("es.input.json", "true");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
  jssc.checkpoint(checkpointDirectory);
  JavaDStream<String> textFile = jssc.textFileStream(input);
  JavaDStream<String> jsonStr = textFile.map(new Function<String, String>() {
    public String call(String arg0) throws Exception {
      Matcher m = log.matcher(arg0);
      if (m.find()) {
        return transferJson(m);
      }
      return "";
    }
  });
  jsonStr.print();
  jsonStr.foreach(new Function<JavaRDD<String>, Void>() {
    public Void call(JavaRDD<String> arg0) throws Exception {
      // Check for null before dereferencing the RDD.
      if (arg0 != null && !arg0.isEmpty()) {
        JavaEsSpark.saveToEs(arg0, "flume/test");
      }
      return null;
    }
  });
  return jssc;
}
public static void setJscStartFlag() { if (jsscStartFlag) { m_jssc.stop(); jsscStartFlag = false; m_jssc = null; } jscStartFlag = true; }
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.println("Usage: <input file path>");
    System.exit(1);
  }
  final String filePath = args[0];
  final String checkpointDirectory = "/flume_recover";
  // final String outputPath = "/flume_recover";
  JavaStreamingContextFactory factory = new JavaStreamingContextFactory() {
    public JavaStreamingContext create() {
      return createContext(filePath, checkpointDirectory);
    }
  };
  JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDirectory, factory);
  jssc.start();
  jssc.awaitTermination();
}
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("kafka-sandbox").setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000)); Set<String> topics = Collections.singleton("mytopic"); Map<String, String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", "sandbox.hortonworks.com:6667"); JavaPairInputDStream<String, byte[]> directKafkaStream = KafkaUtils.createDirectStream( ssc, String.class, byte[].class, StringDecoder.class, DefaultDecoder.class, kafkaParams, topics); directKafkaStream.foreachRDD( rdd -> { rdd.foreach( avroRecord -> { Schema.Parser parser = new Schema.Parser(); Schema schema = parser.parse(AvroVulabProducer.USER_SCHEMA); Injection<GenericRecord, byte[]> recordInjection = GenericAvroCodecs.toBinary(schema); GenericRecord record = recordInjection.invert(avroRecord._2).get(); System.out.println( "str1= " + record.get("str1") + ", str2= " + record.get("str2") + ", int1=" + record.get("int1")); }); }); ssc.start(); ssc.awaitTermination(); }
@After public void tearDown() { if (ssc != null) { ssc.stop(); ssc = null; } if (kafkaTestUtils != null) { kafkaTestUtils.teardown(); kafkaTestUtils = null; } }
public static void main(String[] args) {
  if (args.length != 3) {
    System.err.println("Usage: JavaFlumeEventCount <master> <host> <port>");
    System.exit(1);
  }
  String master = args[0];
  String host = args[1];
  int port = Integer.parseInt(args[2]);
  Duration batchInterval = new Duration(2000);
  JavaStreamingContext ssc = new JavaStreamingContext(
      master,
      "FlumeEventCount",
      batchInterval,
      System.getenv("SPARK_HOME"),
      JavaStreamingContext.jarOfClass(JavaFlumeEventCount.class));
  // Use the host passed on the command line rather than a hardcoded "localhost".
  JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, host, port);
  flumeStream.count();
  flumeStream
      .count()
      .map(new Function<Long, String>() {
        @Override
        public String call(Long in) {
          return "Received " + in + " flume events.";
        }
      })
      .print();
  ssc.start();
  ssc.awaitTermination();
}
public static void main(String[] args) throws Exception {
  if (args.length < 1) {
    System.err.println("Usage: JavaQueueStream <master>");
    System.exit(1);
  }
  StreamingExamples.setStreamingLogLevels();
  // Create the context
  JavaStreamingContext ssc = new JavaStreamingContext(
      args[0],
      "QueueStream",
      new Duration(1000),
      System.getenv("SPARK_HOME"),
      JavaStreamingContext.jarOfClass(JavaQueueStream.class));
  // Create the queue through which RDDs can be pushed to a QueueInputDStream
  Queue<JavaRDD<Integer>> rddQueue = new LinkedList<JavaRDD<Integer>>();
  // Create and push some RDDs into the queue
  List<Integer> list = Lists.newArrayList();
  for (int i = 0; i < 1000; i++) {
    list.add(i);
  }
  for (int i = 0; i < 30; i++) {
    rddQueue.add(ssc.sparkContext().parallelize(list));
  }
  // Create the QueueInputDStream and use it to do some processing
  JavaDStream<Integer> inputStream = ssc.queueStream(rddQueue);
  JavaPairDStream<Integer, Integer> mappedStream = inputStream.mapToPair(
      new PairFunction<Integer, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Integer i) {
          return new Tuple2<Integer, Integer>(i % 10, 1);
        }
      });
  JavaPairDStream<Integer, Integer> reducedStream = mappedStream.reduceByKey(
      new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
          return i1 + i2;
        }
      });
  reducedStream.print();
  ssc.start();
  ssc.awaitTermination();
}
public static void main(String[] args) {
  String master = System.getenv("MASTER");
  if (master == null) {
    master = "local[2]";
  }
  SparkConf conf = new SparkConf().setAppName("Voter Application").setMaster(master);
  Logger.getLogger("org").setLevel(Level.ERROR);
  Logger.getLogger("akka").setLevel(Level.ERROR);
  final Long batch_duration = Long.valueOf(args[0]);
  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(Integer.valueOf(args[0])));
  jssc.checkpoint(".");
  JavaReceiverInputDStream<String> votes = jssc.receiverStream(new Voter("localhost", 6789));
  // transform the text line stream to a PhoneCall stream
  JavaDStream<PhoneCall> phoneCalls = votes.map(new Function<String, PhoneCall>() {
    public PhoneCall call(String s) {
      return getPhoneCall(s);
    }
  });
  JavaDStream<Long> counts = votes.count();
  counts.print();
  // create the update function used to maintain the total call count for each phone number
  Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction =
      new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
        public Optional<Integer> call(List<Integer> values, Optional<Integer> state) {
          // add the new values to the previous running count to get the new count
          Integer sum = 0;
          for (Integer i : values) {
            sum += i;
          }
          Integer newSum = sum + state.or(0);
          return Optional.of(newSum);
        }
      };
  JavaPairDStream<Long, Integer> calls = phoneCalls.mapToPair(new PairFunction<PhoneCall, Long, Integer>() {
    public Tuple2<Long, Integer> call(PhoneCall x) {
      return new Tuple2<Long, Integer>(x.phoneNumber, 1);
    }
  });
  // generate the accumulated count for phone numbers
  final JavaPairDStream<Long, Integer> callNumberCounts = calls.updateStateByKey(updateFunction);
  // callNumberCounts.print();
  JavaPairDStream<Long, PhoneCall> pairVotes = phoneCalls.mapToPair(new PairFunction<PhoneCall, Long, PhoneCall>() {
    public Tuple2<Long, PhoneCall> call(PhoneCall call) throws Exception {
      return new Tuple2<Long, PhoneCall>(call.voteId, call);
    }
  });
  // keep only the phone numbers that are still allowed to vote
  JavaPairDStream<Long, Integer> allowedCalls = callNumberCounts.filter(new Function<Tuple2<Long, Integer>, Boolean>() {
    public Boolean call(Tuple2<Long, Integer> v1) throws Exception {
      if (v1._2() > Voter.MAX_VOTES) return false;
      return true;
    }
  });
  // allowedCalls.print();
  // keep only the calls with a valid contestant number
  JavaDStream<PhoneCall> validContestantPhoneCalls = phoneCalls.filter(new Function<PhoneCall, Boolean>() {
    public Boolean call(PhoneCall call) {
      if (call.contestantNumber > Voter.NUM_CONTESTANTS) return false;
      return true;
    }
  });
  JavaPairDStream<Long, PhoneCall> anotherTemporyPhoneCalls =
      validContestantPhoneCalls.mapToPair(new PairFunction<PhoneCall, Long, PhoneCall>() {
        public Tuple2<Long, PhoneCall> call(PhoneCall x) {
          return new Tuple2<Long, PhoneCall>(x.phoneNumber, x);
        }
      });
  // join to obtain the valid phone call records
  JavaPairDStream<Long, Tuple2<PhoneCall, Integer>> validatePhoneCalls =
      anotherTemporyPhoneCalls.join(allowedCalls);
  // validatePhoneCalls.print();
  JavaDStream<PhoneCall> validateCalls = validatePhoneCalls.transform(
      new Function<JavaPairRDD<Long, Tuple2<PhoneCall, Integer>>, JavaRDD<PhoneCall>>() {
        public JavaRDD<PhoneCall> call(JavaPairRDD<Long, Tuple2<PhoneCall, Integer>> v1) throws Exception {
          JavaRDD<PhoneCall> item = v1.map(new Function<Tuple2<Long, Tuple2<PhoneCall, Integer>>, PhoneCall>() {
            public PhoneCall call(Tuple2<Long, Tuple2<PhoneCall, Integer>> validItem) throws Exception {
              return validItem._2()._1();
            }
          });
          return item;
        }
      });
  // validateCalls.print();
  // save all votes to redis
  validateCalls.foreachRDD(new Function<JavaRDD<PhoneCall>, Void>() {
    public Void call(JavaRDD<PhoneCall> rdd) throws Exception {
      rdd.foreach(new VoidFunction<PhoneCall>() {
        public void call(PhoneCall call) throws Exception {
          // System.out.println(call.toString());
          String key = String.valueOf(call.voteId);
          String value = call.getContent();
          // save <key, value> using redis
          JedisPool pool = new JedisPool(new JedisPoolConfig(), "localhost");
          Jedis jedis = pool.getResource();
          try {
            jedis.set(key, value);
          } finally {
            if (null != jedis) {
              jedis.close();
            }
          }
          // ... when closing your application:
          pool.destroy();
        }
      });
      return null;
    }
  });
  // valid calls
  JavaPairDStream<Integer, Integer> contestantVotes =
      validateCalls.mapToPair(new PairFunction<PhoneCall, Integer, Integer>() {
        public Tuple2<Integer, Integer> call(PhoneCall x) {
          return new Tuple2<Integer, Integer>(x.contestantNumber, 1);
        }
      });
  // use a window to generate the leaderboard
  Integer size = Integer.valueOf(args[1]);
  Integer slide = Integer.valueOf(args[2]);
  JavaDStream<PhoneCall> windowCalls = validateCalls.window(new Duration(size), new Duration(slide));
  // windowCalls.print();
  // generate the per-window contestant count
  JavaPairDStream<Integer, Integer> windowContestantNums =
      windowCalls.mapToPair(new PairFunction<PhoneCall, Integer, Integer>() {
        public Tuple2<Integer, Integer> call(PhoneCall x) {
          return new Tuple2<Integer, Integer>(x.contestantNumber, 1);
        }
      });
  JavaPairDStream<Integer, Integer> windContestantCounts =
      windowContestantNums.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer i1, Integer i2) throws Exception {
          return i1 + i2;
        }
      });
  windContestantCounts.print();
  // generate the accumulated count for contestants
  JavaPairDStream<Integer, Integer> totalContestantCounts = contestantVotes.updateStateByKey(updateFunction);
  // used for sorting
  PairFunction<Tuple2<Integer, Integer>, Integer, Integer> swapFunction =
      new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
        public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> in) {
          return in.swap();
        }
      };
  JavaPairDStream<Integer, Integer> swappedTotalContestantCounts = totalContestantCounts.mapToPair(swapFunction);
  JavaPairDStream<Integer, Integer> sortedTotalContestantCounts =
      swappedTotalContestantCounts.transformToPair(
          new Function<JavaPairRDD<Integer, Integer>, JavaPairRDD<Integer, Integer>>() {
            public JavaPairRDD<Integer, Integer> call(JavaPairRDD<Integer, Integer> in) throws Exception {
              return in.sortByKey(false);
            }
          });
  sortedTotalContestantCounts.print();
  // compute some statistics
  phoneCalls.foreachRDD(new Function<JavaRDD<PhoneCall>, Void>() {
    public Void call(JavaRDD<PhoneCall> rdd) throws Exception {
      Long count = rdd.count();
      // System.out.println("count : " + count);
      Double throughput = (count.doubleValue() * 1000 / batch_duration.doubleValue());
      System.out.println("Current rate = " + throughput + " records / second");
      XMemcachedClientBuilder builder = new XMemcachedClientBuilder(AddrUtil.getAddresses("localhost:11211"));
      XMemcachedClient client = (XMemcachedClient) builder.build();
      client.setPrimitiveAsString(true);
      Long currentTimeStamp = System.currentTimeMillis();
      // System.out.println("End time: " + currentTimeStamp);
      client.add(currentTimeStamp.toString(), 0, throughput);
      return null;
    }
  });
  jssc.start();            // Start the computation
  jssc.awaitTermination(); // Wait for the computation to terminate
}
public static void main(String[] args) { if (args.length < 4) { System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount"); // sparkConf.setMaster("spark://60f81dc6426c:7077"); // SparkConf sparkConf = new // SparkConf().setAppName("JavaKafkaWordCount").setMaster("spark://60f81dc6426c:7077"); // Create the context with a 1 second batch size JavaStreamingContext jssc = new JavaStreamingContext("local[4]", "JavaKafkaWordCount", new Duration(2000)); int numThreads = Integer.parseInt(args[3]); Logger.getLogger("org").setLevel(Level.OFF); Logger.getLogger("akka").setLevel(Level.OFF); Map<String, Integer> topicMap = new HashMap<String, Integer>(); String[] topics = args[2].split(","); for (String topic : topics) { topicMap.put(topic, numThreads); } /* for(String t: topic) { topicMap.put(t, new Integer(3)); }*/ // NotSerializable notSerializable = new NotSerializable(); // JavaRDD<String> rdd = sc.textFile("/tmp/myfile"); // rdd.map(s -> notSerializable.doSomething(s)).collect(); JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, args[0], args[1], topicMap); // JavaPairReceiverInputDStream<String, String> kafkaStream = // KafkaUtils.createStream(jssc, "localhost:2181","streamingContext", // topicMap); System.out.println("Connection !!!!"); /*JavaDStream<String> data = messages.map(new Function<Tuple2<String, String>, String>() { public String call(Tuple2<String, String> message) { return message._2(); } } );*/ JavaDStream<String> lines = messages.map( new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> tuple2) { return tuple2._2(); } }); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String x) { return Lists.newArrayList(SPACE.split(x)); } }); JavaPairDStream<String, Integer> wordCounts = words .mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }) .reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
    System.exit(1);
  }
  StreamingExamples.setStreamingLogLevels();
  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  // Create a JavaReceiverInputDStream on the target ip:port and count the
  // words in the input stream of \n delimited text (e.g. generated by 'nc').
  // The storage level skips replication only because this runs locally;
  // replication is necessary in a distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines =
      ssc.socketTextStream(args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  // Convert RDDs of the words DStream to DataFrame and run a SQL query
  words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    @Override
    public void call(JavaRDD<String> rdd, Time time) {
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
      // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
      JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
          JavaRecord record = new JavaRecord();
          record.setWord(word);
          return record;
        }
      });
      Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);
      // Register as a table
      wordsDataFrame.createOrReplaceTempView("words");
      // Do word count on the table using SQL and print it
      Dataset<Row> wordCountsDataFrame =
          spark.sql("select word, count(*) as total from words group by word");
      System.out.println("========= " + time + "=========");
      wordCountsDataFrame.show();
    }
  });
  ssc.start();
  ssc.awaitTermination();
}
public static void main(String[] args) {
  // Create the context with a 10 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("Assignment");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(10000));
  // Create a JavaReceiverInputDStream on the target ip:port and count the
  // words in the input stream of \n delimited text (e.g. generated by 'nc').
  // The storage level skips replication only because this runs locally;
  // replication is necessary in a distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
      "localhost", Integer.parseInt("9999"), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterable<String> call(String x) {
      // Collect every match of SPACE in the line, lowercased.
      List<String> allMatches = new ArrayList<String>();
      Matcher matcher = SPACE.matcher(x);
      while (matcher.find()) {
        allMatches.add(matcher.group().toLowerCase());
      }
      return Lists.newArrayList(allMatches.toArray(new String[0]));
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String s) {
      return new Tuple2<String, Integer>(s, 1);
    }
  });
  // Reduce function adding two integers, defined separately for clarity
  Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer i1, Integer i2) throws Exception {
      return i1 + i2;
    }
  };
  // Count words over a 30-second window, sliding every 10 seconds.
  JavaPairDStream<String, Integer> windowedWordCounts =
      wordCounts.reduceByKeyAndWindow(reduceFunc, new Duration(30000), new Duration(10000));
  windowedWordCounts.print();
  ssc.start();
  ssc.awaitTermination();
}
public void run() { System.setProperty("spark.hadoop.dfs.replication", "2"); Logger.getLogger("org").setLevel(Level.OFF); Logger.getLogger("akka").setLevel(Level.OFF); SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance"); conf.set("spark.master", PropertiesStack.getProperty("spark.master")); conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory")); conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory")); conf.set( "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize")); // .setAppName("WindowingKafkaWordCountWithoutFaultTolerance"); JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10)); HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic())); HashMap<String, String> kafkaParams = new HashMap<String, String>(); kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers()); kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect()); kafkaParams.put("auto.offset.reset", "smallest"); kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId()); kafkaParams.put("auto.commit.enable", "false"); Map<String, Integer> topicMap = new HashMap<String, Integer>(); topicMap.put(PropertiesStack.getKafkaTopic(), 1); // Map<kafka.common.TopicAndPartition, java.lang.Long> fromOffsets = new HashMap<>(); // fromOffsets.put(new TopicAndPartition(PropertiesStack.getKafkaTopic(), // 1), 1000L); // Create direct kafka stream with brokers and topics // JavaInputDStream<String> messages = KafkaUtils // .createDirectStream( // jssc, // String.class, // String.class, // StringDecoder.class, // StringDecoder.class, // String.class, // kafkaParams, // fromOffsets, // new Function<kafka.message.MessageAndMetadata<String, String>, String>() { // @Override // public String call( // MessageAndMetadata<String, String> v1) // throws Exception { // return v1.message(); // } // }); JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream( jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); messages.count().print(); // .createStream(jssc, PropertiesStack.getZookeeperConnect(), // PropertiesStack.getKafkaGroupId(), topicMap); // Start the computation jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println( "Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" + " <brokers> is a list of one or more Kafka brokers\n" + " <topics> is a list of one or more kafka topics to consume from\n\n"); System.exit(1); } String brokers = args[0]; String topics = args[1]; // Create context with a 2 seconds batch interval SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(","))); Map<String, String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", brokers); // Create direct kafka stream with brokers and topics JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream( jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); // Get the lines, split them into words, count the words and print JavaDStream<String> lines = messages.map( new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> tuple2) { return tuple2._2(); } }); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String x) { return Arrays.asList(SPACE.split(x)); } }); JavaPairDStream<String, Integer> wordCounts = words .mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }) .reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); // Start the computation jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) { if (args.length < 5) { System.out.println( "Usage: StreamProducerJava <infinispan_host> <twitter4j.oauth.consumerKey> <twitter4j.oauth.consumerSecret> <twitter4j.oauth.accessToken> <twitter4j.oauth.accessTokenSecret>"); System.exit(1); } String infinispanHost = args[0]; System.setProperty("twitter4j.oauth.consumerKey", args[1]); System.setProperty("twitter4j.oauth.consumerSecret", args[2]); System.setProperty("twitter4j.oauth.accessToken", args[3]); System.setProperty("twitter4j.oauth.accessTokenSecret", args[4]); // Reduce the log level in the driver Logger.getLogger("org").setLevel(Level.WARN); SparkConf conf = new SparkConf().setAppName("spark-infinispan-stream-producer-java"); // Create the streaming context JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Seconds.apply(1)); // Populate infinispan properties Properties infinispanProperties = new Properties(); infinispanProperties.put("infinispan.client.hotrod.server_list", infinispanHost); JavaReceiverInputDStream<Status> twitterDStream = TwitterUtils.createStream(javaStreamingContext); // Transform from twitter4j.Status to our domain model org.infinispan.spark.demo.twitter.Tweet JavaDStream<Tuple2<Long, Tweet>> kvPair = twitterDStream.map( status -> new Tuple2<>( status.getId(), new Tweet( status.getId(), status.getUser().getScreenName(), Optional.ofNullable(status.getPlace()) .map(Place::getCountry) .orElseGet(() -> "N/A"), status.getRetweetCount(), status.getText()))); // Write the stream to infinispan InfinispanJavaDStream.writeToInfinispan(kvPair, infinispanProperties); // Create InfinispanInputDStream JavaInputDStream<Tuple3<Long, Tweet, ClientEvent.Type>> infinispanInputDStream = InfinispanJavaDStream.createInfinispanInputDStream( javaStreamingContext, MEMORY_ONLY(), infinispanProperties); // Apply a transformation to the RDDs to aggregate by country JavaPairDStream<String, Integer> countryDStream = infinispanInputDStream.transformToPair( rdd -> { return rdd.filter(ev -> !ev._2().getCountry().equals("N/A")) .mapToPair(event -> new Tuple2<>(event._2().getCountry(), 1)) .reduceByKey((a, b) -> a + b); }); // Since we are interested in the last 60 seconds only, we restrict the DStream by window, // collapsing all the RDDs: JavaPairDStream<String, Integer> lastMinuteStream = countryDStream.reduceByKeyAndWindow((a, b) -> a + b, new Duration(60 * 1000)); lastMinuteStream.foreachRDD( (rdd, time) -> { System.out.format("---------- %s ----------\n", time.toString()); List<Tuple2<String, Integer>> results = rdd.collect(); results .stream() .sorted((o1, o2) -> o2._2().compareTo(o1._2())) .forEach(t -> System.out.format("[%s,%d]\n", t._1(), t._2())); return null; }); // Start the processing javaStreamingContext.start(); javaStreamingContext.awaitTermination(); }
public static void main(String[] args) {
  Logger logger = Logger.getRootLogger();
  logger.setLevel(Level.OFF);
  // Twitter OAuth credentials (placeholders; supply your own keys rather than hardcoding secrets).
  String consumerKey = "<consumerKey>";
  String consumerSecret = "<consumerSecret>";
  String accessToken = "<accessToken>";
  String accessTokenSecret = "<accessTokenSecret>";
  System.setProperty("twitter4j.oauth.consumerKey", consumerKey);
  System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret);
  System.setProperty("twitter4j.oauth.accessToken", accessToken);
  System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret);
  String[] filters = {"bulling", "bullied", "bullyed", "bully", "teased"};
  SparkConf sparkConf = new SparkConf().setAppName("bullyhunter");
  System.out.println("Started bullyhunter...");
  JavaStreamingContext sc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
  JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(sc, filters);
  JavaDStream<String> text = stream.map(new Function<Status, String>() {
    public String call(Status status) {
      // String msg = status.getText();
      // String filtered_msg = Enrichment.filter(msg);
      // if (filtered_msg == null) {
      //   return null;
      // }
      // TweetRecord tr = new TweetRecord();
      // tr.setMsg(filtered_msg);
      // // tr.setGeo(status.getGeoLocation().getLatitude());
      // String fullName = status.getPlace().getFullName();
      // if (fullName == null)
      //   return null;
      // String[] fields = fullName.split(DELIMITER);
      // tr.setCity(fullName.split());
      String msg = status.getText();
      double ind = Classification.classifyTweet(msg);
      if (ind > 0) {
        return status.getText();
      } else {
        return null;
      }
    }
  });
  // text = text.filter(new Function<String, Boolean>() {
  //   public Boolean call(String msg) {
  //     boolean containKeyword = false;
  //     String lowerCase = msg.toLowerCase();
  //     for (String k : keywords)
  //       if (lowerCase.contains(k)) {
  //         containKeyword = true;
  //         break;
  //       }
  //     if (containKeyword == true && lowerCase.contains("bull")
  //         && !lowerCase.contains("RT")) {
  //       return true;
  //     }
  //     return false;
  //   }
  // });
  // Drop tweets that the classifier rejected (mapped to null above).
  text = text.filter(new Function<String, Boolean>() {
    public Boolean call(String msg) {
      return msg != null;
    }
  });
  text.print();
  sc.start();
  sc.awaitTermination();
}
@Test public void testKafkaStream() throws InterruptedException { final String topic1 = "topic1"; final String topic2 = "topic2"; // hold a reference to the current offset ranges, so it can be used downstream final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>(); String[] topic1data = createTopicAndSendData(topic1); String[] topic2data = createTopicAndSendData(topic2); Set<String> sent = new HashSet<>(); sent.addAll(Arrays.asList(topic1data)); sent.addAll(Arrays.asList(topic2data)); Map<String, String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress()); kafkaParams.put("auto.offset.reset", "smallest"); JavaDStream<String> stream1 = KafkaUtils.createDirectStream( ssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicToSet(topic1)) .transformToPair( // Make sure you can get offset ranges from the rdd new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() { @Override public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) { OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); offsetRanges.set(offsets); Assert.assertEquals(topic1, offsets[0].topic()); return rdd; } }) .map( new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> kv) { return kv._2(); } }); JavaDStream<String> stream2 = KafkaUtils.createDirectStream( ssc, String.class, String.class, StringDecoder.class, StringDecoder.class, String.class, kafkaParams, topicOffsetToMap(topic2, 0L), new Function<MessageAndMetadata<String, String>, String>() { @Override public String call(MessageAndMetadata<String, String> msgAndMd) { return msgAndMd.message(); } }); JavaDStream<String> unifiedStream = stream1.union(stream2); final Set<String> result = Collections.synchronizedSet(new HashSet<String>()); unifiedStream.foreachRDD( new VoidFunction<JavaRDD<String>>() { @Override public void call(JavaRDD<String> rdd) { result.addAll(rdd.collect()); for (OffsetRange o : offsetRanges.get()) { System.out.println( o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()); } } }); ssc.start(); long startTime = System.currentTimeMillis(); boolean matches = false; while (!matches && System.currentTimeMillis() - startTime < 20000) { matches = sent.size() == result.size(); Thread.sleep(50); } Assert.assertEquals(sent, result); ssc.stop(); }