public static void main(String[] args) { SparkConf conf = new SparkConf() .setMaster("local[4]") .setAppName("SparkStreamingPullDataFromFlume for Java"); JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(30)); // JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createStream(jsc,"master1", // 9999); flume push data to Spark Streaming JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createPollingStream( jsc, "master1", 9999); // Spark Streaming pull data from flume JavaDStream<String> words = lines.flatMap( new FlatMapFunction<SparkFlumeEvent, String>() { private static final long serialVersionUID = 1L; @Override public Iterable<String> call(SparkFlumeEvent event) throws Exception { String line = new String(event.event().getBody().array()); return Arrays.asList(line.split(" ")); } }); JavaPairDStream<String, Integer> pairs = words.mapToPair( new PairFunction<String, String, Integer>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, Integer> call(String word) throws Exception { return new Tuple2<String, Integer>(word, 1); } }); JavaPairDStream<String, Integer> wordsCount = pairs.reduceByKey( new Function2< Integer, Integer, Integer>() { // 对相同的Key,进行Value的累计(包括Local和Reducer级别同时Reduce) private static final long serialVersionUID = 1L; @Override public Integer call(Integer v1, Integer v2) throws Exception { return v1 + v2; } }); wordsCount.print(); jsc.start(); jsc.awaitTermination(); jsc.close(); }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaNetworkWordCount <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); // Create a JavaReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); // Convert RDDs of the words DStream to DataFrame and run SQL query words.foreachRDD( new VoidFunction2<JavaRDD<String>, Time>() { @Override public void call(JavaRDD<String> rdd, Time time) { SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf()); // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame JavaRDD<JavaRecord> rowRDD = rdd.map( new Function<String, JavaRecord>() { @Override public JavaRecord call(String word) { JavaRecord record = new JavaRecord(); record.setWord(word); return record; } }); Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class); // Register as table wordsDataFrame.createOrReplaceTempView("words"); // Do word count on table using SQL and print it Dataset<Row> wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word"); System.out.println("========= " + time + "========="); wordCountsDataFrame.show(); } }); ssc.start(); ssc.awaitTermination(); }
public static void main(String[] args) { if (args.length < 5) { System.out.println( "Usage: StreamProducerJava <infinispan_host> <twitter4j.oauth.consumerKey> <twitter4j.oauth.consumerSecret> <twitter4j.oauth.accessToken> <twitter4j.oauth.accessTokenSecret>"); System.exit(1); } String infinispanHost = args[0]; System.setProperty("twitter4j.oauth.consumerKey", args[1]); System.setProperty("twitter4j.oauth.consumerSecret", args[2]); System.setProperty("twitter4j.oauth.accessToken", args[3]); System.setProperty("twitter4j.oauth.accessTokenSecret", args[4]); // Reduce the log level in the driver Logger.getLogger("org").setLevel(Level.WARN); SparkConf conf = new SparkConf().setAppName("spark-infinispan-stream-producer-java"); // Create the streaming context JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Seconds.apply(1)); // Populate infinispan properties Properties infinispanProperties = new Properties(); infinispanProperties.put("infinispan.client.hotrod.server_list", infinispanHost); JavaReceiverInputDStream<Status> twitterDStream = TwitterUtils.createStream(javaStreamingContext); // Transform from twitter4j.Status to our domain model org.infinispan.spark.demo.twitter.Tweet JavaDStream<Tuple2<Long, Tweet>> kvPair = twitterDStream.map( status -> new Tuple2<>( status.getId(), new Tweet( status.getId(), status.getUser().getScreenName(), Optional.ofNullable(status.getPlace()) .map(Place::getCountry) .orElseGet(() -> "N/A"), status.getRetweetCount(), status.getText()))); // Write the stream to infinispan InfinispanJavaDStream.writeToInfinispan(kvPair, infinispanProperties); // Create InfinispanInputDStream JavaInputDStream<Tuple3<Long, Tweet, ClientEvent.Type>> infinispanInputDStream = InfinispanJavaDStream.createInfinispanInputDStream( javaStreamingContext, MEMORY_ONLY(), infinispanProperties); // Apply a transformation to the RDDs to aggregate by country JavaPairDStream<String, Integer> countryDStream = infinispanInputDStream.transformToPair( rdd -> { return rdd.filter(ev -> !ev._2().getCountry().equals("N/A")) .mapToPair(event -> new Tuple2<>(event._2().getCountry(), 1)) .reduceByKey((a, b) -> a + b); }); // Since we are interested in the last 60 seconds only, we restrict the DStream by window, // collapsing all the RDDs: JavaPairDStream<String, Integer> lastMinuteStream = countryDStream.reduceByKeyAndWindow((a, b) -> a + b, new Duration(60 * 1000)); lastMinuteStream.foreachRDD( (rdd, time) -> { System.out.format("---------- %s ----------\n", time.toString()); List<Tuple2<String, Integer>> results = rdd.collect(); results .stream() .sorted((o1, o2) -> o2._2().compareTo(o1._2())) .forEach(t -> System.out.format("[%s,%d]\n", t._1(), t._2())); return null; }); // Start the processing javaStreamingContext.start(); javaStreamingContext.awaitTermination(); }
public static void main(String[] args) { String master = System.getenv("MASTER"); if (master == null) { master = "local[2]"; } SparkConf conf = new SparkConf().setAppName("Voter Application").setMaster(master); Logger.getLogger("org").setLevel(Level.ERROR); Logger.getLogger("akka").setLevel(Level.ERROR); final Long batch_duration = Long.valueOf(args[0]); JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(Integer.valueOf(args[0]))); jssc.checkpoint("."); JavaReceiverInputDStream<String> votes = jssc.receiverStream(new Voter("localhost", 6789)); // transform text line stream to PhoneCall stream JavaDStream<PhoneCall> phoneCalls = votes.map( new Function<String, PhoneCall>() { public PhoneCall call(String s) { return getPhoneCall(s); } }); JavaDStream<Long> counts = votes.count(); counts.print(); // create updateFunction which is used to update the total call count for each phone number Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction = new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() { public Optional<Integer> call(List<Integer> values, Optional<Integer> state) { // add the new values with the previous running count to get the // new count Integer sum = 0; for (Integer i : values) { sum += i; } Integer newSum = sum + state.or(0); return Optional.of(newSum); } }; // JavaPairDStream<Long, Integer> calls = phoneCalls.mapToPair( new PairFunction<PhoneCall, Long, Integer>() { public Tuple2<Long, Integer> call(PhoneCall x) { return new Tuple2<Long, Integer>(x.phoneNumber, 1); } }); // generate the accumulated count for phone numbers final JavaPairDStream<Long, Integer> callNumberCounts = calls.updateStateByKey(updateFunction); // callNumberCounts.print(); JavaPairDStream<Long, PhoneCall> pairVotes = phoneCalls.mapToPair( new PairFunction<PhoneCall, Long, PhoneCall>() { public Tuple2<Long, PhoneCall> call(PhoneCall call) throws Exception { return new Tuple2<Long, PhoneCall>(call.voteId, call); } }); // generate the validate phone numbers, which is still allowed to send vote JavaPairDStream<Long, Integer> allowedCalls = callNumberCounts.filter( new Function<Tuple2<Long, Integer>, Boolean>() { public Boolean call(Tuple2<Long, Integer> v1) throws Exception { if (v1._2() > Voter.MAX_VOTES) return false; return true; } }); // allowedCalls.print(); // get validate contestant phone calls JavaDStream<PhoneCall> validContestantPhoneCalls = phoneCalls.filter( new Function<PhoneCall, Boolean>() { public Boolean call(PhoneCall call) { if (call.contestantNumber > Voter.NUM_CONTESTANTS) return false; return true; } }); JavaPairDStream<Long, PhoneCall> anotherTemporyPhoneCalls = validContestantPhoneCalls.mapToPair( new PairFunction<PhoneCall, Long, PhoneCall>() { public Tuple2<Long, PhoneCall> call(PhoneCall x) { return new Tuple2<Long, PhoneCall>(x.phoneNumber, x); } }); // get validate phone call records JavaPairDStream<Long, Tuple2<PhoneCall, Integer>> validatePhoneCalls = anotherTemporyPhoneCalls.join(allowedCalls); // validatePhoneCalls.print(); JavaDStream<PhoneCall> validateCalls = validatePhoneCalls.transform( new Function<JavaPairRDD<Long, Tuple2<PhoneCall, Integer>>, JavaRDD<PhoneCall>>() { public JavaRDD<PhoneCall> call(JavaPairRDD<Long, Tuple2<PhoneCall, Integer>> v1) throws Exception { JavaRDD<PhoneCall> item = v1.map( new Function<Tuple2<Long, Tuple2<PhoneCall, Integer>>, PhoneCall>() { public PhoneCall call(Tuple2<Long, Tuple2<PhoneCall, Integer>> validItem) throws Exception { return validItem._2()._1(); } }); return item; } }); // validateCalls.print(); // save all votes with redis validateCalls.foreachRDD( new Function<JavaRDD<PhoneCall>, Void>() { public Void call(JavaRDD<PhoneCall> rdd) throws Exception { rdd.foreach( new VoidFunction<PhoneCall>() { public void call(PhoneCall call) throws Exception { // System.out.println(call.toString()); String key = String.valueOf(call.voteId); String value = call.getContent(); // save <key,value> using redis JedisPool pool = new JedisPool(new JedisPoolConfig(), "localhost"); Jedis jedis = pool.getResource(); try { jedis.set(key, value); } finally { if (null != jedis) { jedis.close(); } } /// ... when closing your application: pool.destroy(); } }); return null; } }); // validate calls JavaPairDStream<Integer, Integer> contestantVotes = validateCalls.mapToPair( new PairFunction<PhoneCall, Integer, Integer>() { public Tuple2<Integer, Integer> call(PhoneCall x) { return new Tuple2<Integer, Integer>(x.contestantNumber, 1); } }); // use window to get generate leaderboard Integer size = Integer.valueOf(args[1]); Integer slide = Integer.valueOf(args[2]); JavaDStream<PhoneCall> windowCalls = validateCalls.window(new Duration(size), new Duration(slide)); // windowCalls.print(); // generate window contestant count JavaPairDStream<Integer, Integer> windowContestantNums = windowCalls.mapToPair( new PairFunction<PhoneCall, Integer, Integer>() { public Tuple2<Integer, Integer> call(PhoneCall x) { return new Tuple2<Integer, Integer>(x.contestantNumber, 1); } }); JavaPairDStream<Integer, Integer> windContestantCounts = windowContestantNums.reduceByKey( new Function2<Integer, Integer, Integer>() { public Integer call(Integer i1, Integer i2) throws Exception { return i1 + i2; } }); windContestantCounts.print(); // generate the accumulated count for contestants JavaPairDStream<Integer, Integer> totalContestantCounts = contestantVotes.updateStateByKey(updateFunction); // used for sorting PairFunction<Tuple2<Integer, Integer>, Integer, Integer> swapFunction = new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() { public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> in) { return in.swap(); } }; JavaPairDStream<Integer, Integer> swappedTotalContestantCounts = totalContestantCounts.mapToPair(swapFunction); JavaPairDStream<Integer, Integer> sortedTotalContestantCounts = swappedTotalContestantCounts.transformToPair( new Function<JavaPairRDD<Integer, Integer>, JavaPairRDD<Integer, Integer>>() { public JavaPairRDD<Integer, Integer> call(JavaPairRDD<Integer, Integer> in) throws Exception { return in.sortByKey(false); } }); sortedTotalContestantCounts.print(); // make some statistics phoneCalls.foreachRDD( new Function<JavaRDD<PhoneCall>, Void>() { public Void call(JavaRDD<PhoneCall> rdd) throws Exception { Long count = rdd.count(); // System.out.println( "count : " + count ); Double throughput = (count.doubleValue() * 1000 / batch_duration.doubleValue()); System.out.println("Current rate = " + throughput + " records / second"); XMemcachedClientBuilder builder = new XMemcachedClientBuilder(AddrUtil.getAddresses("localhost:11211")); XMemcachedClient client = (XMemcachedClient) builder.build(); client.setPrimitiveAsString(true); Long currentTimeStamp = System.currentTimeMillis(); // System.out.println("End time: " + currentTimeStamp); client.add(currentTimeStamp.toString(), 0, throughput); return null; } }); jssc.start(); // Start the computation jssc.awaitTermination(); // Wait for the computation to terminate }
public static void main(String[] args) { // Create the context with a 10 second batch size SparkConf sparkConf = new SparkConf().setAppName("Assignment"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(10000)); // Create a JavaReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. JavaReceiverInputDStream<String> lines = ssc.socketTextStream( "localhost", Integer.parseInt("9999"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String x) { List<String> allMatches = new ArrayList<String>(); Matcher matcher = SPACE.matcher(x); while (matcher.find()) { allMatches.add(matcher.group().toLowerCase()); } return Lists.newArrayList(allMatches.toArray(new String[0])); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }); // Reduce function adding two integers, defined separately for clarity Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) throws Exception { return i1 + i2; } }; JavaPairDStream<String, Integer> windowedWordCounts = wordCounts.reduceByKeyAndWindow(reduceFunc, new Duration(30000), new Duration(10000)); windowedWordCounts.print(); ssc.start(); ssc.awaitTermination(); }
public static void main(String[] args) { Logger logger = Logger.getRootLogger(); logger.setLevel(Level.OFF); String consumerKey = "JqQ1lAWg90PVD9U8XoDWedCm8"; String consumerSecret = "QaUe7V9HuYQvC031MVqpUuuP2OjieI0BBDEHLpFOR221zjQ0xp"; String accessToken = "3299869044-UVd8CwTfnDgcGFGPro2yGXKWhArKtXRxC6iekmH"; String accessTokenSecret = "3XtGQi1naI1V9wCVs2aQgEeVWr65vXDczOwGvqa3iGlEG"; System.setProperty("twitter4j.oauth.consumerKey", consumerKey); System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret); System.setProperty("twitter4j.oauth.accessToken", accessToken); System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret); String[] filters = {"bulling", "bullied", "bulling", "bullyed", "bully", "teased"}; SparkConf sparkConf = new SparkConf().setAppName("bullyhunter"); System.out.println("Started bullyhunter..."); JavaStreamingContext sc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(sc, filters); JavaDStream<String> text = stream.map( new Function<Status, String>() { public String call(Status status) { // String msg = status.getText(); // String filtered_msg = Enrichment.filter(msg); // if (filtered_msg == null) { // return null; // } // TweetRecord tr = new TweetRecord(); // tr.setMsg(filtered_msg); // //tr.setGeo(status.getGeoLocation().getLatitude()); // String fullName = status.getPlace().getFullName(); // if (fullName == null) // return null; // String[] fields = fullName.spilt(DELIMITER); // tr.setCity(fullName.split()); String msg = status.getText(); double ind = Classification.classifyTweet(msg); if (ind > 0) { return status.getText(); } else { return null; } } }); // text = text.filter(new Function<String, Boolean>() { // public Boolean call(String msg) { // boolean containKeyword = false; // String lowerCase = msg.toLowerCase(); // for (String k : keywords) // if (lowerCase.contains(k)) { // containKeyword = true; // break; // } // if (containKeyword == true && lowerCase.contains("bull") // && !lowerCase.contains("RT")) { // return true; // } // return false; // } // // }); text = text.filter( new Function<String, Boolean>() { public Boolean call(String msg) { return (msg == null) ? false : true; } }); text.print(); sc.start(); sc.awaitTermination(); }