public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Example").setMaster("local[2]"); // Create a StreamingContext with a 1-second batch size from a SparkConf JavaStreamingContext jssc = new JavaStreamingContext( conf, Durations.seconds(10)); // Create a DStream from all the input on port 7777 JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777); // Filter our DStream for lines with "error" // Split up into words. final JavaDStream<String> wordDStream = lines.flatMap( new FlatMapFunction<String, String>() { public Iterable<String> call(String x) { return Arrays.asList(x.split(" ")); } }); final JavaPairDStream<String, Integer> wordPairDStream = wordDStream.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) throws Exception { return new Tuple2<String, Integer>(s, 1); } }); final JavaPairDStream<String, Integer> totalWordPairDStream = wordPairDStream.reduceByKeyAndWindow( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer a, Integer b) throws Exception { return a + b; } }, Durations.seconds(30), Durations.seconds(20)); // totalWordPairDStream.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() // { // @Override // public Void call(JavaPairRDD<String, Integer> wordPairRDD, Time time) throws // Exception { // // final List<Tuple2<String, Integer>> collect = wordPairRDD.collect(); // for (Tuple2<String, Integer> t : collect) { // System.out.println("the value is t" + t._1() + "," + t._2()); // } // // return null; // } // }); totalWordPairDStream.print(); // Start our streaming context and wait for it to "finish" jssc.start(); // Wait for the job to finish jssc.awaitTermination(); }
public static void main(String[] args) { SparkConf conf = new SparkConf() .setMaster("local[4]") .setAppName("SparkStreamingPullDataFromFlume for Java"); JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(30)); // JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createStream(jsc,"master1", // 9999); flume push data to Spark Streaming JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createPollingStream( jsc, "master1", 9999); // Spark Streaming pull data from flume JavaDStream<String> words = lines.flatMap( new FlatMapFunction<SparkFlumeEvent, String>() { private static final long serialVersionUID = 1L; @Override public Iterable<String> call(SparkFlumeEvent event) throws Exception { String line = new String(event.event().getBody().array()); return Arrays.asList(line.split(" ")); } }); JavaPairDStream<String, Integer> pairs = words.mapToPair( new PairFunction<String, String, Integer>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, Integer> call(String word) throws Exception { return new Tuple2<String, Integer>(word, 1); } }); JavaPairDStream<String, Integer> wordsCount = pairs.reduceByKey( new Function2< Integer, Integer, Integer>() { // 对相同的Key,进行Value的累计(包括Local和Reducer级别同时Reduce) private static final long serialVersionUID = 1L; @Override public Integer call(Integer v1, Integer v2) throws Exception { return v1 + v2; } }); wordsCount.print(); jsc.start(); jsc.awaitTermination(); jsc.close(); }
public static void main(String[] args) throws IOException { SparkConf conf = new SparkConf().setAppName("faebookStream"); JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(15)); JavaDStream<String> stream = jssc.textFileStream("/Users/sboynenpalli/Desktop/shashankOfficeMackbook/sparkinputfolder"); stream.print(10); ProcessVideoStreamData processData = new ProcessVideoStreamData(); processData.processData(stream); jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaNetworkWordCount <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); // Create a JavaReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); // Convert RDDs of the words DStream to DataFrame and run SQL query words.foreachRDD( new VoidFunction2<JavaRDD<String>, Time>() { @Override public void call(JavaRDD<String> rdd, Time time) { SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf()); // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame JavaRDD<JavaRecord> rowRDD = rdd.map( new Function<String, JavaRecord>() { @Override public JavaRecord call(String word) { JavaRecord record = new JavaRecord(); record.setWord(word); return record; } }); Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class); // Register as table wordsDataFrame.createOrReplaceTempView("words"); // Do word count on table using SQL and print it Dataset<Row> wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word"); System.out.println("========= " + time + "========="); wordCountsDataFrame.show(); } }); ssc.start(); ssc.awaitTermination(); }
public void run() { System.setProperty("spark.hadoop.dfs.replication", "2"); Logger.getLogger("org").setLevel(Level.OFF); Logger.getLogger("akka").setLevel(Level.OFF); SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance"); conf.set("spark.master", PropertiesStack.getProperty("spark.master")); conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory")); conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory")); conf.set( "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize")); // .setAppName("WindowingKafkaWordCountWithoutFaultTolerance"); JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10)); HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic())); HashMap<String, String> kafkaParams = new HashMap<String, String>(); kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers()); kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect()); kafkaParams.put("auto.offset.reset", "smallest"); kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId()); kafkaParams.put("auto.commit.enable", "false"); Map<String, Integer> topicMap = new HashMap<String, Integer>(); topicMap.put(PropertiesStack.getKafkaTopic(), 1); // Map<kafka.common.TopicAndPartition, java.lang.Long> fromOffsets = new HashMap<>(); // fromOffsets.put(new TopicAndPartition(PropertiesStack.getKafkaTopic(), // 1), 1000L); // Create direct kafka stream with brokers and topics // JavaInputDStream<String> messages = KafkaUtils // .createDirectStream( // jssc, // String.class, // String.class, // StringDecoder.class, // StringDecoder.class, // String.class, // kafkaParams, // fromOffsets, // new Function<kafka.message.MessageAndMetadata<String, String>, String>() { // @Override // public String call( // MessageAndMetadata<String, String> v1) // throws Exception { // return v1.message(); // } // }); JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream( jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); messages.count().print(); // .createStream(jssc, PropertiesStack.getZookeeperConnect(), // PropertiesStack.getKafkaGroupId(), topicMap); // Start the computation jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println( "Usage: JavaDirectKafkaWordCount <brokers> <topics>\n" + " <brokers> is a list of one or more Kafka brokers\n" + " <topics> is a list of one or more kafka topics to consume from\n\n"); System.exit(1); } String brokers = args[0]; String topics = args[1]; // Create context with a 2 seconds batch interval SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(","))); Map<String, String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", brokers); // Create direct kafka stream with brokers and topics JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream( jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); // Get the lines, split them into words, count the words and print JavaDStream<String> lines = messages.map( new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> tuple2) { return tuple2._2(); } }); JavaDStream<String> words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable<String> call(String x) { return Arrays.asList(SPACE.split(x)); } }); JavaPairDStream<String, Integer> wordCounts = words .mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }) .reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); // Start the computation jssc.start(); jssc.awaitTermination(); }
public static void main(String[] args) {
    Logger logger = Logger.getRootLogger();
    logger.setLevel(Level.OFF);

    String consumerKey = "JqQ1lAWg90PVD9U8XoDWedCm8";
    String consumerSecret = "QaUe7V9HuYQvC031MVqpUuuP2OjieI0BBDEHLpFOR221zjQ0xp";
    String accessToken = "3299869044-UVd8CwTfnDgcGFGPro2yGXKWhArKtXRxC6iekmH";
    String accessTokenSecret = "3XtGQi1naI1V9wCVs2aQgEeVWr65vXDczOwGvqa3iGlEG";
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey);
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret);
    System.setProperty("twitter4j.oauth.accessToken", accessToken);
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret);

    String[] filters = {"bulling", "bullied", "bullyed", "bully", "teased"};

    SparkConf sparkConf = new SparkConf().setAppName("bullyhunter");
    System.out.println("Started bullyhunter...");

    JavaStreamingContext sc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
    JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(sc, filters);

    // Classify each tweet; keep the text of tweets flagged as bullying, map the rest to null.
    JavaDStream<String> text = stream.map(
        new Function<Status, String>() {
          public String call(Status status) {
            // String msg = status.getText();
            // String filtered_msg = Enrichment.filter(msg);
            // if (filtered_msg == null) {
            //   return null;
            // }
            // TweetRecord tr = new TweetRecord();
            // tr.setMsg(filtered_msg);
            // // tr.setGeo(status.getGeoLocation().getLatitude());
            // String fullName = status.getPlace().getFullName();
            // if (fullName == null)
            //   return null;
            // String[] fields = fullName.spilt(DELIMITER);
            // tr.setCity(fullName.split());
            String msg = status.getText();
            double ind = Classification.classifyTweet(msg);
            if (ind > 0) {
              return status.getText();
            } else {
              return null;
            }
          }
        });

    // text = text.filter(new Function<String, Boolean>() {
    //   public Boolean call(String msg) {
    //     boolean containKeyword = false;
    //     String lowerCase = msg.toLowerCase();
    //     for (String k : keywords)
    //       if (lowerCase.contains(k)) {
    //         containKeyword = true;
    //         break;
    //       }
    //     if (containKeyword == true && lowerCase.contains("bull")
    //         && !lowerCase.contains("RT")) {
    //       return true;
    //     }
    //     return false;
    //   }
    // });

    // Drop the tweets that the classifier rejected (mapped to null above).
    text = text.filter(
        new Function<String, Boolean>() {
          public Boolean call(String msg) {
            return msg != null;
          }
        });

    text.print();

    sc.start();
    sc.awaitTermination();
  }
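// The snippet above calls Classification.classifyTweet, which is not included here. A
// hypothetical stand-in so the example compiles: it returns a score greater than zero when
// the tweet looks bullying-related, and zero otherwise. The real project presumably uses a
// trained classifier; this keyword check is only an illustration.
public final class Classification {
  private static final String[] KEYWORDS = {"bully", "bullied", "teased"};

  private Classification() {}

  public static double classifyTweet(String msg) {
    if (msg == null) {
      return 0.0;
    }
    String lower = msg.toLowerCase();
    for (String k : KEYWORDS) {
      if (lower.contains(k)) {
        return 1.0; // positive score: treated as a bullying-related tweet
      }
    }
    return 0.0;
  }
}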