public static void main(String[] args) {
    // Local 2-core master; one core receives from the socket, one processes.
    SparkConf conf = new SparkConf().setAppName("Example").setMaster("local[2]");
    // Create a StreamingContext with a 10-second batch interval.
    // (The original comment claimed 1 second, contradicting Durations.seconds(10).)
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

    // Create a DStream from all the input on port 7777.
    JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);

    // Split each line up into words.
    final JavaDStream<String> wordDStream =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              public Iterable<String> call(String x) {
                return Arrays.asList(x.split(" "));
              }
            });

    // Pair each word with an initial count of 1.
    final JavaPairDStream<String, Integer> wordPairDStream =
        wordDStream.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    // Sum counts per word over a 30-second window that slides every 20 seconds.
    // Both durations are multiples of the 10-second batch interval, as required.
    final JavaPairDStream<String, Integer> totalWordPairDStream =
        wordPairDStream.reduceByKeyAndWindow(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
              }
            },
            Durations.seconds(30),
            Durations.seconds(20));

    totalWordPairDStream.print();

    // Start our streaming context and wait for it to "finish".
    jssc.start();
    // Wait for the job to finish.
    jssc.awaitTermination();
  }
  /**
   * Consumes the given Kafka topics and archives every batch to HDFS as text files.
   *
   * <p>Args: zkQuorum (ZooKeeper connect string), group (Kafka consumer group),
   * topics (comma-separated list), numThreads (receiver threads per topic).
   */
  public static void main(String[] args) {
    if (args.length < 4) {
      System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>");
      System.exit(1);
    }

    String zkQuorum = args[0];
    String kfGrp = args[1];
    String[] topics = args[2].split(",");
    // parseInt avoids the needless Integer boxing of Integer.valueOf.
    int numThreads = Integer.parseInt(args[3]);

    // Each topic is consumed with the same number of receiver threads.
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    for (String topic : topics) {
      topicMap.put(topic, numThreads);
    }

    SparkConf conf = new SparkConf().setAppName("PDCKafkaConsumer");
    conf.set("spark.ui.port", "4040");
    // 10-second batch interval.
    JavaStreamingContext ctx = new JavaStreamingContext(conf, new Duration(10000));
    JavaPairReceiverInputDStream<String, String> kfStream =
        KafkaUtils.createStream(ctx, zkQuorum, kfGrp, topicMap);
    // Write each batch under /phasor/pmu/pdc with the "in" suffix.
    kfStream.saveAsHadoopFiles(
        "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class);

    ctx.start();
    ctx.awaitTermination();
  }
 @SuppressWarnings("ConstantConditions")
 JavaDStream<WindowedValue<T>> getDStream() {
   if (dStream == null) {
     WindowedValue.ValueOnlyWindowedValueCoder<T> valueCoder =
         WindowedValue.getValueOnlyCoder(coder);
     // Materialize each batch of values as an RDD (round-tripped through the
     // coder's byte representation) and enqueue it.
     Queue<JavaRDD<WindowedValue<T>>> queue = new LinkedBlockingQueue<>();
     JavaRDD<WindowedValue<T>> mostRecent = null;
     for (Iterable<T> batch : values) {
       Iterable<WindowedValue<T>> windowed =
           Iterables.transform(batch, WindowingHelpers.<T>windowValueFunction());
       JavaRDD<WindowedValue<T>> batchRdd =
           jssc.sc()
               .parallelize(CoderHelpers.toByteArrays(windowed, valueCoder))
               .map(CoderHelpers.fromByteFunction(valueCoder));
       queue.offer(batchRdd);
       mostRecent = batchRdd;
     }
     // Emit the queued RDDs one at a time, keeping the last batch as the default
     // in case batches repeat (graceful stops for example). When the stream is
     // empty, skip the default so no empty RDD is created; this path exists
     // mainly for unit tests, so it is not configurable.
     if (mostRecent == null) {
       dStream = jssc.queueStream(queue, true);
     } else {
       dStream = jssc.queueStream(queue, true, mostRecent);
     }
   }
   return dStream;
 }
  @Override
  public void Start() {
    // Register a listener that reports batch/receiver metrics before starting.
    jssc.addStreamingListener(new PerformanceStreamingListener());

    //        jssc.checkpoint("/tmp/log-analyzer-streaming");
    // Checkpoint to HDFS; must be set before start() for stateful operations.
    jssc.checkpoint("hdfs://master:8020/usr/warehouse/wordcount/checkpoint");
    // Start the streaming computation and block until it terminates.
    jssc.start();
    jssc.awaitTermination();
  }
  /**
   * Word count over events pulled from a Flume sink (Spark Streaming polls Flume,
   * rather than Flume pushing into a receiver).
   */
  public static void main(String[] args) {
    SparkConf conf =
        new SparkConf()
            .setMaster("local[4]")
            .setAppName("SparkStreamingPullDataFromFlume for Java");
    JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(30));

    // Pull data from the Flume sink listening on master1:9999.
    JavaReceiverInputDStream<SparkFlumeEvent> lines =
        FlumeUtils.createPollingStream(jsc, "master1", 9999);

    // Decode each event body and split it into words.
    JavaDStream<String> words =
        lines.flatMap(
            event -> Arrays.asList(new String(event.event().getBody().array()).split(" ")));

    // Pair each word with an initial count of 1.
    JavaPairDStream<String, Integer> pairs =
        words.mapToPair(word -> new Tuple2<String, Integer>(word, 1));

    // Accumulate counts for identical keys (combining locally before the reducer).
    JavaPairDStream<String, Integer> wordsCount = pairs.reduceByKey((v1, v2) -> v1 + v2);

    wordsCount.print();

    jsc.start();

    jsc.awaitTermination();
    jsc.close();
  }
 /**
  * Watches a local directory for new text files and feeds them through the
  * video-stream processor, printing the first few lines of each batch.
  */
 public static void main(String[] args) throws IOException {
   SparkConf conf = new SparkConf().setAppName("faebookStream");
   // 15-second batch interval.
   JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(15));
   JavaDStream<String> stream =
       jssc.textFileStream("/Users/sboynenpalli/Desktop/shashankOfficeMackbook/sparkinputfolder");
   // Print up to 10 elements of each batch for inspection.
   stream.print(10);
   ProcessVideoStreamData processData = new ProcessVideoStreamData();
   processData.processData(stream);
   jssc.start();
   jssc.awaitTermination();
 }
  /**
   * Reads activity events from Kafka, assigns each to a fixed one-minute window
   * (by rounding its timestamp down), and prints summed x-axis values per
   * (activity, window) key.
   */
  public static void main(String[] args) {
    // Create a Spark Context. (A stray empty statement ";" was removed here.)
    SparkConf conf = new SparkConf().setAppName("Activity").set("spark.eventLog.enabled", "true");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, STREAM_INTERVAL);
    String TOPIC = "activityevent";
    String zkQuorum = "localhost:2181";
    String group = "1";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put(TOPIC, 1);

    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
    // messages.print();
    // Keep only the Kafka message payload (the value of each key/value pair).
    JavaDStream<String> activitydatastream =
        messages.map(
            new Function<Tuple2<String, String>, String>() {
              @Override
              public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
              }
            });

    final Long teamWindowDurationMs = Durations.minutes(1).milliseconds();
    // Locals renamed to lowerCamelCase per Java convention.
    JavaDStream<Activity> activityEntryDStream = activitydatastream.map(Activity::parseFromLine);
    JavaPairDStream<WithTimestamp<String>, Double> activityWindowDStream =
        activityEntryDStream.mapToPair(
                windows ->
                    new Tuple2<>(
                        WithTimestamp.create(
                            windows.getActivity(),
                            // Apply Fixed Window by rounding the timestamp down to the nearest
                            // multiple of the window size
                            (convertMillsecs(windows.getTimestamp()) / teamWindowDurationMs)
                                * teamWindowDurationMs),
                        windows.getXaxis()))
            .reduceByKey(SUM_REDUCER);

    activityWindowDStream.print();

    jssc.start();
    jssc.awaitTermination();
    // jssc.close();
    sc.stop();
    sc.close();
  }
  /**
   * Builds a checkpointed streaming context that tails {@code input} for new text
   * files, converts matching log lines to JSON, and indexes them into
   * Elasticsearch ("flume/test").
   *
   * @param input directory watched by textFileStream
   * @param checkpointDirectory directory used for streaming checkpoints
   * @return the configured (not yet started) streaming context
   */
  private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
    System.out.println("Creating new context");
    // final File outputFile = new File("/flume_recover");
    // if (outputFile.exists()) {
    // outputFile.delete();
    // }

    SparkConf conf =
        new SparkConf()
            .setMaster("local[2]")
            .setAppName("Stream File")
            .set("spark.driver.allowMultipleContexts", "true");
    conf.set("spark.serializer", KryoSerializer.class.getName());
    conf.set("es.index.auto.create", "true");
    conf.set("es.nodes", "10.26.1.134:9200");
    conf.set("es.resource", "flume/test");
    conf.set("es.input.json", "true");

    // 3-second batch interval.
    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
    jssc.checkpoint(checkpointDirectory);

    JavaDStream<String> textFile = jssc.textFileStream(input);
    // Lines that match the log pattern become JSON; everything else becomes "".
    JavaDStream<String> jsonStr =
        textFile.map(
            new Function<String, String>() {
              public String call(String arg0) throws Exception {
                Matcher m = log.matcher(arg0);
                if (m.find()) {
                  return transferJson(m);
                }
                return "";
              }
            });
    jsonStr.print();

    jsonStr.foreach(
        new Function<JavaRDD<String>, Void>() {
          public Void call(JavaRDD<String> arg0) throws Exception {
            // BUG FIX: the null check must run BEFORE isEmpty() is invoked;
            // the original evaluated !arg0.isEmpty() first and would NPE on null.
            if (arg0 != null && !arg0.isEmpty()) {
              JavaEsSpark.saveToEs(arg0, "flume/test");
            }
            return null;
          }
        });

    return jssc;
  }
  // Marks the (non-streaming) context as started, stopping and discarding the
  // streaming context first if it was running.
  // NOTE(review): this method reads/writes BOTH `jsscStartFlag` (streaming) and
  // `jscStartFlag` (set on the last line). The near-identical names look like a
  // possible typo pair — confirm against the field declarations that two
  // distinct flags are really intended.
  public static void setJscStartFlag() {

    if (jsscStartFlag) {
      m_jssc.stop();
      jsscStartFlag = false;
      m_jssc = null;
    }
    jscStartFlag = true;
  }
  /**
   * Entry point: restores the streaming context from the checkpoint directory if
   * one exists, otherwise creates it via {@code createContext}.
   *
   * <p>Args: file path to watch for new input files.
   */
  public static void main(String[] args) {
    if (args.length < 1) {
      System.out.println("Format Error : [File Path]");
      // BUG FIX: the original fell through after printing the usage message and
      // then read args[0], throwing ArrayIndexOutOfBoundsException.
      System.exit(1);
    }
    final String filePath = args[0];
    final String checkpointDirectory = "/flume_recover";
    // final String outputPath = "/flume_recover";

    // Factory is only invoked when no checkpoint exists yet.
    JavaStreamingContextFactory factory =
        new JavaStreamingContextFactory() {
          public JavaStreamingContext create() {
            return createContext(filePath, checkpointDirectory);
          }
        };
    JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDirectory, factory);

    jssc.start();
    jssc.awaitTermination();
  }
  // Reads Avro-encoded records from the "mytopic" Kafka topic via the direct
  // (receiverless) API, decodes each with Bijection, and prints three fields.
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("kafka-sandbox").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // 2-second batch interval.
    JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));

    Set<String> topics = Collections.singleton("mytopic");
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", "sandbox.hortonworks.com:6667");

    // Values arrive as raw bytes (Avro binary); keys as strings.
    JavaPairInputDStream<String, byte[]> directKafkaStream =
        KafkaUtils.createDirectStream(
            ssc,
            String.class,
            byte[].class,
            StringDecoder.class,
            DefaultDecoder.class,
            kafkaParams,
            topics);

    directKafkaStream.foreachRDD(
        rdd -> {
          rdd.foreach(
              avroRecord -> {
                // NOTE(review): the schema parser and injection are rebuilt for
                // every record — presumably to keep the lambda serializable for
                // the executors, but hoisting them per-partition would be
                // cheaper. Confirm before changing.
                Schema.Parser parser = new Schema.Parser();
                Schema schema = parser.parse(AvroVulabProducer.USER_SCHEMA);
                Injection<GenericRecord, byte[]> recordInjection =
                    GenericAvroCodecs.toBinary(schema);
                GenericRecord record = recordInjection.invert(avroRecord._2).get();

                System.out.println(
                    "str1= "
                        + record.get("str1")
                        + ", str2= "
                        + record.get("str2")
                        + ", int1="
                        + record.get("int1"));
              });
        });

    ssc.start();
    ssc.awaitTermination();
  }
  // Test teardown: stop the streaming context before tearing down the embedded
  // Kafka utilities, and null both fields so a failed stop is not retried.
  @After
  public void tearDown() {
    if (ssc != null) {
      ssc.stop();
      ssc = null;
    }

    if (kafkaTestUtils != null) {
      kafkaTestUtils.teardown();
      kafkaTestUtils = null;
    }
  }
  /**
   * Counts Flume events received per batch and prints the count.
   *
   * <p>Args: master (Spark master URL), host (Flume receiver bind host), port.
   */
  public static void main(String[] args) {
    if (args.length != 3) {
      System.err.println("Usage: JavaFlumeEventCount <master> <host> <port>");
      System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);

    // 2-second batch interval.
    Duration batchInterval = new Duration(2000);

    JavaStreamingContext ssc =
        new JavaStreamingContext(
            master,
            "FlumeEventCount",
            batchInterval,
            System.getenv("SPARK_HOME"),
            JavaStreamingContext.jarOfClass(JavaFlumeEventCount.class));
    // BUG FIX: the original hardcoded "localhost" here, silently ignoring the
    // <host> argument parsed above. (A discarded, side-effect-free
    // `flumeStream.count();` statement was also removed.)
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, host, port);

    flumeStream
        .count()
        .map(
            new Function<Long, String>() {
              @Override
              public String call(Long in) {
                return "Received " + in + " flume events.";
              }
            })
        .print();

    ssc.start();
  }
Example #14
0
  /**
   * Queue-stream demo: repeatedly feeds the same RDD of 0..999 through a
   * QueueInputDStream and prints counts of each value modulo 10 per batch.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: JavaQueueStream <master>");
      System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1-second batch interval.
    JavaStreamingContext ssc =
        new JavaStreamingContext(
            args[0],
            "QueueStream",
            new Duration(1000),
            System.getenv("SPARK_HOME"),
            JavaStreamingContext.jarOfClass(JavaQueueStream.class));

    // Queue through which RDDs are pushed to the QueueInputDStream.
    Queue<JavaRDD<Integer>> pending = new LinkedList<JavaRDD<Integer>>();

    // Build the integers 0..999 once...
    List<Integer> numbers = Lists.newArrayList();
    for (int n = 0; n < 1000; n++) {
      numbers.add(n);
    }

    // ...and enqueue thirty parallelized copies of them.
    for (int batch = 0; batch < 30; batch++) {
      pending.add(ssc.sparkContext().parallelize(numbers));
    }

    // Count occurrences of each residue class mod 10.
    JavaDStream<Integer> inputStream = ssc.queueStream(pending);
    JavaPairDStream<Integer, Integer> mappedStream =
        inputStream.mapToPair(
            new PairFunction<Integer, Integer, Integer>() {
              @Override
              public Tuple2<Integer, Integer> call(Integer value) {
                return new Tuple2<Integer, Integer>(value % 10, 1);
              }
            });
    JavaPairDStream<Integer, Integer> reducedStream =
        mappedStream.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer left, Integer right) {
                return left + right;
              }
            });

    reducedStream.print();
    ssc.start();
    ssc.awaitTermination();
  }
Example #15
0
  // Voter pipeline: receives vote lines over a custom receiver, enforces a
  // per-phone-number vote limit and a valid contestant range, persists valid
  // votes to Redis, maintains windowed and running leaderboards, and reports
  // throughput per batch to Memcached.
  // Args: batch duration ms, window size, window slide (both presumably ms —
  // they are passed straight into Duration; confirm with callers).
  public static void main(String[] args) {

    String master = System.getenv("MASTER");
    if (master == null) {
      master = "local[2]";
    }

    SparkConf conf = new SparkConf().setAppName("Voter Application").setMaster(master);

    Logger.getLogger("org").setLevel(Level.ERROR);
    Logger.getLogger("akka").setLevel(Level.ERROR);

    // Batch duration is reused below for throughput (records/second) math.
    final Long batch_duration = Long.valueOf(args[0]);
    JavaStreamingContext jssc =
        new JavaStreamingContext(conf, new Duration(Integer.valueOf(args[0])));

    // Checkpointing is required by updateStateByKey below.
    jssc.checkpoint(".");

    JavaReceiverInputDStream<String> votes = jssc.receiverStream(new Voter("localhost", 6789));

    // transform text line stream to PhoneCall stream
    JavaDStream<PhoneCall> phoneCalls =
        votes.map(
            new Function<String, PhoneCall>() {
              public PhoneCall call(String s) {
                return getPhoneCall(s);
              }
            });

    JavaDStream<Long> counts = votes.count();
    counts.print();

    // create updateFunction which is used to update the total call count for each phone number
    Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction =
        new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
          public Optional<Integer> call(List<Integer> values, Optional<Integer> state) {
            // add the new values with the previous running count to get the
            // new count
            Integer sum = 0;
            for (Integer i : values) {
              sum += i;
            }
            Integer newSum = sum + state.or(0);
            return Optional.of(newSum);
          }
        };

    // (phoneNumber, 1) pairs feeding the running per-number vote count.
    JavaPairDStream<Long, Integer> calls =
        phoneCalls.mapToPair(
            new PairFunction<PhoneCall, Long, Integer>() {
              public Tuple2<Long, Integer> call(PhoneCall x) {
                return new Tuple2<Long, Integer>(x.phoneNumber, 1);
              }
            });

    // generate the accumulated count for phone numbers
    final JavaPairDStream<Long, Integer> callNumberCounts = calls.updateStateByKey(updateFunction);
    // callNumberCounts.print();

    // NOTE(review): pairVotes is built but never used below — apparently dead.
    JavaPairDStream<Long, PhoneCall> pairVotes =
        phoneCalls.mapToPair(
            new PairFunction<PhoneCall, Long, PhoneCall>() {
              public Tuple2<Long, PhoneCall> call(PhoneCall call) throws Exception {
                return new Tuple2<Long, PhoneCall>(call.voteId, call);
              }
            });

    // generate the validate phone numbers, which is still allowed to send vote
    JavaPairDStream<Long, Integer> allowedCalls =
        callNumberCounts.filter(
            new Function<Tuple2<Long, Integer>, Boolean>() {

              public Boolean call(Tuple2<Long, Integer> v1) throws Exception {
                if (v1._2() > Voter.MAX_VOTES) return false;

                return true;
              }
            });

    // allowedCalls.print();

    // get validate contestant phone calls
    JavaDStream<PhoneCall> validContestantPhoneCalls =
        phoneCalls.filter(
            new Function<PhoneCall, Boolean>() {
              public Boolean call(PhoneCall call) {
                if (call.contestantNumber > Voter.NUM_CONTESTANTS) return false;
                return true;
              }
            });

    // Re-key valid-contestant calls by phone number so they can join allowedCalls.
    JavaPairDStream<Long, PhoneCall> anotherTemporyPhoneCalls =
        validContestantPhoneCalls.mapToPair(
            new PairFunction<PhoneCall, Long, PhoneCall>() {
              public Tuple2<Long, PhoneCall> call(PhoneCall x) {
                return new Tuple2<Long, PhoneCall>(x.phoneNumber, x);
              }
            });

    // get validate phone call records
    JavaPairDStream<Long, Tuple2<PhoneCall, Integer>> validatePhoneCalls =
        anotherTemporyPhoneCalls.join(allowedCalls);

    // validatePhoneCalls.print();

    // Drop the join bookkeeping, keeping just the PhoneCall objects.
    JavaDStream<PhoneCall> validateCalls =
        validatePhoneCalls.transform(
            new Function<JavaPairRDD<Long, Tuple2<PhoneCall, Integer>>, JavaRDD<PhoneCall>>() {
              public JavaRDD<PhoneCall> call(JavaPairRDD<Long, Tuple2<PhoneCall, Integer>> v1)
                  throws Exception {
                JavaRDD<PhoneCall> item =
                    v1.map(
                        new Function<Tuple2<Long, Tuple2<PhoneCall, Integer>>, PhoneCall>() {
                          public PhoneCall call(Tuple2<Long, Tuple2<PhoneCall, Integer>> validItem)
                              throws Exception {
                            return validItem._2()._1();
                          }
                        });
                return item;
              }
            });

    // validateCalls.print();

    // save all votes with redis
    validateCalls.foreachRDD(
        new Function<JavaRDD<PhoneCall>, Void>() {

          public Void call(JavaRDD<PhoneCall> rdd) throws Exception {

            rdd.foreach(
                new VoidFunction<PhoneCall>() {

                  public void call(PhoneCall call) throws Exception {
                    // System.out.println(call.toString());
                    String key = String.valueOf(call.voteId);
                    String value = call.getContent();

                    // save <key,value> using redis
                    // NOTE(review): a JedisPool is created and destroyed for
                    // EVERY record — very expensive; a per-partition pool
                    // (foreachPartition) would be the usual fix. Left as-is.
                    JedisPool pool = new JedisPool(new JedisPoolConfig(), "localhost");
                    Jedis jedis = pool.getResource();
                    try {
                      jedis.set(key, value);
                    } finally {
                      if (null != jedis) {
                        jedis.close();
                      }
                    }
                    /// ... when closing your application:
                    pool.destroy();
                  }
                });

            return null;
          }
        });

    // validate calls
    JavaPairDStream<Integer, Integer> contestantVotes =
        validateCalls.mapToPair(
            new PairFunction<PhoneCall, Integer, Integer>() {
              public Tuple2<Integer, Integer> call(PhoneCall x) {
                return new Tuple2<Integer, Integer>(x.contestantNumber, 1);
              }
            });

    // use window to get generate leaderboard
    Integer size = Integer.valueOf(args[1]);
    Integer slide = Integer.valueOf(args[2]);

    JavaDStream<PhoneCall> windowCalls =
        validateCalls.window(new Duration(size), new Duration(slide));
    // windowCalls.print();

    // generate window contestant count
    JavaPairDStream<Integer, Integer> windowContestantNums =
        windowCalls.mapToPair(
            new PairFunction<PhoneCall, Integer, Integer>() {
              public Tuple2<Integer, Integer> call(PhoneCall x) {
                return new Tuple2<Integer, Integer>(x.contestantNumber, 1);
              }
            });
    JavaPairDStream<Integer, Integer> windContestantCounts =
        windowContestantNums.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              public Integer call(Integer i1, Integer i2) throws Exception {

                return i1 + i2;
              }
            });
    windContestantCounts.print();

    // generate the accumulated count for contestants
    JavaPairDStream<Integer, Integer> totalContestantCounts =
        contestantVotes.updateStateByKey(updateFunction);

    // used for sorting
    PairFunction<Tuple2<Integer, Integer>, Integer, Integer> swapFunction =
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
          public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> in) {
            return in.swap();
          }
        };

    // Swap to (count, contestant) so sortByKey orders by count descending.
    JavaPairDStream<Integer, Integer> swappedTotalContestantCounts =
        totalContestantCounts.mapToPair(swapFunction);

    JavaPairDStream<Integer, Integer> sortedTotalContestantCounts =
        swappedTotalContestantCounts.transformToPair(
            new Function<JavaPairRDD<Integer, Integer>, JavaPairRDD<Integer, Integer>>() {

              public JavaPairRDD<Integer, Integer> call(JavaPairRDD<Integer, Integer> in)
                  throws Exception {
                return in.sortByKey(false);
              }
            });

    sortedTotalContestantCounts.print();

    // make some statistics
    phoneCalls.foreachRDD(
        new Function<JavaRDD<PhoneCall>, Void>() {

          public Void call(JavaRDD<PhoneCall> rdd) throws Exception {
            Long count = rdd.count();
            // System.out.println( "count : " + count );
            Double throughput = (count.doubleValue() * 1000 / batch_duration.doubleValue());
            System.out.println("Current rate = " + throughput + " records / second");

            // Push the throughput sample to a local Memcached instance,
            // keyed by the current wall-clock timestamp.
            XMemcachedClientBuilder builder =
                new XMemcachedClientBuilder(AddrUtil.getAddresses("localhost:11211"));
            XMemcachedClient client = (XMemcachedClient) builder.build();
            client.setPrimitiveAsString(true);

            Long currentTimeStamp = System.currentTimeMillis();
            // System.out.println("End time: " + currentTimeStamp);
            client.add(currentTimeStamp.toString(), 0, throughput);

            return null;
          }
        });

    jssc.start(); // Start the computation
    jssc.awaitTermination(); // Wait for the computation to terminate
  }
  /**
   * Kafka word count: consumes the given topics and prints per-batch word counts.
   *
   * <p>Args: zkQuorum, group, topics (comma-separated), numThreads.
   */
  public static void main(String[] args) {
    if (args.length < 4) {
      System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
      System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Local context with a 2-second batch interval.
    JavaStreamingContext jssc =
        new JavaStreamingContext("local[4]", "JavaKafkaWordCount", new Duration(2000));

    int numThreads = Integer.parseInt(args[3]);
    Logger.getLogger("org").setLevel(Level.OFF);
    Logger.getLogger("akka").setLevel(Level.OFF);

    // Consume every requested topic with the same number of receiver threads.
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    for (String topic : args[2].split(",")) {
      topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

    System.out.println("Connection !!!!");

    // Keep only the message payload of each (key, value) pair.
    JavaDStream<String> lines =
        messages.map(
            new Function<Tuple2<String, String>, String>() {
              @Override
              public String call(Tuple2<String, String> kv) {
                return kv._2();
              }
            });

    // Tokenize on the shared SPACE pattern.
    JavaDStream<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String line) {
                return Lists.newArrayList(SPACE.split(line));
              }
            });

    // (word, 1) pairs reduced to per-batch counts.
    JavaPairDStream<String, Integer> wordCounts =
        words
            .mapToPair(
                new PairFunction<String, String, Integer>() {
                  @Override
                  public Tuple2<String, Integer> call(String word) {
                    return new Tuple2<String, Integer>(word, 1);
                  }
                })
            .reduceByKey(
                new Function2<Integer, Integer, Integer>() {
                  @Override
                  public Integer call(Integer left, Integer right) {
                    return left + right;
                  }
                });

    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
  }
  /**
   * Network word count via Spark SQL: words received on a socket are registered
   * as a temporary view each batch and counted with a SQL query.
   *
   * <p>Args: hostname, port.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
      System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    // Receive \n-delimited text (e.g. from 'nc') on the target ip:port.
    // Serialized storage without replication is fine for local runs only;
    // distributed deployments need replication for fault tolerance.
    JavaReceiverInputDStream<String> lines =
        ssc.socketTextStream(args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);

    // Split each line on the shared SPACE pattern.
    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());

    // Convert each words RDD to a DataFrame and run the count query over it.
    words.foreachRDD(
        (rdd, time) -> {
          SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

          // Wrap every word in a JavaRecord bean so a schema can be inferred.
          JavaRDD<JavaRecord> rowRDD =
              rdd.map(
                  word -> {
                    JavaRecord record = new JavaRecord();
                    record.setWord(word);
                    return record;
                  });
          Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

          // Register as table
          wordsDataFrame.createOrReplaceTempView("words");

          // Do word count on table using SQL and print it
          Dataset<Row> wordCountsDataFrame =
              spark.sql("select word, count(*) as total from words group by word");
          System.out.println("========= " + time + "=========");
          wordCountsDataFrame.show();
        });

    ssc.start();
    ssc.awaitTermination();
  }
  /**
   * Windowed word count over a local socket: lowercased tokens matched by the
   * shared SPACE pattern, counted over a 30-second window sliding every 10s.
   */
  public static void main(String[] args) {

    // Create the context with a 10 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("Assignment");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(10000));

    // Create a JavaReceiverInputDStream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    // Note that no duplication in storage level only for running locally.
    // Replication necessary in distributed scenario for fault tolerance.
    JavaReceiverInputDStream<String> lines =
        ssc.socketTextStream(
            "localhost", Integer.parseInt("9999"), StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {

              @Override
              public Iterable<String> call(String x) {

                // Collect every lowercased match of the SPACE pattern.
                List<String> allMatches = new ArrayList<String>();

                Matcher matcher = SPACE.matcher(x);

                while (matcher.find()) {
                  allMatches.add(matcher.group().toLowerCase());
                }

                // FIX: return the list directly — the original round-tripped it
                // through toArray(new String[0]) and Lists.newArrayList, copying
                // the same List<String> twice for no effect.
                return allMatches;
              }
            });

    // Pair each word with an initial count of 1.
    JavaPairDStream<String, Integer> wordCounts =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {

              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    // Reduce function adding two integers, defined separately for clarity
    Function2<Integer, Integer, Integer> reduceFunc =
        new Function2<Integer, Integer, Integer>() {
          @Override
          public Integer call(Integer i1, Integer i2) throws Exception {
            return i1 + i2;
          }
        };

    // 30-second window, sliding every 10 seconds (multiples of the batch size).
    JavaPairDStream<String, Integer> windowedWordCounts =
        wordCounts.reduceByKeyAndWindow(reduceFunc, new Duration(30000), new Duration(10000));

    windowedWordCounts.print();

    ssc.start();

    ssc.awaitTermination();
  }
  public void run() {

    // Counts Kafka records in 10-second batches; all connection and sizing
    // parameters come from the PropertiesStack configuration facade.

    // Request HDFS block replication of 2 for any files this job writes.
    System.setProperty("spark.hadoop.dfs.replication", "2");

    // Silence Spark/Akka logging so the streaming output is readable.
    Logger.getLogger("org").setLevel(Level.OFF);
    Logger.getLogger("akka").setLevel(Level.OFF);

    SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance");
    conf.set("spark.master", PropertiesStack.getProperty("spark.master"));
    conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory"));
    conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory"));
    conf.set(
        "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize"));
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

    // Single configured topic. (The unused topicMap for the old receiver-based
    // createStream API and its commented-out code were removed.)
    Set<String> topicsSet = new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic()));

    // Offsets start from the earliest available ("smallest") and are not
    // auto-committed, so a restarted job re-reads the topic from the beginning.
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers());
    kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect());
    kafkaParams.put("auto.offset.reset", "smallest");
    kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId());
    kafkaParams.put("auto.commit.enable", "false");

    // Create direct (receiver-less) kafka stream with brokers and topics.
    JavaPairInputDStream<String, String> messages =
        KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet);

    // Print the number of records received in each batch.
    messages.count().print();

    // Start the computation and block until the context is stopped.
    jssc.start();
    jssc.awaitTermination();
  }
  public static void main(String[] args) throws Exception {
    // Direct-Kafka word count: read <topics> from <brokers>, split messages on
    // SPACE, and print per-batch word counts every 2 seconds.
    if (args.length < 2) {
      System.err.println(
          "Usage: JavaDirectKafkaWordCount <brokers> <topics>\n"
              + "  <brokers> is a list of one or more Kafka brokers\n"
              + "  <topics> is a list of one or more kafka topics to consume from\n\n");
      System.exit(1);
    }

    String brokers = args[0];
    String topics = args[1];

    // Create context with a 2 seconds batch interval
    SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages =
        KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet);

    // Get the lines, split them into words, count the words and print.
    // (Lambdas replace the anonymous function classes; the pipeline is unchanged.)
    JavaDStream<String> lines = messages.map(kv -> kv._2());

    JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(SPACE.split(line)));

    JavaPairDStream<String, Integer> wordCounts =
        words
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((count1, count2) -> count1 + count2);
    wordCounts.print();

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
  }
  public static void main(String[] args) {
    // Streams tweets into Infinispan, then reads the cache back as a DStream and
    // prints a per-country tweet count over the last 60 seconds.
    if (args.length < 5) {
      System.out.println(
          "Usage: StreamProducerJava <infinispan_host> <twitter4j.oauth.consumerKey> <twitter4j.oauth.consumerSecret> <twitter4j.oauth.accessToken> <twitter4j.oauth.accessTokenSecret>");
      System.exit(1);
    }

    // Twitter4J reads its OAuth credentials from system properties.
    String infinispanHost = args[0];
    System.setProperty("twitter4j.oauth.consumerKey", args[1]);
    System.setProperty("twitter4j.oauth.consumerSecret", args[2]);
    System.setProperty("twitter4j.oauth.accessToken", args[3]);
    System.setProperty("twitter4j.oauth.accessTokenSecret", args[4]);

    // Reduce the log level in the driver
    Logger.getLogger("org").setLevel(Level.WARN);

    SparkConf conf = new SparkConf().setAppName("spark-infinispan-stream-producer-java");

    // Create the streaming context with 1-second batches
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Seconds.apply(1));

    // Populate infinispan properties
    Properties infinispanProperties = new Properties();
    infinispanProperties.put("infinispan.client.hotrod.server_list", infinispanHost);

    JavaReceiverInputDStream<Status> twitterDStream =
        TwitterUtils.createStream(javaStreamingContext);

    // Transform from twitter4j.Status to our domain model org.infinispan.spark.demo.twitter.Tweet
    JavaDStream<Tuple2<Long, Tweet>> kvPair =
        twitterDStream.map(
            status ->
                new Tuple2<>(
                    status.getId(),
                    new Tweet(
                        status.getId(),
                        status.getUser().getScreenName(),
                        // orElse is the right idiom for a constant default;
                        // orElseGet's supplier indirection buys nothing here.
                        Optional.ofNullable(status.getPlace())
                            .map(Place::getCountry)
                            .orElse("N/A"),
                        status.getRetweetCount(),
                        status.getText())));

    // Write the stream to infinispan
    InfinispanJavaDStream.writeToInfinispan(kvPair, infinispanProperties);

    // Create InfinispanInputDStream fed by cache events
    JavaInputDStream<Tuple3<Long, Tweet, ClientEvent.Type>> infinispanInputDStream =
        InfinispanJavaDStream.createInfinispanInputDStream(
            javaStreamingContext, MEMORY_ONLY(), infinispanProperties);

    // Apply a transformation to the RDDs to aggregate by country,
    // skipping tweets with no place information ("N/A").
    JavaPairDStream<String, Integer> countryDStream =
        infinispanInputDStream.transformToPair(
            rdd -> {
              return rdd.filter(ev -> !ev._2().getCountry().equals("N/A"))
                  .mapToPair(event -> new Tuple2<>(event._2().getCountry(), 1))
                  .reduceByKey((a, b) -> a + b);
            });

    // Since we are interested in the last 60 seconds only, we restrict the DStream by window,
    // collapsing all the RDDs:
    JavaPairDStream<String, Integer> lastMinuteStream =
        countryDStream.reduceByKeyAndWindow((a, b) -> a + b, new Duration(60 * 1000));

    // Print the window's counts sorted by descending count.
    lastMinuteStream.foreachRDD(
        (rdd, time) -> {
          System.out.format("---------- %s ----------\n", time.toString());
          List<Tuple2<String, Integer>> results = rdd.collect();
          results
              .stream()
              .sorted((o1, o2) -> o2._2().compareTo(o1._2()))
              .forEach(t -> System.out.format("[%s,%d]\n", t._1(), t._2()));
          // foreachRDD here takes a Function2<..., Void>, hence the explicit null.
          return null;
        });

    // Start the processing
    javaStreamingContext.start();

    javaStreamingContext.awaitTermination();
  }
  // Example #22
  public static void main(String[] args) {
    // Streams tweets matching bullying-related keywords, classifies each one,
    // and prints the text of tweets the classifier scores as positive.
    Logger logger = Logger.getRootLogger();
    logger.setLevel(Level.OFF);

    // SECURITY NOTE(review): live OAuth credentials are hard-coded in source.
    // They should be revoked/rotated and supplied via args, environment
    // variables, or a config file instead of being committed.
    String consumerKey = "JqQ1lAWg90PVD9U8XoDWedCm8";
    String consumerSecret = "QaUe7V9HuYQvC031MVqpUuuP2OjieI0BBDEHLpFOR221zjQ0xp";
    String accessToken = "3299869044-UVd8CwTfnDgcGFGPro2yGXKWhArKtXRxC6iekmH";
    String accessTokenSecret = "3XtGQi1naI1V9wCVs2aQgEeVWr65vXDczOwGvqa3iGlEG";

    // Twitter4J reads its OAuth credentials from system properties.
    System.setProperty("twitter4j.oauth.consumerKey", consumerKey);
    System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret);
    System.setProperty("twitter4j.oauth.accessToken", accessToken);
    System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret);

    // Keyword filters for the Twitter stream.
    // NOTE(review): "bulling" appears twice; likely one was meant to be another variant.
    String[] filters = {"bulling", "bullied", "bulling", "bullyed", "bully", "teased"};

    SparkConf sparkConf = new SparkConf().setAppName("bullyhunter");
    System.out.println("Started bullyhunter...");
    // 2-second micro-batches over the keyword-filtered Twitter stream.
    JavaStreamingContext sc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
    JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(sc, filters);

    // Classify each status; keep the text of tweets the classifier scores > 0,
    // emitting null placeholders that are removed by the filter stage below.
    JavaDStream<String> text =
        stream.map(
            new Function<Status, String>() {
              public String call(Status status) {
                String msg = status.getText();
                double ind = Classification.classifyTweet(msg);
                if (ind > 0) {
                  return status.getText();
                } else {
                  return null;
                }
              }
            });

    // Drop the null placeholders produced by the classification stage.
    text =
        text.filter(
            new Function<String, Boolean>() {
              public Boolean call(String msg) {
                // Direct boolean expression instead of (msg == null) ? false : true.
                return msg != null;
              }
            });

    text.print();
    sc.start();
    sc.awaitTermination();
  }
  @Test
  public void testKafkaStream() throws InterruptedException {
    // Integration test: consumes two topics through both createDirectStream
    // variants, unions the streams, and verifies every sent message arrives.
    final String topic1 = "topic1";
    final String topic2 = "topic2";
    // hold a reference to the current offset ranges, so it can be used downstream
    final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();

    // Create both topics and capture the messages sent to each.
    String[] topic1data = createTopicAndSendData(topic1);
    String[] topic2data = createTopicAndSendData(topic2);

    // Expected set: the union of everything sent to both topics.
    Set<String> sent = new HashSet<>();
    sent.addAll(Arrays.asList(topic1data));
    sent.addAll(Arrays.asList(topic2data));

    // "smallest" makes the direct stream start from the earliest offsets,
    // so the messages sent above are guaranteed to be read.
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress());
    kafkaParams.put("auto.offset.reset", "smallest");

    // Variant 1: topic-set overload returning (key, value) pairs.
    JavaDStream<String> stream1 =
        KafkaUtils.createDirectStream(
                ssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topicToSet(topic1))
            .transformToPair(
                // Make sure you can get offset ranges from the rdd
                new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() {
                  @Override
                  public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) {
                    // Each batch RDD from the direct stream implements HasOffsetRanges;
                    // stash the ranges so the foreachRDD below can print them.
                    OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                    offsetRanges.set(offsets);
                    Assert.assertEquals(topic1, offsets[0].topic());
                    return rdd;
                  }
                })
            .map(
                new Function<Tuple2<String, String>, String>() {
                  @Override
                  public String call(Tuple2<String, String> kv) {
                    return kv._2();
                  }
                });

    // Variant 2: explicit from-offsets overload (starting at offset 0) with a
    // message handler that extracts just the message payload.
    JavaDStream<String> stream2 =
        KafkaUtils.createDirectStream(
            ssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            String.class,
            kafkaParams,
            topicOffsetToMap(topic2, 0L),
            new Function<MessageAndMetadata<String, String>, String>() {
              @Override
              public String call(MessageAndMetadata<String, String> msgAndMd) {
                return msgAndMd.message();
              }
            });
    JavaDStream<String> unifiedStream = stream1.union(stream2);

    // Synchronized set: foreachRDD runs on the driver but across batches.
    final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
    unifiedStream.foreachRDD(
        new VoidFunction<JavaRDD<String>>() {
          @Override
          public void call(JavaRDD<String> rdd) {
            result.addAll(rdd.collect());
            // Print the offset ranges captured by the transformToPair step above.
            for (OffsetRange o : offsetRanges.get()) {
              System.out.println(
                  o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset());
            }
          }
        });
    ssc.start();
    // Poll until all sent messages have been received, or give up after 20s.
    long startTime = System.currentTimeMillis();
    boolean matches = false;
    while (!matches && System.currentTimeMillis() - startTime < 20000) {
      matches = sent.size() == result.size();
      Thread.sleep(50);
    }
    Assert.assertEquals(sent, result);
    // NOTE(review): stop() is not in a finally block, so a failed assertion
    // above leaves the streaming context running.
    ssc.stop();
  }