/**
   * Entry point: consumes the given Kafka topics through a receiver-based stream and writes every
   * 10-second batch to Hadoop text files under {@code /phasor/pmu/pdc}.
   *
   * @param args {@code <zkQuorum> <group> <topics> <numThreads>}; topics are comma-separated
   */
public static void main(String[] args) {
    final int requiredArgCount = 4;
    if (args.length < requiredArgCount) {
      System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>");
      System.exit(1);
    }

    final String zookeeperQuorum = args[0];
    final String consumerGroup = args[1];
    final String[] topicNames = args[2].split(",");
    final int threadsPerTopic = Integer.parseInt(args[3]);

    // Every requested topic is registered with the same thread count.
    final Map<String, Integer> threadsByTopic = new HashMap<String, Integer>();
    for (int i = 0; i < topicNames.length; i++) {
      threadsByTopic.put(topicNames[i], threadsPerTopic);
    }

    final SparkConf sparkConf =
        new SparkConf().setAppName("PDCKafkaConsumer").set("spark.ui.port", "4040");
    final Duration batchInterval = new Duration(10000);
    final JavaStreamingContext streamingContext =
        new JavaStreamingContext(sparkConf, batchInterval);

    final JavaPairReceiverInputDStream<String, String> kafkaStream =
        KafkaUtils.createStream(streamingContext, zookeeperQuorum, consumerGroup, threadsByTopic);
    kafkaStream.saveAsHadoopFiles(
        "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class);

    // Blocks until the streaming job is terminated.
    streamingContext.start();
    streamingContext.awaitTermination();
  }
  /**
   * Builds a local streaming context (3-second batches) that watches {@code input} for new text
   * files, converts each line to JSON and indexes every non-empty batch into the Elasticsearch
   * resource {@code flume/test}.
   *
   * <p>Lines that do not match the {@code log} pattern are mapped to the empty string rather than
   * being dropped.
   *
   * @param input directory monitored by {@code textFileStream}
   * @param checkpointDirectory directory used for streaming checkpoints
   * @return the configured context; the caller is responsible for starting it
   */
  private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
    System.out.println("Creating new context");
    // final File outputFile = new File("/flume_recover");
    // if (outputFile.exists()) {
    // outputFile.delete();
    // }

    SparkConf conf =
        new SparkConf()
            .setMaster("local[2]")
            .setAppName("Stream File")
            .set("spark.driver.allowMultipleContexts", "true");
    conf.set("spark.serializer", KryoSerializer.class.getName());
    conf.set("es.index.auto.create", "true");
    conf.set("es.nodes", "10.26.1.134:9200");
    conf.set("es.resource", "flume/test");
    conf.set("es.input.json", "true");

    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
    jssc.checkpoint(checkpointDirectory);

    JavaDStream<String> textFile = jssc.textFileStream(input);
    JavaDStream<String> jsonStr =
        textFile.map(
            new Function<String, String>() {
              public String call(String arg0) throws Exception {
                Matcher m = log.matcher(arg0);
                if (m.find()) {
                  return transferJson(m);
                }
                return "";
              }
            });
    jsonStr.print();

    jsonStr.foreach(
        new Function<JavaRDD<String>, Void>() {
          public Void call(JavaRDD<String> arg0) throws Exception {
            // The null check must come first; the previous order dereferenced arg0 before
            // testing it, so a null batch would have thrown instead of being skipped.
            if (arg0 != null && !arg0.isEmpty()) {
              JavaEsSpark.saveToEs(arg0, "flume/test");
            }
            return null;
          }
        });

    return jssc;
  }
 /**
  * Copies every key of {@code configuration} into a fresh {@code SparkConf}, forces the
  * application name, and stores the resulting (existing or newly created) context in
  * {@code CONTEXT}.
  *
  * @param configuration source of the Spark properties; each value is converted with
  *     {@code toString()}
  */
 public static void create(final Configuration configuration) {
   final SparkConf sparkConf = new SparkConf();
   for (final java.util.Iterator<String> keys = configuration.getKeys(); keys.hasNext(); ) {
     final String key = keys.next();
     final String value = configuration.getProperty(key).toString();
     sparkConf.set(key, value);
   }
   // Applied after the copy so a configured app name never overrides this one.
   sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
   CONTEXT = SparkContext.getOrCreate(sparkConf);
 }
 /**
  * Creates the operator factory: loads {@code spark-cluster.properties} from the classpath and
  * builds the streaming context from the master and batch duration reported by
  * {@code getMaster()} and {@code getDurationsMilliseconds()}.
  *
  * @param appName Spark application name, also passed to the superclass
  * @throws IOException if {@code spark-cluster.properties} is missing from the classpath or
  *     cannot be read
  */
 public SparkOperatorCreater(String appName) throws IOException {
   super(appName);
   properties = new Properties();
   // try-with-resources closes the classpath stream, which was previously leaked.
   try (java.io.InputStream in =
       this.getClass().getClassLoader().getResourceAsStream("spark-cluster.properties")) {
     if (in == null) {
       // getResourceAsStream returns null for a missing resource; fail with a clear message
       // instead of an anonymous NullPointerException from Properties.load.
       throw new IOException("Resource spark-cluster.properties not found on the classpath");
     }
     properties.load(in);
   }
   SparkConf conf = new SparkConf().setMaster(this.getMaster()).setAppName(appName);
   conf.set("spark.streaming.ui.retainedBatches", "2000");
   jssc = new JavaStreamingContext(conf, Durations.milliseconds(this.getDurationsMilliseconds()));
 }
 /**
  * Returns the Spark context to run with: either the caller-provided one (which must exist and
  * still be running) or a newly created context configured from {@code contextOptions}.
  *
  * @param contextOptions options selecting and configuring the context
  * @return a usable {@code JavaSparkContext}
  * @throws RuntimeException if a provided context was requested but is null or stopped
  */
 private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
   if (!contextOptions.getUsesProvidedSparkContext()) {
     LOG.info("Creating a brand new Spark Context.");
     SparkConf sparkConf = new SparkConf();
     // Only supply a master when none has been configured already.
     boolean masterAlreadySet = sparkConf.contains("spark.master");
     if (!masterAlreadySet) {
       sparkConf.setMaster(contextOptions.getSparkMaster());
     }
     sparkConf.setAppName(contextOptions.getAppName());
     // The SDK uses immutable collections, so their serializers are registered with Kryo.
     sparkConf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
     sparkConf.set("spark.serializer", KryoSerializer.class.getName());
     return new JavaSparkContext(sparkConf);
   }

   LOG.info("Using a provided Spark Context");
   JavaSparkContext provided = contextOptions.getProvidedSparkContext();
   boolean usable = provided != null && !provided.sc().isStopped();
   if (usable) {
     return provided;
   }
   LOG.error("The provided Spark context " + provided + " was not created or was stopped");
   throw new RuntimeException("The provided Spark context was not created or was stopped");
 }
// Example #6
// 0
  /**
   * Loads a 10% sample of the {@code ml20news} Solr collection, then scores each sampled document
   * one at a time with the best model of the saved {@code ml-pipeline-model} cross-validator,
   * printing timing information along the way.
   *
   * @param conf Spark configuration used to create the context owned by this method
   * @param cli parsed command line (not read by this method)
   * @return always 0
   * @throws Exception if loading the data or the model, or scoring, fails
   */
  @Override
  public int run(SparkConf conf, CommandLine cli) throws Exception {

    long startMs = System.currentTimeMillis();

    conf.set("spark.ui.enabled", "false");

    JavaSparkContext jsc = new JavaSparkContext(conf);
    try {
      SQLContext sqlContext = new SQLContext(jsc);

      long diffMs = (System.currentTimeMillis() - startMs);
      System.out.println(">> took " + diffMs + " ms to create SQLContext");

      Map<String, String> options = new HashMap<>();
      options.put("zkhost", "localhost:9983");
      options.put("collection", "ml20news");
      options.put("query", "content_txt:[* TO *]");
      options.put("fields", "content_txt");

      DataFrame solrData = sqlContext.read().format("solr").options(options).load();
      DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt");
      List<Row> rows = sample.collectAsList();
      System.out.println(">> loaded " + rows.size() + " docs to classify");

      StructType schema = sample.schema();

      CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model");
      PipelineModel bestModel = (PipelineModel) cvModel.bestModel();

      startMs = System.currentTimeMillis();
      for (Row next : rows) {
        // Each document is wrapped in its own single-row DataFrame and scored individually.
        Row oneRow = RowFactory.create(next.getString(0));
        DataFrame oneRowDF =
            sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema);
        DataFrame scored = bestModel.transform(oneRowDF);
        Row scoredRow = scored.collect()[0];
        // an actual app would save the predictedLabel; here it is only extracted
        String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel"));
      }
      diffMs = (System.currentTimeMillis() - startMs);
      System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs");
    } finally {
      // The context is created by this method, so it is released here even when scoring fails.
      jsc.stop();
    }

    return 0;
  }
  /**
   * Demonstrates {@code parallelize}: builds an RDD from an in-memory list of characters and
   * prints the size and content of both the list and the RDD.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    SparkConf conf =
        new SparkConf()
            .setMaster("local[1]")
            .setAppName(RDDParallelizeSample.class.getSimpleName());
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
      // create a List of Characters
      List<Character> characterList =
          new ArrayList<Character>(
              Arrays.asList(
                  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
                  'q'));

      // create an RDD from an internal List using parallelize method
      JavaRDD<Character> characterRDD = sc.parallelize(characterList);

      System.out.println("list size : " + characterList.size());
      System.out.println("rdd size : " + characterRDD.count());

      System.out.println("list content : " + characterList);
      System.out.println("rdd content : " + characterRDD.collect());
    } finally {
      // Release the context even if one of the actions above fails.
      sc.stop();
    }
  }
  /**
   * Reads the crimes CSV from HDFS, parses every line into a {@code Crime} bean, converts the
   * result to a DataFrame by reflection and indexes it into
   * {@code esh_sparksql/crimes_reflection}.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]");
    conf.set("es.index.auto.create", "true");
    JavaSparkContext context = new JavaSparkContext(conf);
    try {
      JavaRDD<String> textFile =
          context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv");

      JavaRDD<Crime> dataSplits =
          textFile.map(
              line -> {
                // One parser is opened per line; close it so it is not leaked for every record.
                try (CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180)) {
                  Crime c = new Crime();
                  CSVRecord record = parser.getRecords().get(0);
                  c.setId(record.get(0));
                  c.setCaseNumber(record.get(1));
                  c.setEventDate(record.get(2));
                  c.setBlock(record.get(3));
                  c.setIucr(record.get(4));
                  c.setPrimaryType(record.get(5));
                  c.setDescription(record.get(6));
                  c.setLocation(record.get(7));
                  c.setArrest(Boolean.parseBoolean(record.get(8)));
                  c.setDomestic(Boolean.parseBoolean(record.get(9)));
                  String lat = record.get(10);
                  String lon = record.get(11);
                  // Empty coordinates are stored as null rather than failing the parse.
                  Map<String, Double> geoLocation = new HashMap<>();
                  geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat));
                  geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon));
                  c.setGeoLocation(geoLocation);
                  return c;
                }
              });

      SQLContext sqlContext = new SQLContext(context);
      DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class);

      JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection");
    } finally {
      // Release the context whether or not the export succeeded.
      context.stop();
    }
  }
  /**
   * Main method for performing the random partition based model ensembler evaluation.
   *
   * <p>Trains a 32-model ensembler on a 10% sample of the training file, predicts a 10% sample
   * of the test file, and prints the training/prediction timings, the accuracy and the area
   * under the ROC curve.
   *
   * @param args unused
   */
  public static void main(String[] args) {

    // Construction of Spark Configuration
    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[4]");
    sparkConf.setAppName("JavaLR");
    sparkConf.set("spark.executor.memory", "4G");

    // Creates the spark context
    sc = new JavaSparkContext(sparkConf);
    try {
      // Load train and test data
      JavaRDD<String> trainingData =
          readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
              .sample(false, 0.1, 11L);
      JavaRDD<String> testdata =
          readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
              .sample(false, 0.1, 11L);

      JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
      JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());

      System.out.println("Total number of records -> " + points.count());

      RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
      ensembler.setNoofModels(32);
      ensembler.setThreshold(0.499999);

      // Perform the training
      DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
      Date trainStartTime = Calendar.getInstance().getTime();
      String trainStart = dateFormat.format(trainStartTime);
      ensembler.train(points);
      Date trainEndTime = Calendar.getInstance().getTime();
      String trainEnd = dateFormat.format(trainEndTime);

      // Training time calculations and console print
      long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
      System.out.println("Training Started at -> " + trainStart);
      System.out.println("Training Ended at -> " + trainEnd);
      System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");

      // Prepare data for testing
      JavaRDD<Double> testingLabels =
          testPoints
              .map(
                  new Function<LabeledPoint, Double>() {

                    private static final long serialVersionUID = -6597374940461185814L;

                    public Double call(LabeledPoint dataPoint) throws Exception {
                      return dataPoint.label();
                    }
                  })
              .cache();
      // collect() replaces the deprecated toArray(); both return the RDD content as a List.
      List<Double> classLabels = testingLabels.collect();

      // Perform the predictions
      Date predictStartTime = Calendar.getInstance().getTime();
      String predictStart = dateFormat.format(predictStartTime);
      List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).collect();
      Date predictEndTime = Calendar.getInstance().getTime();
      String predictEnd = dateFormat.format(predictEndTime);

      // Predict time calculations and console print
      long preditElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
      System.out.println("Prediction Started at -> " + predictStart);
      System.out.println("Prediction Ended at -> " + predictEnd);
      System.out.println("Time Taken to Predit -> " + preditElapsed + " Sec.");

      // Calculate and Display the accuracy
      System.out.println(
          "Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
      BinaryClassificationMetrics binaryClassificationMetrics =
          getBinaryClassificationMatrix(ensembler, testPoints);
      System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
    } finally {
      // Release the context even when training or evaluation fails.
      sc.stop();
    }
  }
  /**
   * Runs the bloom-filter assisted star join.
   *
   * <p>Three filtered dimension inputs (supplier, customer, date) are collected on the driver and
   * their keys are added to bloom filters, which are broadcast and used to pre-filter the line
   * input. The surviving lines are joined with the three dimensions in turn, summed per key,
   * sorted with {@code Q3Comparator} and written to {@code param.output} (which is deleted
   * first). The total elapsed time is printed at the end.
   *
   * @param args unused; all settings come from {@code Parameters}
   * @throws IOException if the output location cannot be accessed or deleted
   */
  public static void main(String[] args) throws IOException {
    Parameters param = new Parameters();
    long initTime = System.currentTimeMillis();

    // Serializer settings have to be applied BEFORE the context is created: the context takes
    // its own copy of the configuration, so setting them afterwards has no effect.
    SparkConf conf = new SparkConf().setAppName("StarJoin");
    if (param.useKryo) {
      conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
      conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
      conf.set("spark.kryoserializer.buffer.mb", param.buffer);
    }

    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
      MyBloomFilter.BloomFilter<String> BFS =
          new MyBloomFilter.BloomFilter<String>(1.0, param.bitsS, param.hashes);
      MyBloomFilter.BloomFilter<String> BFD =
          new MyBloomFilter.BloomFilter<String>(1.0, param.bitsD, param.hashes);
      MyBloomFilter.BloomFilter<String> BFC =
          new MyBloomFilter.BloomFilter<String>(1.0, param.bitsC, param.hashes);

      // Splits a '|'-delimited input line into its columns.
      final Function<String, String[]> splitOnPipe =
          new Function<String, String[]>() {
            public String[] call(String line) {
              return line.split("\\|");
            }
          };

      // Keeps supplier/customer rows whose column 3 is one of the two selected values.
      final Function<String[], Boolean> hasSelectedColumn3 =
          new Function<String[], Boolean>() {
            public Boolean call(String[] s) {
              return s[3].equals("UNITED KI1") || s[3].equals("UNITED KI5");
            }
          };

      // Maps a supplier/customer row to (column 0, column 3).
      final PairFunction<String[], String, String> keyWithColumn3 =
          new PairFunction<String[], String, String>() {
            public Tuple2<String, String> call(String[] s) {
              return new Tuple2<String, String>(s[0], s[3]);
            }
          };

      JavaPairRDD<String, String> supps =
          sc.textFile(param.suppPath)
              .map(splitOnPipe)
              .filter(hasSelectedColumn3)
              .mapToPair(keyWithColumn3);

      for (Tuple2<String, String> supp : supps.collect()) {
        BFS.add(supp._1);
      }
      final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

      JavaPairRDD<String, String> custs =
          sc.textFile(param.custPath)
              .map(splitOnPipe)
              .filter(hasSelectedColumn3)
              .mapToPair(keyWithColumn3);

      for (Tuple2<String, String> cust : custs.collect()) {
        BFC.add(cust._1);
      }
      final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

      JavaPairRDD<String, String> dates =
          sc.textFile(param.datePath)
              .map(splitOnPipe)
              .filter(
                  new Function<String[], Boolean>() {
                    public Boolean call(String[] s) {
                      return s[6].equals("Dec1997");
                    }
                  })
              .mapToPair(
                  new PairFunction<String[], String, String>() {
                    public Tuple2<String, String> call(String[] s) {
                      return new Tuple2<String, String>(s[0], s[4]);
                    }
                  });

      for (Tuple2<String, String> date : dates.collect()) {
        BFD.add(date._1);
      }
      final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

      JavaPairRDD<String, String[]> lines =
          sc.textFile(param.linePath)
              .map(splitOnPipe)
              .filter(
                  new Function<String[], Boolean>() {
                    public Boolean call(String[] s) {
                      // NOTE(review): getBytes() uses the platform default charset; confirm it
                      // matches how MyBloomFilter encodes the String keys added on the driver.
                      return varC.value().contains(s[2].getBytes())
                          && varS.value().contains(s[4].getBytes())
                          && varD.value().contains(s[5].getBytes());
                    }
                  })
              .mapToPair(
                  new PairFunction<String[], String, String[]>() {
                    public Tuple2<String, String[]> call(String[] s) {
                      String[] v = {s[2], s[5], s[12]};
                      return new Tuple2<String, String[]>(s[4], v);
                    }
                  });

      // After a join, promotes the first carried value to be the next join key and appends the
      // joined dimension value to the remaining carried values.
      final PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]> rekey =
          new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
            public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
              String[] v = {s._2._1[1], s._2._1[2], s._2._2};
              return new Tuple2<String, String[]>(s._2._1[0], v);
            }
          };

      JavaPairRDD<String, String[]> result = lines.join(supps).mapToPair(rekey);
      result = result.join(custs).mapToPair(rekey);

      JavaPairRDD<String, Long> final_result =
          result
              .join(dates)
              .mapToPair(
                  new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                    public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                      return new Tuple2<String, Long>(
                          s._2._1[2] + "," + s._2._1[1] + "," + s._2._2,
                          Long.parseLong(s._2._1[0]));
                    }
                  })
              .reduceByKey(
                  new Function2<Long, Long, Long>() {
                    public Long call(Long i1, Long i2) {
                      return i1 + i2;
                    }
                  });

      // The sum is appended to the key so the comparator can sort on the complete record.
      JavaPairRDD<String, String> sub_result =
          final_result.mapToPair(
              new PairFunction<Tuple2<String, Long>, String, String>() {
                public Tuple2<String, String> call(Tuple2<String, Long> line) {
                  return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
                }
              });

      final_result =
          sub_result
              .sortByKey(new Q3Comparator())
              .mapToPair(
                  new PairFunction<Tuple2<String, String>, String, Long>() {
                    public Tuple2<String, Long> call(Tuple2<String, String> line) {
                      String[] s = line._1.split(",");
                      return new Tuple2<String, Long>(
                          s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                    }
                  });

      // Remove any previous output so saveAsTextFile does not fail on an existing path.
      Configuration HDFSconf = new Configuration();
      FileSystem fs = FileSystem.get(HDFSconf);
      fs.delete(new Path(param.output), true);

      final_result.saveAsTextFile(param.output);

      long finalTime = System.currentTimeMillis();
      System.out.print("Tempo total(ms): ");
      System.out.println(finalTime - initTime);
    } finally {
      // Close the context even when a stage fails.
      sc.close();
    }
  }
  /**
   * Starts a streaming job (10-second batches) that opens a direct Kafka stream on the topic
   * configured in {@code PropertiesStack}, prints the number of messages in each batch, and then
   * blocks until the job terminates.
   */
  public void run() {

    System.setProperty("spark.hadoop.dfs.replication", "2");

    // Silence the framework loggers.
    for (String loggerName : new String[] {"org", "akka"}) {
      Logger.getLogger(loggerName).setLevel(Level.OFF);
    }

    // Each of these Spark settings is copied from the property of the same name.
    SparkConf sparkConf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance");
    String[] forwardedKeys = {
      "spark.master", "spark.executor.memory", "spark.driver.memory", "spark.driver.maxResultSize"
    };
    for (String key : forwardedKeys) {
      sparkConf.set(key, PropertiesStack.getProperty(key));
    }
    JavaStreamingContext streamingContext =
        new JavaStreamingContext(sparkConf, Durations.seconds(10));

    HashSet<String> subscribedTopics =
        new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic()));

    HashMap<String, String> consumerSettings = new HashMap<String, String>();
    consumerSettings.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers());
    consumerSettings.put("zookeeper.connect", PropertiesStack.getZookeeperConnect());
    consumerSettings.put("auto.offset.reset", "smallest");
    consumerSettings.put("group.id", PropertiesStack.getKafkaGroupId());
    consumerSettings.put("auto.commit.enable", "false");

    // Not read by the direct stream below; it was the argument of a receiver-based
    // KafkaUtils.createStream(...) alternative that is no longer used.
    Map<String, Integer> threadsByTopic = new HashMap<String, Integer>();
    threadsByTopic.put(PropertiesStack.getKafkaTopic(), 1);

    // Create direct kafka stream with brokers and topics. A createDirectStream variant that
    // started from explicit per-partition offsets was tried here previously and is disabled.
    JavaPairInputDStream<String, String> kafkaMessages =
        KafkaUtils.createDirectStream(
            streamingContext,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            consumerSettings,
            subscribedTopics);
    kafkaMessages.count().print();

    // Start the computation
    streamingContext.start();
    streamingContext.awaitTermination();
  }