public static void main(String[] args) {
    if (args.length < 4) {
      System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>");
      System.exit(1);
    }

    String zkQuorum = args[0];
    String kfGrp = args[1];
    String[] topics = args[2].split(",");
    int numThreads = Integer.valueOf(args[3]);

    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    for (String topic : topics) {
      topicMap.put(topic, numThreads);
    }

    SparkConf conf = new SparkConf().setAppName("PDCKafkaConsumer");
    conf.set("spark.ui.port", "4040");
    JavaStreamingContext ctx = new JavaStreamingContext(conf, new Duration(10000));
    JavaPairReceiverInputDStream<String, String> kfStream =
        KafkaUtils.createStream(ctx, zkQuorum, kfGrp, topicMap);
    kfStream.saveAsHadoopFiles(
        "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class);

    ctx.start();
    ctx.awaitTermination();
  }
 /**
  * Merge zero or more spill files together, choosing the fastest merging strategy based on the
  * number of spills and the IO compression codec.
  *
  * @return the partition lengths in the merged file.
  */
 private long[] mergeSpills(SpillInfo[] spills) throws IOException {
   final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
   final boolean compressionEnabled = sparkConf.getBoolean("spark.shuffle.compress", true);
   final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf);
   final boolean fastMergeEnabled =
       sparkConf.getBoolean("spark.shuffle.unsafe.fastMergeEnabled", true);
   final boolean fastMergeIsSupported =
       !compressionEnabled || compressionCodec instanceof LZFCompressionCodec;
   try {
     if (spills.length == 0) {
       new FileOutputStream(outputFile).close(); // Create an empty file
       return new long[partitioner.numPartitions()];
     } else if (spills.length == 1) {
       // Here, we don't need to perform any metrics updates because the bytes written to this
       // output file would have already been counted as shuffle bytes written.
       Files.move(spills[0].file, outputFile);
       return spills[0].partitionLengths;
     } else {
       final long[] partitionLengths;
       // There are multiple spills to merge, so none of these spill files' lengths were counted
       // towards our shuffle write count or shuffle write time. If we use the slow merge path,
       // then the final output file's size won't necessarily be equal to the sum of the spill
       // files' sizes. To guard against this case, we look at the output file's actual size when
       // computing shuffle bytes written.
       //
       // We allow the individual merge methods to report their own IO times since different merge
        // strategies use different IO techniques. We count IO during merge towards the shuffle
        // write time, which appears to be consistent with the "not bypassing merge-sort"
       // branch in ExternalSorter.
       if (fastMergeEnabled && fastMergeIsSupported) {
         // Compression is disabled or we are using an IO compression codec that supports
         // decompression of concatenated compressed streams, so we can perform a fast spill merge
         // that doesn't need to interpret the spilled bytes.
         if (transferToEnabled) {
           logger.debug("Using transferTo-based fast merge");
           partitionLengths = mergeSpillsWithTransferTo(spills, outputFile);
         } else {
           logger.debug("Using fileStream-based fast merge");
           partitionLengths = mergeSpillsWithFileStream(spills, outputFile, null);
         }
       } else {
         logger.debug("Using slow merge");
         partitionLengths = mergeSpillsWithFileStream(spills, outputFile, compressionCodec);
       }
       // When closing an UnsafeShuffleExternalSorter that has already spilled once but also has
       // in-memory records, we write out the in-memory records to a file but do not count that
       // final write as bytes spilled (instead, it's accounted as shuffle write). The merge needs
       // to be counted as shuffle write, but this will lead to double-counting of the final
       // SpillInfo's bytes.
       writeMetrics.decShuffleBytesWritten(spills[spills.length - 1].file.length());
       writeMetrics.incShuffleBytesWritten(outputFile.length());
       return partitionLengths;
     }
   } catch (IOException e) {
     if (outputFile.exists() && !outputFile.delete()) {
       logger.error("Unable to delete output file {}", outputFile.getPath());
     }
     throw e;
   }
 }
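// The fast-merge branch above is only taken when compression is off or the codec tolerates
// concatenated compressed streams. A minimal standalone sketch of that property (class name and
// payloads are illustrative, not from the original source), assuming Spark's LZFCompressionCodec
// behaves as the comment in mergeSpills describes:
import org.apache.spark.SparkConf;
import org.apache.spark.io.LZFCompressionCodec;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class ConcatenatedLzfSketch {
  public static void main(String[] args) throws IOException {
    LZFCompressionCodec codec = new LZFCompressionCodec(new SparkConf());

    // Compress two "spill files" independently, then concatenate the raw bytes, which is
    // effectively what the transferTo/fileStream fast merge does with spill partitions.
    byte[] spill1 = compress(codec, "partition-0 records\n");
    byte[] spill2 = compress(codec, "partition-1 records\n");
    byte[] merged = new byte[spill1.length + spill2.length];
    System.arraycopy(spill1, 0, merged, 0, spill1.length);
    System.arraycopy(spill2, 0, merged, spill1.length, spill2.length);

    // A codec that supports concatenation decodes the merged bytes as one stream; codecs
    // without this property force the slow decompress-and-recompress merge path.
    try (InputStream in = codec.compressedInputStream(new ByteArrayInputStream(merged))) {
      ByteArrayOutputStream decoded = new ByteArrayOutputStream();
      byte[] buf = new byte[4096];
      int n;
      while ((n = in.read(buf)) != -1) {
        decoded.write(buf, 0, n);
      }
      System.out.println(decoded.toString("UTF-8")); // both payloads, in order
    }
  }

  private static byte[] compress(LZFCompressionCodec codec, String text) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (OutputStream out = codec.compressedOutputStream(bytes)) {
      out.write(text.getBytes("UTF-8"));
    }
    return bytes.toByteArray();
  }
}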
Example #3
 public static void create(final Configuration configuration) {
   final SparkConf sparkConf = new SparkConf();
   configuration
       .getKeys()
       .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
   sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
   CONTEXT = SparkContext.getOrCreate(sparkConf);
 }
 public SparkOperatorCreater(String appName) throws IOException {
   super(appName);
   properties = new Properties();
   properties.load(
       this.getClass().getClassLoader().getResourceAsStream("spark-cluster.properties"));
   SparkConf conf = new SparkConf().setMaster(this.getMaster()).setAppName(appName);
   conf.set("spark.streaming.ui.retainedBatches", "2000");
   jssc = new JavaStreamingContext(conf, Durations.milliseconds(this.getDurationsMilliseconds()));
 }
 public BypassMergeSortShuffleWriter(
     SparkConf conf,
     BlockManager blockManager,
     Partitioner partitioner,
     ShuffleWriteMetrics writeMetrics,
     Serializer serializer) {
   // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
   this.fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
   this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true);
   this.numPartitions = partitioner.numPartitions();
   this.blockManager = blockManager;
   this.partitioner = partitioner;
   this.writeMetrics = writeMetrics;
   this.serializer = serializer;
 }
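// A short sketch of the backwards-compatibility behaviour the buffer-size line above relies on:
// SparkConf.getSizeAsKb treats a unit-less value as kilobytes, so older configurations that set
// a bare number keep working. Values here are illustrative.
import org.apache.spark.SparkConf;

public class FileBufferSizeSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf();

    conf.set("spark.shuffle.file.buffer", "64"); // no unit: read as 64 KB
    System.out.println(conf.getSizeAsKb("spark.shuffle.file.buffer", "32k")); // 64

    conf.set("spark.shuffle.file.buffer", "1m"); // explicit unit suffixes also work
    System.out.println(conf.getSizeAsKb("spark.shuffle.file.buffer", "32k")); // 1024

    // The constructors above multiply the kilobyte value by 1024 to get a byte count.
    int fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
    System.out.println(fileBufferSize); // 1048576
  }
}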
Example #6
 @Test
 public void emptyConfigurationVariablesOnlyForNonSparkProperties() {
   Properties intpProperty = repl.getProperty();
   SparkConf sparkConf = repl.getSparkContext().getConf();
   for (Object oKey : intpProperty.keySet()) {
     String key = (String) oKey;
     String value = (String) intpProperty.get(key);
     LOGGER.debug(String.format("[%s]: [%s]", key, value));
     if (key.startsWith("spark.") && value.isEmpty()) {
       assertTrue(
           String.format("configuration starting from 'spark.' should not be empty. [%s]", key),
           !sparkConf.contains(key) || !sparkConf.get(key).isEmpty());
     }
   }
 }
 public UnsafeShuffleWriter(
     BlockManager blockManager,
     IndexShuffleBlockResolver shuffleBlockResolver,
     TaskMemoryManager memoryManager,
     ShuffleMemoryManager shuffleMemoryManager,
     UnsafeShuffleHandle<K, V> handle,
     int mapId,
     TaskContext taskContext,
     SparkConf sparkConf)
     throws IOException {
   final int numPartitions = handle.dependency().partitioner().numPartitions();
   if (numPartitions > UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()) {
     throw new IllegalArgumentException(
         "UnsafeShuffleWriter can only be used for shuffles with at most "
             + UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()
             + " reduce partitions");
   }
   this.blockManager = blockManager;
   this.shuffleBlockResolver = shuffleBlockResolver;
   this.memoryManager = memoryManager;
   this.shuffleMemoryManager = shuffleMemoryManager;
   this.mapId = mapId;
   final ShuffleDependency<K, V, V> dep = handle.dependency();
   this.shuffleId = dep.shuffleId();
   this.serializer = Serializer.getSerializer(dep.serializer()).newInstance();
   this.partitioner = dep.partitioner();
   this.writeMetrics = new ShuffleWriteMetrics();
   taskContext.taskMetrics().shuffleWriteMetrics_$eq(Option.apply(writeMetrics));
   this.taskContext = taskContext;
   this.sparkConf = sparkConf;
   this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
   open();
 }
  private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
    System.out.println("Creating new context");
    // final File outputFile = new File("/flume_recover");
    // if (outputFile.exists()) {
    // outputFile.delete();
    // }

    SparkConf conf =
        new SparkConf()
            .setMaster("local[2]")
            .setAppName("Stream File")
            .set("spark.driver.allowMultipleContexts", "true");
    conf.set("spark.serializer", KryoSerializer.class.getName());
    conf.set("es.index.auto.create", "true");
    conf.set("es.nodes", "10.26.1.134:9200");
    conf.set("es.resource", "flume/test");
    conf.set("es.input.json", "true");

    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
    jssc.checkpoint(checkpointDirectory);

    JavaDStream<String> textFile = jssc.textFileStream(input);
    JavaDStream<String> jsonStr =
        textFile.map(
            new Function<String, String>() {
              public String call(String arg0) throws Exception {
                Matcher m = log.matcher(arg0);
                if (m.find()) {
                  return transferJson(m);
                }
                return "";
              }
            });
    jsonStr.print();

     jsonStr.foreachRDD(
        new Function<JavaRDD<String>, Void>() {
          public Void call(JavaRDD<String> arg0) throws Exception {
             if (arg0 != null && !arg0.isEmpty()) {
              JavaEsSpark.saveToEs(arg0, "flume/test");
            }
            return null;
          }
        });

    return jssc;
  }
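// A hedged usage sketch for createContext: pairing it with JavaStreamingContext.getOrCreate lets
// a restarted driver recover from the checkpoint instead of rebuilding the context from scratch.
// JavaStreamingContextFactory (org.apache.spark.streaming.api.java) is the Spark 1.x API this
// example appears to target; the paths below are placeholders.
public static void main(String[] args) {
  final String input = "/tmp/flume-input";
  final String checkpointDirectory = "/tmp/flume-checkpoint";
  JavaStreamingContext jssc =
      JavaStreamingContext.getOrCreate(
          checkpointDirectory,
          new JavaStreamingContextFactory() {
            @Override
            public JavaStreamingContext create() {
              // Invoked only when no checkpoint data exists yet.
              return createContext(input, checkpointDirectory);
            }
          });
  jssc.start();
  jssc.awaitTermination();
}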
Example #9
  private static void init() {
    SparkConf conf = new SparkConf();
    conf.setAppName("binend countByValue");
    conf.setMaster("spark://localhost:7077");

    JavaSparkContext jsc = new JavaSparkContext(conf);
    jsc.addJar(
        "/home/titanic/soft/intelijWorkspace/github-spark/com-hadoop-spark/target/com-hadoop-spark-1.0-SNAPSHOT.jar");

    List<Integer> list = new ArrayList<Integer>();

    for (int x = 0; x <= 10; x++) {
      list.add(x);
    }

    JavaRDD<Integer> rdd = jsc.parallelize(list);
    Map<Integer, Long> map = rdd.countByValue();
    System.out.println(map);
  }
Example #10
  public static void main(String[] args) throws Exception {

    Schema schema =
        new Schema.Builder()
            .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
            .addColumnInteger("Species")
            .build();

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DataVec Example");

    JavaSparkContext sc = new JavaSparkContext(conf);

    String directory =
        new ClassPathResource("IrisData/iris.txt")
            .getFile()
            .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..."
    JavaRDD<String> stringData = sc.textFile(directory);

    // We first need to parse this comma-delimited (CSV) format; we can do this using
    // CSVRecordReader:
    RecordReader rr = new CSVRecordReader();
    JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));

    int maxHistogramBuckets = 10;
    DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets);

    System.out.println(dataAnalysis);

    // We can get statistics on a per-column basis:
    DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length");
    double minValue = da.getMin();
    double maxValue = da.getMax();
    double mean = da.getMean();

    HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html"));

    // To write to HDFS instead:
    // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
    // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc);
  }
  public static void main(String[] args) {
    SparkConf conf =
        new SparkConf()
            .setMaster("local[1]")
            .setAppName(RDDParallelizeSample.class.getSimpleName());
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // create a List of Characters
    List<Character> characterList = new ArrayList<Character>();
    characterList.addAll(
        Arrays.asList(
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q'));

    // create an RDD from an internal List using parallelize method
    JavaRDD<Character> characterRDD = sc.parallelize(characterList);

    System.out.println("list size : " + characterList.size());
    System.out.println("rdd size : " + characterRDD.count());

    System.out.println("list content : " + characterList);
    System.out.println("rdd content : " + characterRDD.collect());
  }
Example #12
  @Override
  public int run(SparkConf conf, CommandLine cli) throws Exception {

    long startMs = System.currentTimeMillis();

    conf.set("spark.ui.enabled", "false");

    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    long diffMs = (System.currentTimeMillis() - startMs);
    System.out.println(">> took " + diffMs + " ms to create SQLContext");

    Map<String, String> options = new HashMap<>();
    options.put("zkhost", "localhost:9983");
    options.put("collection", "ml20news");
    options.put("query", "content_txt:[* TO *]");
    options.put("fields", "content_txt");

    DataFrame solrData = sqlContext.read().format("solr").options(options).load();
    DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt");
    List<Row> rows = sample.collectAsList();
    System.out.println(">> loaded " + rows.size() + " docs to classify");

    StructType schema = sample.schema();

    CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model");
    PipelineModel bestModel = (PipelineModel) cvModel.bestModel();

    int r = 0;
    startMs = System.currentTimeMillis();
    for (Row next : rows) {
      Row oneRow = RowFactory.create(next.getString(0));
      DataFrame oneRowDF =
          sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema);
      DataFrame scored = bestModel.transform(oneRowDF);
      Row scoredRow = scored.collect()[0];
      String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel"));

      // an actual app would save the predictedLabel
      // System.out.println(">> for row["+r+"], model returned "+scoredRows.length+" rows,
      // "+scoredRows[0]);

      r++;
    }
    diffMs = (System.currentTimeMillis() - startMs);
    System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs");

    return 0;
  }
  public static void main(String args[]) {
    SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]");
    conf.set("es.index.auto.create", "true");
    JavaSparkContext context = new JavaSparkContext(conf);

    JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv");

    JavaRDD<Crime> dataSplits =
        textFile.map(
            line -> {
              CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180);
              Crime c = new Crime();
              CSVRecord record = parser.getRecords().get(0);
              c.setId(record.get(0));
              c.setCaseNumber(record.get(1));
              c.setEventDate(record.get(2));
              c.setBlock(record.get(3));
              c.setIucr(record.get(4));
              c.setPrimaryType(record.get(5));
              c.setDescription(record.get(6));
              c.setLocation(record.get(7));
              c.setArrest(Boolean.parseBoolean(record.get(8)));
              c.setDomestic(Boolean.parseBoolean(record.get(9)));
              String lat = record.get(10);
              String lon = record.get(11);
              Map<String, Double> geoLocation = new HashMap<>();
              geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat));
              geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon));
              c.setGeoLocation(geoLocation);
              return c;
            });

    SQLContext sqlContext = new SQLContext(context);
    DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class);

    JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection");
  }
Example #14
  public SparkMapReduce(
      final SparkConf conf,
      final String name,
      final IMapperFunction<KEYIN, VALUEIN, K, V> pMapper,
      final IReducerFunction<K, V, KOUT, VOUT> pRetucer,
      IPartitionFunction<K> pPartitioner,
      IKeyValueConsumer<KOUT, VOUT>... pConsumer) {
    setMap(pMapper);
    setReduce(pRetucer);
    setPartitioner(pPartitioner);

    for (int i = 0; i < pConsumer.length; i++) {
      IKeyValueConsumer<KOUT, VOUT> cns = pConsumer[i];
      addConsumer(cns);
    }
    conf.setAppName(name);
  }
Example #15
 public UnsafeExternalSorter(
     TaskMemoryManager memoryManager,
     ShuffleMemoryManager shuffleMemoryManager,
     BlockManager blockManager,
     TaskContext taskContext,
     RecordComparator recordComparator,
     PrefixComparator prefixComparator,
     int initialSize,
     SparkConf conf)
     throws IOException {
   this.memoryManager = memoryManager;
   this.shuffleMemoryManager = shuffleMemoryManager;
   this.blockManager = blockManager;
   this.taskContext = taskContext;
   this.recordComparator = recordComparator;
   this.prefixComparator = prefixComparator;
   this.initialSize = initialSize;
   // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units
   this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
   initializeForWriting();
 }
Example #16
 private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
   if (contextOptions.getUsesProvidedSparkContext()) {
     LOG.info("Using a provided Spark Context");
     JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
     if (jsc == null || jsc.sc().isStopped()) {
       LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
       throw new RuntimeException("The provided Spark context was not created or was stopped");
     }
     return jsc;
   } else {
     LOG.info("Creating a brand new Spark Context.");
     SparkConf conf = new SparkConf();
     if (!conf.contains("spark.master")) {
       // set master if not set.
       conf.setMaster(contextOptions.getSparkMaster());
     }
     conf.setAppName(contextOptions.getAppName());
     // register immutable collections serializers because the SDK uses them.
     conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
     conf.set("spark.serializer", KryoSerializer.class.getName());
     return new JavaSparkContext(conf);
   }
 }
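// A hedged usage sketch: a hypothetical main method placed in the same class (createSparkContext
// is private), assuming Beam's SparkContextOptions and PipelineOptionsFactory API of this era;
// the master and app name are placeholders.
public static void main(String[] args) {
  SparkContextOptions contextOptions = PipelineOptionsFactory.as(SparkContextOptions.class);
  contextOptions.setSparkMaster("local[2]");
  contextOptions.setAppName("beam-on-spark");
  contextOptions.setUsesProvidedSparkContext(false); // force the "brand new Spark Context" branch
  JavaSparkContext jsc = createSparkContext(contextOptions);
  System.out.println("Spark master in use: " + jsc.sc().master());
  jsc.stop();
}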
Example #17
  /**
   * Train on the corpus
   *
   * @param rdd the rdd to train
   * @return the vocab and weights
   */
  public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
    TextPipeline pipeline = new TextPipeline(rdd);
    final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
    SparkConf conf = rdd.context().getConf();
    JavaSparkContext sc = new JavaSparkContext(rdd.context());
    vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

    final GloveWeightLookupTable gloveWeightLookupTable =
        new GloveWeightLookupTable.Builder()
            .cache(vocabAndNumWords.getFirst())
            .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
            .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
            .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
            .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
            .build();
    gloveWeightLookupTable.resetWeights();

    gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
        Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
    gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
        Nd4j.create(gloveWeightLookupTable.getSyn0().shape());

    log.info(
        "Created lookup table of size "
            + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));
    CounterMap<String, String> coOccurrenceCounts =
        rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
            .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
            .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

    List<Triple<String, String, Double>> counts = new ArrayList<>();
    Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
    while (pairIter.hasNext()) {
      Pair<String, String> pair = pairIter.next();
      counts.add(
          new Triple<>(
              pair.getFirst(),
              pair.getSecond(),
              coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
    }

    log.info("Calculated co occurrences");

    JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
    JavaPairRDD<String, Tuple2<String, Double>> pairs =
        parallel.mapToPair(
            new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
              @Override
              public Tuple2<String, Tuple2<String, Double>> call(
                  Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
                return new Tuple2<>(
                    stringStringDoubleTriple.getFirst(),
                    new Tuple2<>(
                        stringStringDoubleTriple.getFirst(), stringStringDoubleTriple.getThird()));
              }
            });

    JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
        pairs.mapToPair(
            new PairFunction<
                Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
              @Override
              public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                  Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
                return new Tuple2<>(
                    vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                    new Tuple2<>(
                        vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                        stringTuple2Tuple2._2()._2()));
              }
            });

    for (int i = 0; i < iterations; i++) {

      JavaRDD<GloveChange> change =
          pairsVocab.map(
              new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
                @Override
                public GloveChange call(
                    Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                    throws Exception {
                  VocabWord w1 = vocabWordTuple2Tuple2._1();
                  VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                  INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                  INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                  INDArray bias = gloveWeightLookupTable.getBias();
                  double score = vocabWordTuple2Tuple2._2()._2();
                  double xMax = gloveWeightLookupTable.getxMax();
                  double maxCount = gloveWeightLookupTable.getMaxCount();
                  // w1 * w2 + bias
                  double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                  prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                  double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);

                  double fDiff =
                      score > xMax ? prediction : weight * (prediction - Math.log(score));
                  if (Double.isNaN(fDiff)) fDiff = Nd4j.EPS_THRESHOLD;
                  // amount of change
                  double gradient = fDiff;
                  // update(w1,w1Vector,w2Vector,gradient);
                  // update(w2,w2Vector,w1Vector,gradient);

                  Pair<INDArray, Double> w1Update =
                      update(
                          gloveWeightLookupTable.getWeightAdaGrad(),
                          gloveWeightLookupTable.getBiasAdaGrad(),
                          gloveWeightLookupTable.getSyn0(),
                          gloveWeightLookupTable.getBias(),
                          w1,
                          w1Vector,
                          w2Vector,
                          gradient);
                  Pair<INDArray, Double> w2Update =
                      update(
                          gloveWeightLookupTable.getWeightAdaGrad(),
                          gloveWeightLookupTable.getBiasAdaGrad(),
                          gloveWeightLookupTable.getSyn0(),
                          gloveWeightLookupTable.getBias(),
                          w2,
                          w2Vector,
                          w1Vector,
                          gradient);
                  return new GloveChange(
                      w1,
                      w2,
                      w1Update.getFirst(),
                      w2Update.getFirst(),
                      w1Update.getSecond(),
                      w2Update.getSecond(),
                      fDiff);
                }
              });

      JavaRDD<Double> error =
          change.map(
              new Function<GloveChange, Double>() {
                @Override
                public Double call(GloveChange gloveChange) throws Exception {
                  gloveChange.apply(gloveWeightLookupTable);
                  return gloveChange.getError();
                }
              });

      final Accumulator<Double> d = sc.accumulator(0.0);
      error.foreach(
          new VoidFunction<Double>() {
            @Override
            public void call(Double aDouble) throws Exception {
              d.$plus$eq(aDouble);
            }
          });

      log.info("Error at iteration " + i + " was " + d.value());
    }

    return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
  }
Example #18
  /** Main method for performing the random partition based model ensembler evaluation */
  public static void main(String[] args) {

    // Construction of Spark Configuration
    SparkConf sContext = new SparkConf();
    sContext.setMaster("local[4]");
    sContext.setAppName("JavaLR");
    sContext.set("spark.executor.memory", "4G");

    // Creates the spark context
    sc = new JavaSparkContext(sContext); // "local[4]", "JavaLR");

    // Load train and test data
    JavaRDD<String> trainingData =
        readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
            .sample(false, 0.1, 11L);
    JavaRDD<String> testdata =
        readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
            .sample(false, 0.1, 11L);

    // trainingData.saveAsTextFile("/Users/erangap/Documents/ML_Project/datasets/reduced.csv");
    JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
    // points.persist(StorageLevel.MEMORY_AND_DISK());
    // System.out.println(points.first().features());
    JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());
    // testPoints.persist(StorageLevel.MEMORY_AND_DISK());

    System.out.println("Total number of records -> " + points.count());

    RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
    ensembler.setNoofModels(32);
    ensembler.setThreshold(0.499999);

    // Perform the training
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    Date trainStartTime = Calendar.getInstance().getTime();
    String trainStart = dateFormat.format(trainStartTime);
    ensembler.train(points);
    Date trainEndTime = Calendar.getInstance().getTime();
    String trainEnd = dateFormat.format(trainEndTime);

    // Training time calculations and console print
    long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
    System.out.println("Training Started at -> " + trainStart);
    System.out.println("Training Ended at -> " + trainEnd);
    System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");

    // Prepare data for testing
    JavaRDD<Double> testingLabels =
        testPoints
            .map(
                new Function<LabeledPoint, Double>() {

                  private static final long serialVersionUID = -6597374940461185814L;

                  public Double call(LabeledPoint dataPoint) throws Exception {
                    return dataPoint.label();
                  }
                })
            .cache();
    List<Double> classLabels = testingLabels.collect();

    // Perform the predictions
    Date predictStartTime = Calendar.getInstance().getTime();
    String predictStart = dateFormat.format(predictStartTime);
    List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).collect();
    Date predictEndTime = Calendar.getInstance().getTime();
    String predictEnd = dateFormat.format(predictEndTime);

    // Predict time calculations and console print
    long preditElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
    System.out.println("Prediction Started at -> " + predictStart);
    System.out.println("Prediction Ended at -> " + predictEnd);
    System.out.println("Time Taken to Predit -> " + preditElapsed + " Sec.");

    // Calculate and Display the accuracy
    System.out.println("Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
    BinaryClassificationMetrics binaryClassificationMetrics =
        getBinaryClassificationMatrix(ensembler, testPoints);
    System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
  }
Example #19
  public void run() {

    System.setProperty("spark.hadoop.dfs.replication", "2");

    Logger.getLogger("org").setLevel(Level.OFF);
    Logger.getLogger("akka").setLevel(Level.OFF);

    SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance");
    conf.set("spark.master", PropertiesStack.getProperty("spark.master"));
    conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory"));
    conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory"));
    conf.set(
        "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize"));
    // .setAppName("WindowingKafkaWordCountWithoutFaultTolerance");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic()));

    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers());
    kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect());
    kafkaParams.put("auto.offset.reset", "smallest");
    kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId());
    kafkaParams.put("auto.commit.enable", "false");

    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put(PropertiesStack.getKafkaTopic(), 1);
    //		Map<kafka.common.TopicAndPartition, java.lang.Long> fromOffsets = new HashMap<>();
    //		fromOffsets.put(new TopicAndPartition(PropertiesStack.getKafkaTopic(),
    //				1), 1000L);
    // Create direct kafka stream with brokers and topics
    //		JavaInputDStream<String> messages = KafkaUtils
    //				.createDirectStream(
    //						jssc,
    //						String.class,
    //						String.class,
    //						StringDecoder.class,
    //						StringDecoder.class,
    //						String.class,
    //						kafkaParams,
    //						fromOffsets,
    //						new Function<kafka.message.MessageAndMetadata<String, String>, String>() {
    //							@Override
    //							public String call(
    //									MessageAndMetadata<String, String> v1)
    //									throws Exception {
    //								return v1.message();
    //							}
    //						});
    JavaPairInputDStream<String, String> messages =
        KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet);
    messages.count().print();
    // .createStream(jssc, PropertiesStack.getZookeeperConnect(),
    // PropertiesStack.getKafkaGroupId(), topicMap);

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
  }
  public static void main(String[] args) throws IOException {
    Parameters param = new Parameters();
    long initTime = System.currentTimeMillis();

    SparkConf conf = new SparkConf().setAppName("StarJoin");
    JavaSparkContext sc = new JavaSparkContext(conf);

    if (param.useKryo) {
      conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
      conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
      conf.set("spark.kryoserializer.buffer.mb", param.buffer);
    }

    MyBloomFilter.BloomFilter<String> BFS =
        new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
    MyBloomFilter.BloomFilter<String> BFD =
        new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
    MyBloomFilter.BloomFilter<String> BFC =
        new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);

    JavaPairRDD<String, String> supps =
        sc.textFile(param.suppPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> s = supps.collect();
    for (int i = 0; i < s.size(); i++) {
      BFS.add(s.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

    JavaPairRDD<String, String> custs =
        sc.textFile(param.custPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> c = custs.collect();
    for (int i = 0; i < c.size(); i++) {
      BFC.add(c.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

    JavaPairRDD<String, String> dates =
        sc.textFile(param.datePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[6].equals("Dec1997");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[4]);
                  }
                });

    List<Tuple2<String, String>> d = dates.collect();
    for (int i = 0; i < d.size(); i++) {
      BFD.add(d.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

    JavaPairRDD<String, String[]> lines =
        sc.textFile(param.linePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return varC.value().contains(s[2].getBytes())
                        && varS.value().contains(s[4].getBytes())
                        && varD.value().contains(s[5].getBytes());
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String[]>() {
                  public Tuple2<String, String[]> call(String[] s) {
                    String[] v = {s[2], s[5], s[12]};
                    return new Tuple2<String, String[]>(s[4], v);
                  }
                });

    JavaPairRDD<String, String[]> result =
        lines
            .join(supps)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    result =
        result
            .join(custs)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    JavaPairRDD<String, Long> final_result =
        result
            .join(dates)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                    return new Tuple2<String, Long>(
                        s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                  }
                })
            .reduceByKey(
                new Function2<Long, Long, Long>() {
                  public Long call(Long i1, Long i2) {
                    return i1 + i2;
                  }
                });

    JavaPairRDD<String, String> sub_result =
        final_result.mapToPair(
            new PairFunction<Tuple2<String, Long>, String, String>() {
              public Tuple2<String, String> call(Tuple2<String, Long> line) {
                return new Tuple2(line._1 + "," + line._2.toString(), null);
              }
            });

    final_result =
        sub_result
            .sortByKey(new Q3Comparator())
            .mapToPair(
                new PairFunction<Tuple2<String, String>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, String> line) {
                    String[] s = line._1.split(",");
                    return new Tuple2<String, Long>(
                        s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                  }
                });

    Configuration HDFSconf = new Configuration();
    FileSystem fs = FileSystem.get(HDFSconf);
    fs.delete(new Path(param.output), true);

    final_result.saveAsTextFile(param.output);

    long finalTime = System.currentTimeMillis();
    System.out.print("Tempo total(ms): ");
    System.out.println(finalTime - initTime);

    sc.close();
  }
Example #21
 public static void create(final String master) {
   final SparkConf sparkConf = new SparkConf();
   sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
   sparkConf.setMaster(master);
   CONTEXT = SparkContext.getOrCreate(sparkConf);
 }