public static void main(String[] args) {
  if (args.length < 4) {
    System.err.println("Usage: PDCKafkaConsumer <zkQuorum> <group> <topics> <numThreads>");
    System.exit(1);
  }
  String zkQuorum = args[0];
  String kfGrp = args[1];
  String[] topics = args[2].split(",");
  int numThreads = Integer.valueOf(args[3]);
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  for (String topic : topics) {
    topicMap.put(topic, numThreads);
  }
  SparkConf conf = new SparkConf().setAppName("PDCKafkaConsumer");
  conf.set("spark.ui.port", "4040");
  JavaStreamingContext ctx = new JavaStreamingContext(conf, new Duration(10000));
  JavaPairReceiverInputDStream<String, String> kfStream =
      KafkaUtils.createStream(ctx, zkQuorum, kfGrp, topicMap);
  kfStream.saveAsHadoopFiles(
      "/phasor/pmu/pdc", "in", Text.class, Text.class, TextOutputFormat.class);
  ctx.start();
  ctx.awaitTermination();
}
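// Usage sketch (hypothetical values, assuming the main method above is packaged in a class
// named PDCKafkaConsumer and launched with spark-submit):
//
//   spark-submit --class PDCKafkaConsumer pdc-consumer.jar \
//       zk1:2181,zk2:2181 pdc-group pmu-topic1,pmu-topic2 4
//
// The four positional arguments map to the ZooKeeper quorum, the Kafka consumer group,
// a comma-separated topic list, and the number of receiver threads applied to every topic.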
/**
 * Merge zero or more spill files together, choosing the fastest merging strategy based on the
 * number of spills and the IO compression codec.
 *
 * @return the partition lengths in the merged file.
 */
private long[] mergeSpills(SpillInfo[] spills) throws IOException {
  final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
  final boolean compressionEnabled = sparkConf.getBoolean("spark.shuffle.compress", true);
  final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf);
  final boolean fastMergeEnabled =
      sparkConf.getBoolean("spark.shuffle.unsafe.fastMergeEnabled", true);
  final boolean fastMergeIsSupported =
      !compressionEnabled || compressionCodec instanceof LZFCompressionCodec;
  try {
    if (spills.length == 0) {
      new FileOutputStream(outputFile).close(); // Create an empty file
      return new long[partitioner.numPartitions()];
    } else if (spills.length == 1) {
      // Here, we don't need to perform any metrics updates because the bytes written to this
      // output file would have already been counted as shuffle bytes written.
      Files.move(spills[0].file, outputFile);
      return spills[0].partitionLengths;
    } else {
      final long[] partitionLengths;
      // There are multiple spills to merge, so none of these spill files' lengths were counted
      // towards our shuffle write count or shuffle write time. If we use the slow merge path,
      // then the final output file's size won't necessarily be equal to the sum of the spill
      // files' sizes. To guard against this case, we look at the output file's actual size when
      // computing shuffle bytes written.
      //
      // We allow the individual merge methods to report their own IO times since different merge
      // strategies use different IO techniques. We count IO during merge towards the shuffle
      // write time, which appears to be consistent with the "not bypassing merge-sort" branch in
      // ExternalSorter.
      if (fastMergeEnabled && fastMergeIsSupported) {
        // Compression is disabled or we are using an IO compression codec that supports
        // decompression of concatenated compressed streams, so we can perform a fast spill merge
        // that doesn't need to interpret the spilled bytes.
        if (transferToEnabled) {
          logger.debug("Using transferTo-based fast merge");
          partitionLengths = mergeSpillsWithTransferTo(spills, outputFile);
        } else {
          logger.debug("Using fileStream-based fast merge");
          partitionLengths = mergeSpillsWithFileStream(spills, outputFile, null);
        }
      } else {
        logger.debug("Using slow merge");
        partitionLengths = mergeSpillsWithFileStream(spills, outputFile, compressionCodec);
      }
      // When closing an UnsafeShuffleExternalSorter that has already spilled once but also has
      // in-memory records, we write out the in-memory records to a file but do not count that
      // final write as bytes spilled (instead, it's accounted as shuffle write). The merge needs
      // to be counted as shuffle write, but this will lead to double-counting of the final
      // SpillInfo's bytes.
      writeMetrics.decShuffleBytesWritten(spills[spills.length - 1].file.length());
      writeMetrics.incShuffleBytesWritten(outputFile.length());
      return partitionLengths;
    }
  } catch (IOException e) {
    if (outputFile.exists() && !outputFile.delete()) {
      logger.error("Unable to delete output file {}", outputFile.getPath());
    }
    throw e;
  }
}
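// Why the "fast merge" path above is safe with LZF: LZF output is a sequence of self-contained
// chunks, so the byte-level concatenation of several compressed files decompresses to the
// concatenation of their contents. A minimal, self-contained sketch of that property, assuming
// the ning compress-lzf classes that back Spark's LZFCompressionCodec (the class and string
// values below are illustrative, not from the original code):
import com.ning.compress.lzf.LZFInputStream;
import com.ning.compress.lzf.LZFOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

class LzfConcatenationSketch {
  public static void main(String[] args) throws IOException {
    byte[] first = compress("partition-0 records;");
    byte[] second = compress("partition-1 records;");

    // Concatenate the two compressed blobs without decompressing them, which is in spirit what
    // the transferTo/fileStream-based spill merge does with the per-partition spill regions.
    ByteArrayOutputStream merged = new ByteArrayOutputStream();
    merged.write(first);
    merged.write(second);

    // A single decompressing stream over the concatenation yields both payloads in order.
    try (InputStream in = new LZFInputStream(new ByteArrayInputStream(merged.toByteArray()))) {
      System.out.println(new String(readAll(in), StandardCharsets.UTF_8));
      // Expected output: partition-0 records;partition-1 records;
    }
  }

  private static byte[] compress(String text) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (LZFOutputStream lzf = new LZFOutputStream(bytes)) {
      lzf.write(text.getBytes(StandardCharsets.UTF_8));
    }
    return bytes.toByteArray();
  }

  private static byte[] readAll(InputStream in) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    byte[] buffer = new byte[4096];
    int n;
    while ((n = in.read(buffer)) != -1) {
      out.write(buffer, 0, n);
    }
    return out.toByteArray();
  }
}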
public static void create(final Configuration configuration) {
  final SparkConf sparkConf = new SparkConf();
  configuration
      .getKeys()
      .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
  sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
  CONTEXT = SparkContext.getOrCreate(sparkConf);
}
public SparkOperatorCreater(String appName) throws IOException {
  super(appName);
  properties = new Properties();
  properties.load(
      this.getClass().getClassLoader().getResourceAsStream("spark-cluster.properties"));
  SparkConf conf = new SparkConf().setMaster(this.getMaster()).setAppName(appName);
  conf.set("spark.streaming.ui.retainedBatches", "2000");
  jssc = new JavaStreamingContext(conf, Durations.milliseconds(this.getDurationsMilliseconds()));
}
public BypassMergeSortShuffleWriter(
    SparkConf conf,
    BlockManager blockManager,
    Partitioner partitioner,
    ShuffleWriteMetrics writeMetrics,
    Serializer serializer) {
  // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
  this.fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
  this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true);
  this.numPartitions = partitioner.numPartitions();
  this.blockManager = blockManager;
  this.partitioner = partitioner;
  this.writeMetrics = writeMetrics;
  this.serializer = serializer;
}
@Test
public void emptyConfigurationVariablesOnlyForNonSparkProperties() {
  Properties intpProperty = repl.getProperty();
  SparkConf sparkConf = repl.getSparkContext().getConf();
  for (Object oKey : intpProperty.keySet()) {
    String key = (String) oKey;
    String value = (String) intpProperty.get(key);
    LOGGER.debug(String.format("[%s]: [%s]", key, value));
    if (key.startsWith("spark.") && value.isEmpty()) {
      assertTrue(
          String.format("configuration starting from 'spark.' should not be empty. [%s]", key),
          !sparkConf.contains(key) || !sparkConf.get(key).isEmpty());
    }
  }
}
public UnsafeShuffleWriter(
    BlockManager blockManager,
    IndexShuffleBlockResolver shuffleBlockResolver,
    TaskMemoryManager memoryManager,
    ShuffleMemoryManager shuffleMemoryManager,
    UnsafeShuffleHandle<K, V> handle,
    int mapId,
    TaskContext taskContext,
    SparkConf sparkConf)
    throws IOException {
  final int numPartitions = handle.dependency().partitioner().numPartitions();
  if (numPartitions > UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()) {
    throw new IllegalArgumentException(
        "UnsafeShuffleWriter can only be used for shuffles with at most "
            + UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()
            + " reduce partitions");
  }
  this.blockManager = blockManager;
  this.shuffleBlockResolver = shuffleBlockResolver;
  this.memoryManager = memoryManager;
  this.shuffleMemoryManager = shuffleMemoryManager;
  this.mapId = mapId;
  final ShuffleDependency<K, V, V> dep = handle.dependency();
  this.shuffleId = dep.shuffleId();
  this.serializer = Serializer.getSerializer(dep.serializer()).newInstance();
  this.partitioner = dep.partitioner();
  this.writeMetrics = new ShuffleWriteMetrics();
  taskContext.taskMetrics().shuffleWriteMetrics_$eq(Option.apply(writeMetrics));
  this.taskContext = taskContext;
  this.sparkConf = sparkConf;
  this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
  open();
}
private static JavaStreamingContext createContext(String input, String checkpointDirectory) {
  System.out.println("Creating new context");
  // final File outputFile = new File("/flume_recover");
  // if (outputFile.exists()) {
  //   outputFile.delete();
  // }
  SparkConf conf =
      new SparkConf()
          .setMaster("local[2]")
          .setAppName("Stream File")
          .set("spark.driver.allowMultipleContexts", "true");
  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.index.auto.create", "true");
  conf.set("es.nodes", "10.26.1.134:9200");
  conf.set("es.resource", "flume/test");
  conf.set("es.input.json", "true");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(3000));
  jssc.checkpoint(checkpointDirectory);
  JavaDStream<String> textFile = jssc.textFileStream(input);
  JavaDStream<String> jsonStr =
      textFile.map(
          new Function<String, String>() {
            public String call(String arg0) throws Exception {
              Matcher m = log.matcher(arg0);
              if (m.find()) {
                return transferJson(m);
              }
              return "";
            }
          });
  jsonStr.print();
  jsonStr.foreach(
      new Function<JavaRDD<String>, Void>() {
        public Void call(JavaRDD<String> arg0) throws Exception {
          // Check for null before calling isEmpty() so an absent batch RDD cannot throw a NullPointerException.
          if (arg0 != null && !arg0.isEmpty()) {
            JavaEsSpark.saveToEs(arg0, "flume/test");
          }
          return null;
        }
      });
  return jssc;
}
private static void init() {
  SparkConf conf = new SparkConf();
  conf.setAppName("binend countByValue");
  conf.setMaster("spark://localhost:7077");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  jsc.addJar(
      "/home/titanic/soft/intelijWorkspace/github-spark/com-hadoop-spark/target/com-hadoop-spark-1.0-SNAPSHOT.jar");
  List<Integer> list = new ArrayList<Integer>();
  for (int x = 0; x <= 10; x++) {
    list.add(x);
  }
  JavaRDD<Integer> rdd = jsc.parallelize(list);
  Map<Integer, Long> map = rdd.countByValue();
  System.out.println(map);
}
public static void main(String[] args) throws Exception {
  Schema schema =
      new Schema.Builder()
          .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
          .addColumnInteger("Species")
          .build();
  SparkConf conf = new SparkConf();
  conf.setMaster("local[*]");
  conf.setAppName("DataVec Example");
  JavaSparkContext sc = new JavaSparkContext(conf);
  String directory =
      new ClassPathResource("IrisData/iris.txt")
          .getFile()
          .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..."
  JavaRDD<String> stringData = sc.textFile(directory);
  // We first need to parse this comma-delimited (CSV) format; we can do this using CSVRecordReader:
  RecordReader rr = new CSVRecordReader();
  JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));
  int maxHistogramBuckets = 10;
  DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets);
  System.out.println(dataAnalysis);
  // We can get statistics on a per-column basis:
  DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length");
  double minValue = da.getMin();
  double maxValue = da.getMax();
  double mean = da.getMean();
  HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html"));
  // To write to HDFS instead:
  // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
  // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc);
}
public static void main(String[] args) {
  SparkConf conf =
      new SparkConf()
          .setMaster("local[1]")
          .setAppName(RDDParallelizeSample.class.getSimpleName());
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  JavaSparkContext sc = new JavaSparkContext(conf);
  // create a List of Characters
  List<Character> characterList = new ArrayList<Character>();
  characterList.addAll(
      Arrays.asList(
          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q'));
  // create an RDD from an internal List using parallelize method
  JavaRDD<Character> characterRDD = sc.parallelize(characterList);
  System.out.println("list size : " + characterList.size());
  System.out.println("rdd size : " + characterRDD.count());
  System.out.println("list content : " + characterList);
  System.out.println("rdd content : " + characterRDD.collect());
}
@Override
public int run(SparkConf conf, CommandLine cli) throws Exception {
  long startMs = System.currentTimeMillis();
  conf.set("spark.ui.enabled", "false");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(jsc);
  long diffMs = (System.currentTimeMillis() - startMs);
  System.out.println(">> took " + diffMs + " ms to create SQLContext");
  Map<String, String> options = new HashMap<>();
  options.put("zkhost", "localhost:9983");
  options.put("collection", "ml20news");
  options.put("query", "content_txt:[* TO *]");
  options.put("fields", "content_txt");
  DataFrame solrData = sqlContext.read().format("solr").options(options).load();
  DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt");
  List<Row> rows = sample.collectAsList();
  System.out.println(">> loaded " + rows.size() + " docs to classify");
  StructType schema = sample.schema();
  CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model");
  PipelineModel bestModel = (PipelineModel) cvModel.bestModel();
  int r = 0;
  startMs = System.currentTimeMillis();
  for (Row next : rows) {
    Row oneRow = RowFactory.create(next.getString(0));
    DataFrame oneRowDF =
        sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema);
    DataFrame scored = bestModel.transform(oneRowDF);
    Row scoredRow = scored.collect()[0];
    String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel"));
    // an actual app would save the predictedLabel
    // System.out.println(">> for row["+r+"], model returned "+scoredRows.length+" rows, "+scoredRows[0]);
    r++;
  }
  diffMs = (System.currentTimeMillis() - startMs);
  System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs");
  return 0;
}
public static void main(String args[]) {
  SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]");
  conf.set("es.index.auto.create", "true");
  JavaSparkContext context = new JavaSparkContext(conf);
  JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv");
  JavaRDD<Crime> dataSplits =
      textFile.map(
          line -> {
            CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180);
            Crime c = new Crime();
            CSVRecord record = parser.getRecords().get(0);
            c.setId(record.get(0));
            c.setCaseNumber(record.get(1));
            c.setEventDate(record.get(2));
            c.setBlock(record.get(3));
            c.setIucr(record.get(4));
            c.setPrimaryType(record.get(5));
            c.setDescription(record.get(6));
            c.setLocation(record.get(7));
            c.setArrest(Boolean.parseBoolean(record.get(8)));
            c.setDomestic(Boolean.parseBoolean(record.get(9)));
            String lat = record.get(10);
            String lon = record.get(11);
            Map<String, Double> geoLocation = new HashMap<>();
            geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat));
            geoLocation.put("lon", StringUtils.isEmpty(lon) ? null : Double.parseDouble(lon));
            c.setGeoLocation(geoLocation);
            return c;
          });
  SQLContext sqlContext = new SQLContext(context);
  DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class);
  JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection");
}
public SparkMapReduce(
    final SparkConf conf,
    final String name,
    final IMapperFunction<KEYIN, VALUEIN, K, V> pMapper,
    final IReducerFunction<K, V, KOUT, VOUT> pReducer,
    IPartitionFunction<K> pPartitioner,
    IKeyValueConsumer<KOUT, VOUT>... pConsumer) {
  setMap(pMapper);
  setReduce(pReducer);
  setPartitioner(pPartitioner);
  for (int i = 0; i < pConsumer.length; i++) {
    IKeyValueConsumer<KOUT, VOUT> cns = pConsumer[i];
    addConsumer(cns);
  }
  conf.setAppName(name);
}
public UnsafeExternalSorter(
    TaskMemoryManager memoryManager,
    ShuffleMemoryManager shuffleMemoryManager,
    BlockManager blockManager,
    TaskContext taskContext,
    RecordComparator recordComparator,
    PrefixComparator prefixComparator,
    int initialSize,
    SparkConf conf)
    throws IOException {
  this.memoryManager = memoryManager;
  this.shuffleMemoryManager = shuffleMemoryManager;
  this.blockManager = blockManager;
  this.taskContext = taskContext;
  this.recordComparator = recordComparator;
  this.prefixComparator = prefixComparator;
  this.initialSize = initialSize;
  // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units
  this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
  initializeForWriting();
}
private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
  if (contextOptions.getUsesProvidedSparkContext()) {
    LOG.info("Using a provided Spark Context");
    JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
    if (jsc == null || jsc.sc().isStopped()) {
      LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
      throw new RuntimeException("The provided Spark context was not created or was stopped");
    }
    return jsc;
  } else {
    LOG.info("Creating a brand new Spark Context.");
    SparkConf conf = new SparkConf();
    if (!conf.contains("spark.master")) {
      // set master if not set.
      conf.setMaster(contextOptions.getSparkMaster());
    }
    conf.setAppName(contextOptions.getAppName());
    // register immutable collections serializers because the SDK uses them.
    conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
    conf.set("spark.serializer", KryoSerializer.class.getName());
    return new JavaSparkContext(conf);
  }
}
/**
 * Train on the corpus
 *
 * @param rdd the rdd to train
 * @return the vocab and weights
 */
public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
  TextPipeline pipeline = new TextPipeline(rdd);
  final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
  SparkConf conf = rdd.context().getConf();
  JavaSparkContext sc = new JavaSparkContext(rdd.context());
  vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());
  final GloveWeightLookupTable gloveWeightLookupTable =
      new GloveWeightLookupTable.Builder()
          .cache(vocabAndNumWords.getFirst())
          .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
          .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
          .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
          .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
          .build();
  gloveWeightLookupTable.resetWeights();
  gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
      Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
  gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
      Nd4j.create(gloveWeightLookupTable.getSyn0().shape());
  log.info(
      "Created lookup table of size "
          + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));
  CounterMap<String, String> coOccurrenceCounts =
      rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
          .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
          .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());
  List<Triple<String, String, Double>> counts = new ArrayList<>();
  Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
  while (pairIter.hasNext()) {
    Pair<String, String> pair = pairIter.next();
    counts.add(
        new Triple<>(
            pair.getFirst(),
            pair.getSecond(),
            coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
  }
  log.info("Calculated co occurrences");
  JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
  JavaPairRDD<String, Tuple2<String, Double>> pairs =
      parallel.mapToPair(
          new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
            @Override
            public Tuple2<String, Tuple2<String, Double>> call(
                Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
              return new Tuple2<>(
                  stringStringDoubleTriple.getFirst(),
                  new Tuple2<>(
                      stringStringDoubleTriple.getFirst(), stringStringDoubleTriple.getThird()));
            }
          });
  JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
      pairs.mapToPair(
          new PairFunction<
              Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
            @Override
            public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
              return new Tuple2<>(
                  vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                  new Tuple2<>(
                      vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                      stringTuple2Tuple2._2()._2()));
            }
          });
  for (int i = 0; i < iterations; i++) {
    JavaRDD<GloveChange> change =
        pairsVocab.map(
            new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
              @Override
              public GloveChange call(
                  Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                  throws Exception {
                VocabWord w1 = vocabWordTuple2Tuple2._1();
                VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                INDArray bias = gloveWeightLookupTable.getBias();
                double score = vocabWordTuple2Tuple2._2()._2();
                double xMax = gloveWeightLookupTable.getxMax();
                double maxCount = gloveWeightLookupTable.getMaxCount();
                // w1 * w2 + bias
                double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());
                double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);
                double fDiff =
                    score > xMax ? prediction : weight * (prediction - Math.log(score));
                if (Double.isNaN(fDiff)) fDiff = Nd4j.EPS_THRESHOLD;
                // amount of change
                double gradient = fDiff;
                // update(w1,w1Vector,w2Vector,gradient);
                // update(w2,w2Vector,w1Vector,gradient);
                Pair<INDArray, Double> w1Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w1,
                        w1Vector,
                        w2Vector,
                        gradient);
                Pair<INDArray, Double> w2Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w2,
                        w2Vector,
                        w1Vector,
                        gradient);
                return new GloveChange(
                    w1,
                    w2,
                    w1Update.getFirst(),
                    w2Update.getFirst(),
                    w1Update.getSecond(),
                    w2Update.getSecond(),
                    fDiff);
              }
            });
    JavaRDD<Double> error =
        change.map(
            new Function<GloveChange, Double>() {
              @Override
              public Double call(GloveChange gloveChange) throws Exception {
                gloveChange.apply(gloveWeightLookupTable);
                return gloveChange.getError();
              }
            });
    final Accumulator<Double> d = sc.accumulator(0.0);
    error.foreach(
        new VoidFunction<Double>() {
          @Override
          public void call(Double aDouble) throws Exception {
            d.$plus$eq(aDouble);
          }
        });
    log.info("Error at iteration " + i + " was " + d.value());
  }
  return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
}
/** Main method for performing the random partition based model ensembler evaluation */
public static void main(String[] args) {
  // Construction of Spark Configuration
  SparkConf sContext = new SparkConf();
  sContext.setMaster("local[4]");
  sContext.setAppName("JavaLR");
  sContext.set("spark.executor.memory", "4G");
  // Creates the spark context
  sc = new JavaSparkContext(sContext); // "local[4]", "JavaLR");
  // Load train and test data
  JavaRDD<String> trainingData =
      readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  JavaRDD<String> testdata =
      readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  // trainingData.saveAsTextFile("/Users/erangap/Documents/ML_Project/datasets/reduced.csv");
  JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
  // points.persist(StorageLevel.MEMORY_AND_DISK());
  // System.out.println(points.first().features());
  JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());
  // testPoints.persist(StorageLevel.MEMORY_AND_DISK());
  System.out.println("Total number of records -> " + points.count());
  RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
  ensembler.setNoofModels(32);
  ensembler.setThreshold(0.499999);
  // Perform the training
  DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
  Date trainStartTime = Calendar.getInstance().getTime();
  String trainStart = dateFormat.format(trainStartTime);
  ensembler.train(points);
  Date trainEndTime = Calendar.getInstance().getTime();
  String trainEnd = dateFormat.format(trainEndTime);
  // Training time calculations and console print
  long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
  System.out.println("Training Started at -> " + trainStart);
  System.out.println("Training Ended at -> " + trainEnd);
  System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");
  // Prepare data for testing
  JavaRDD<Double> testingLabels =
      testPoints
          .map(
              new Function<LabeledPoint, Double>() {
                private static final long serialVersionUID = -6597374940461185814L;

                public Double call(LabeledPoint dataPoint) throws Exception {
                  return dataPoint.label();
                }
              })
          .cache();
  List<Double> classLabels = testingLabels.toArray();
  // Perform the predictions
  Date predictStartTime = Calendar.getInstance().getTime();
  String predictStart = dateFormat.format(predictStartTime);
  List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).toArray();
  Date predictEndTime = Calendar.getInstance().getTime();
  String predictEnd = dateFormat.format(predictEndTime);
  // Prediction time calculations and console print
  long predictElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
  System.out.println("Prediction Started at -> " + predictStart);
  System.out.println("Prediction Ended at -> " + predictEnd);
  System.out.println("Time Taken to Predict -> " + predictElapsed + " Sec.");
  // Calculate and Display the accuracy
  System.out.println("Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
  BinaryClassificationMetrics binaryClassificationMetrics =
      getBinaryClassificationMatrix(ensembler, testPoints);
  System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
}
public void run() {
  System.setProperty("spark.hadoop.dfs.replication", "2");
  Logger.getLogger("org").setLevel(Level.OFF);
  Logger.getLogger("akka").setLevel(Level.OFF);
  SparkConf conf = new SparkConf().setAppName("WindowingKafkaWordCountWithFaultTolerance");
  conf.set("spark.master", PropertiesStack.getProperty("spark.master"));
  conf.set("spark.executor.memory", PropertiesStack.getProperty("spark.executor.memory"));
  conf.set("spark.driver.memory", PropertiesStack.getProperty("spark.driver.memory"));
  conf.set(
      "spark.driver.maxResultSize", PropertiesStack.getProperty("spark.driver.maxResultSize"));
  // .setAppName("WindowingKafkaWordCountWithoutFaultTolerance");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
  HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(PropertiesStack.getKafkaTopic()));
  HashMap<String, String> kafkaParams = new HashMap<String, String>();
  kafkaParams.put("metadata.broker.list", PropertiesStack.getKafkaBootstrapServers());
  kafkaParams.put("zookeeper.connect", PropertiesStack.getZookeeperConnect());
  kafkaParams.put("auto.offset.reset", "smallest");
  kafkaParams.put("group.id", PropertiesStack.getKafkaGroupId());
  kafkaParams.put("auto.commit.enable", "false");
  Map<String, Integer> topicMap = new HashMap<String, Integer>();
  topicMap.put(PropertiesStack.getKafkaTopic(), 1);
  // Map<kafka.common.TopicAndPartition, java.lang.Long> fromOffsets = new HashMap<>();
  // fromOffsets.put(new TopicAndPartition(PropertiesStack.getKafkaTopic(), 1), 1000L);
  // Create direct kafka stream with brokers and topics
  // JavaInputDStream<String> messages = KafkaUtils
  //     .createDirectStream(
  //         jssc,
  //         String.class,
  //         String.class,
  //         StringDecoder.class,
  //         StringDecoder.class,
  //         String.class,
  //         kafkaParams,
  //         fromOffsets,
  //         new Function<kafka.message.MessageAndMetadata<String, String>, String>() {
  //           @Override
  //           public String call(MessageAndMetadata<String, String> v1) throws Exception {
  //             return v1.message();
  //           }
  //         });
  JavaPairInputDStream<String, String> messages =
      KafkaUtils.createDirectStream(
          jssc,
          String.class,
          String.class,
          StringDecoder.class,
          StringDecoder.class,
          kafkaParams,
          topicsSet);
  messages.count().print();
  // .createStream(jssc, PropertiesStack.getZookeeperConnect(),
  //     PropertiesStack.getKafkaGroupId(), topicMap);
  // Start the computation
  jssc.start();
  jssc.awaitTermination();
}
public static void main(String[] args) throws IOException {
  Parameters param = new Parameters();
  long initTime = System.currentTimeMillis();
  SparkConf conf = new SparkConf().setAppName("StarJoin");
  JavaSparkContext sc = new JavaSparkContext(conf);
  if (param.useKryo) {
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
    conf.set("spark.kryoserializer.buffer.mb", param.buffer);
  }
  MyBloomFilter.BloomFilter<String> BFS =
      new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
  MyBloomFilter.BloomFilter<String> BFD =
      new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
  MyBloomFilter.BloomFilter<String> BFC =
      new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);
  JavaPairRDD<String, String> supps =
      sc.textFile(param.suppPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });
  List<Tuple2<String, String>> s = supps.collect();
  for (int i = 0; i < s.size(); i++) {
    BFS.add(s.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);
  JavaPairRDD<String, String> custs =
      sc.textFile(param.custPath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[3]);
                }
              });
  List<Tuple2<String, String>> c = custs.collect();
  for (int i = 0; i < c.size(); i++) {
    BFC.add(c.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);
  JavaPairRDD<String, String> dates =
      sc.textFile(param.datePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return s[6].equals("Dec1997");
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String>() {
                public Tuple2<String, String> call(String[] s) {
                  return new Tuple2<String, String>(s[0], s[4]);
                }
              });
  List<Tuple2<String, String>> d = dates.collect();
  for (int i = 0; i < d.size(); i++) {
    BFD.add(d.get(i)._1);
  }
  final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);
  JavaPairRDD<String, String[]> lines =
      sc.textFile(param.linePath)
          .map(
              new Function<String, String[]>() {
                public String[] call(String line) {
                  return line.split("\\|");
                }
              })
          .filter(
              new Function<String[], Boolean>() {
                public Boolean call(String[] s) {
                  return varC.value().contains(s[2].getBytes())
                      & varS.value().contains(s[4].getBytes())
                      & varD.value().contains(s[5].getBytes());
                }
              })
          .mapToPair(
              new PairFunction<String[], String, String[]>() {
                public Tuple2<String, String[]> call(String[] s) {
                  String[] v = {s[2], s[5], s[12]};
                  return new Tuple2<String, String[]>(s[4], v);
                }
              });
  JavaPairRDD<String, String[]> result =
      lines
          .join(supps)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });
  result =
      result
          .join(custs)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                  String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                  return new Tuple2<String, String[]>(s._2._1[0], v);
                }
              });
  JavaPairRDD<String, Long> final_result =
      result
          .join(dates)
          .mapToPair(
              new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                  return new Tuple2<String, Long>(
                      s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                }
              })
          .reduceByKey(
              new Function2<Long, Long, Long>() {
                public Long call(Long i1, Long i2) {
                  return i1 + i2;
                }
              });
  JavaPairRDD<String, String> sub_result =
      final_result.mapToPair(
          new PairFunction<Tuple2<String, Long>, String, String>() {
            public Tuple2<String, String> call(Tuple2<String, Long> line) {
              return new Tuple2(line._1 + "," + line._2.toString(), null);
            }
          });
  final_result =
      sub_result
          .sortByKey(new Q3Comparator())
          .mapToPair(
              new PairFunction<Tuple2<String, String>, String, Long>() {
                public Tuple2<String, Long> call(Tuple2<String, String> line) {
                  String[] s = line._1.split(",");
                  return new Tuple2<String, Long>(
                      s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                }
              });
  Configuration HDFSconf = new Configuration();
  FileSystem fs = FileSystem.get(HDFSconf);
  fs.delete(new Path(param.output), true);
  final_result.saveAsTextFile(param.output);
  long finalTime = System.currentTimeMillis();
  System.out.print("Total time (ms): ");
  System.out.println(finalTime - initTime);
  sc.close();
}
public static void create(final String master) {
  final SparkConf sparkConf = new SparkConf();
  sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
  sparkConf.setMaster(master);
  CONTEXT = SparkContext.getOrCreate(sparkConf);
}
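// Usage sketch for the two create(...) overloads above (a minimal sketch, assuming they live in
// TinkerPop's org.apache.tinkerpop.gremlin.spark.structure.Spark helper class; the wrapper class
// name and property values below are illustrative, not from the original code):
import org.apache.commons.configuration.BaseConfiguration;
import org.apache.tinkerpop.gremlin.spark.structure.Spark;

public class SparkContextCreateExample {
  public static void main(String[] args) {
    // Simplest form: only the master URL is supplied and the app name is set internally.
    Spark.create("local[4]");

    // Configuration-driven form: every key in the Commons Configuration is copied into the
    // SparkConf. Note that SparkContext.getOrCreate reuses a context that is already running,
    // so in practice an application would call only one of these.
    BaseConfiguration configuration = new BaseConfiguration();
    configuration.setProperty("spark.master", "local[4]");
    configuration.setProperty("spark.executor.memory", "1g");
    Spark.create(configuration);
  }
}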