public static void main(String[] args) {
  // Create a Java Spark context
  SparkConf conf = new SparkConf().setAppName("Accumulators");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // Create an accumulator to keep track of the number of blank lines in callSigns.txt
  final Accumulator<Integer> blankLines = sc.accumulator(0);

  JavaRDD<String> input = sc.textFile("src/main/resources/callSigns.txt");
  JavaRDD<String> callSigns = input.flatMap(
      new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) throws Exception {
          if (s.equals("")) {
            blankLines.add(1);
          }
          return Arrays.asList(s.split(" "));
        }
      });

  callSigns.saveAsTextFile("Chapter5-Output");
  System.out.println("Number of blank lines present in text file: " + blankLines.value());
}
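Note that the snippet above updates the accumulator inside a transformation (flatMap), where Spark may re-apply updates if tasks are retried, and the count is only populated once saveAsTextFile triggers execution. A minimal sketch of the more conservative pattern, counting inside an action, is shown below; the class and method names are illustrative, and an existing JavaSparkContext is assumed.

// Sketch only: counting blank lines inside an action (foreach). For a successful job,
// accumulator updates made inside actions are applied exactly once per task, whereas
// updates made inside transformations (like the flatMap above) can be re-applied on retries.
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class BlankLineCount {
  public static int countBlankLines(JavaSparkContext sc, String path) {
    final Accumulator<Integer> blankLines = sc.accumulator(0);
    JavaRDD<String> input = sc.textFile(path);
    input.foreach(
        new VoidFunction<String>() {
          @Override
          public void call(String line) throws Exception {
            if (line.isEmpty()) {
              blankLines.add(1);
            }
          }
        });
    // foreach is an action, so the accumulator is populated by the time value() is read here.
    return blankLines.value();
  }
}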
public SparkRuntime(
    SparkPipeline pipeline,
    JavaSparkContext sparkContext,
    Configuration conf,
    Map<PCollectionImpl<?>, Set<Target>> outputTargets,
    Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize,
    Map<PCollection<?>, StorageLevel> toCache,
    Map<PipelineCallable<?>, Set<Target>> allPipelineCallables) {
  this.pipeline = pipeline;
  this.sparkContext = sparkContext;
  this.conf = conf;
  // Accumulator of named counters, keyed by counter group and counter name
  this.counters = sparkContext.accumulator(
      Maps.<String, Map<String, Long>>newHashMap(), new CounterAccumulatorParam());
  this.ctxt = new SparkRuntimeContext(
      sparkContext.appName(),
      counters,
      sparkContext.broadcast(WritableUtils.toByteArray(conf)));
  this.outputTargets = Maps.newTreeMap(DEPTH_COMPARATOR);
  this.outputTargets.putAll(outputTargets);
  this.toMaterialize = toMaterialize;
  this.toCache = toCache;
  this.allPipelineCallables = allPipelineCallables;
  this.activePipelineCallables = allPipelineCallables.keySet();
  this.status.set(Status.READY);
  this.monitorThread = new Thread(
      new Runnable() {
        @Override
        public void run() {
          monitorLoop();
        }
      });
}
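The CounterAccumulatorParam referenced above is Crunch-specific and not shown here. As a point of reference, a minimal sketch of what such an AccumulatorParam for nested counter maps could look like under the Spark 1.x accumulator API follows; the class name and structure are assumptions, not the actual Crunch implementation.

// Hypothetical sketch: an AccumulatorParam that merges nested counter maps
// (group name -> counter name -> count), summing counts per counter.
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.AccumulatorParam;

public class CounterMapAccumulatorParam implements AccumulatorParam<Map<String, Map<String, Long>>> {

  @Override
  public Map<String, Map<String, Long>> zero(Map<String, Map<String, Long>> initial) {
    // Each task starts from an empty set of counters.
    return new HashMap<String, Map<String, Long>>();
  }

  @Override
  public Map<String, Map<String, Long>> addInPlace(
      Map<String, Map<String, Long>> left, Map<String, Map<String, Long>> right) {
    // Merge the right-hand counters into the left-hand map.
    for (Map.Entry<String, Map<String, Long>> group : right.entrySet()) {
      Map<String, Long> target = left.get(group.getKey());
      if (target == null) {
        target = new HashMap<String, Long>();
        left.put(group.getKey(), target);
      }
      for (Map.Entry<String, Long> counter : group.getValue().entrySet()) {
        Long current = target.get(counter.getKey());
        target.put(counter.getKey(),
            current == null ? counter.getValue() : current + counter.getValue());
      }
    }
    return left;
  }

  @Override
  public Map<String, Map<String, Long>> addAccumulator(
      Map<String, Map<String, Long>> value, Map<String, Map<String, Long>> delta) {
    return addInPlace(value, delta);
  }
}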
/**
 * @param pfid
 * @param itervar
 * @param matrixvar
 * @param program
 * @param resultFile
 * @param input
 * @param ec
 * @param dpf
 * @param oi
 * @param tSparseCol
 * @param enableCPCaching
 * @param numReducers
 * @return
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
public static RemoteParForJobReturn runJob(
    long pfid,
    String itervar,
    String matrixvar,
    String program,
    String resultFile,
    MatrixObject input,
    ExecutionContext ec,
    PDataPartitionFormat dpf,
    OutputInfo oi,
    boolean tSparseCol, // config params
    boolean enableCPCaching,
    int numReducers) // opt params
    throws DMLRuntimeException, DMLUnsupportedOperationException {
  String jobname = "ParFor-DPESP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  SparkExecutionContext sec = (SparkExecutionContext) ec;
  JavaSparkContext sc = sec.getSparkContext();

  // prepare input parameters
  MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
  MatrixCharacteristics mc = md.getMatrixCharacteristics();
  InputInfo ii = InputInfo.BinaryBlockInputInfo;

  // initialize accumulators for tasks/iterations
  Accumulator<Integer> aTasks = sc.accumulator(0);
  Accumulator<Integer> aIters = sc.accumulator(0);

  JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
  DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
  RemoteDPParForSparkWorker efun = new RemoteDPParForSparkWorker(
      program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);

  List<Tuple2<Long, String>> out =
      in.flatMapToPair(dpfun)       // partition the input blocks
        .groupByKey(numReducers)    // group partition blocks
        .mapPartitionsToPair(efun)  // execute parfor tasks, incl. cleanup
        .collect();                 // get output handles

  // de-serialize results
  LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
  int numTasks = aTasks.value(); // get accumulator value
  int numIters = aIters.value(); // get accumulator value

  // create output symbol table entries
  RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

  // maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if (DMLScript.STATISTICS) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }

  return ret;
}
@Test
public void testForeach() {
  final Accumulator<Integer> accum = jsc.accumulator(0);
  List<String> data = Arrays.asList("a", "b", "c");
  Dataset<String> ds = context.createDataset(data, Encoders.STRING());

  ds.foreach(
      new ForeachFunction<String>() {
        @Override
        public void call(String s) throws Exception {
          accum.add(1);
        }
      });
  Assert.assertEquals(3, accum.value().intValue());
}
/**
 * @param pfid
 * @param program
 * @param tasks
 * @param ec
 * @param cpCaching
 * @param numMappers
 * @return
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
public static RemoteParForJobReturn runJob(
    long pfid,
    String program,
    List<Task> tasks,
    ExecutionContext ec,
    boolean cpCaching,
    int numMappers)
    throws DMLRuntimeException, DMLUnsupportedOperationException {
  String jobname = "ParFor-ESP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  SparkExecutionContext sec = (SparkExecutionContext) ec;
  JavaSparkContext sc = sec.getSparkContext();

  // initialize accumulators for tasks/iterations
  Accumulator<Integer> aTasks = sc.accumulator(0);
  Accumulator<Integer> aIters = sc.accumulator(0);

  // run remote_spark parfor job
  // (w/o lazy evaluation to fit existing parfor framework, e.g., result merge)
  RemoteParForSparkWorker func = new RemoteParForSparkWorker(program, cpCaching, aTasks, aIters);
  List<Tuple2<Long, String>> out =
      sc.parallelize(tasks, numMappers) // create RDD of parfor tasks
        .flatMapToPair(func)            // execute parfor tasks
        .collect();                     // get output handles

  // de-serialize results
  LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
  int numTasks = aTasks.value(); // get accumulator value
  int numIters = aIters.value(); // get accumulator value

  // create output symbol table entries
  RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

  // maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if (DMLScript.STATISTICS) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }

  return ret;
}
/**
 * Train on the corpus
 *
 * @param rdd the rdd to train on
 * @return the vocab and weights
 */
public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
  TextPipeline pipeline = new TextPipeline(rdd);
  final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
  SparkConf conf = rdd.context().getConf();
  JavaSparkContext sc = new JavaSparkContext(rdd.context());
  vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

  final GloveWeightLookupTable gloveWeightLookupTable =
      new GloveWeightLookupTable.Builder()
          .cache(vocabAndNumWords.getFirst())
          .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
          .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
          .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
          .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
          .build();
  gloveWeightLookupTable.resetWeights();

  gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
      Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
  gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
      Nd4j.create(gloveWeightLookupTable.getSyn0().shape());

  log.info(
      "Created lookup table of size "
          + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));

  // compute co-occurrence counts over the tokenized corpus
  CounterMap<String, String> coOccurrenceCounts =
      rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
          .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
          .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

  List<Triple<String, String, Double>> counts = new ArrayList<>();
  Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
  while (pairIter.hasNext()) {
    Pair<String, String> pair = pairIter.next();
    counts.add(
        new Triple<>(
            pair.getFirst(),
            pair.getSecond(),
            coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
  }

  log.info("Calculated co-occurrences");

  JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
  JavaPairRDD<String, Tuple2<String, Double>> pairs =
      parallel.mapToPair(
          new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
            @Override
            public Tuple2<String, Tuple2<String, Double>> call(
                Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
              return new Tuple2<>(
                  stringStringDoubleTriple.getFirst(),
                  new Tuple2<>(
                      stringStringDoubleTriple.getFirst(),
                      stringStringDoubleTriple.getThird()));
            }
          });

  JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
      pairs.mapToPair(
          new PairFunction<
              Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
            @Override
            public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
              return new Tuple2<>(
                  vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                  new Tuple2<>(
                      vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                      stringTuple2Tuple2._2()._2()));
            }
          });

  for (int i = 0; i < iterations; i++) {
    JavaRDD<GloveChange> change =
        pairsVocab.map(
            new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
              @Override
              public GloveChange call(
                  Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                  throws Exception {
                VocabWord w1 = vocabWordTuple2Tuple2._1();
                VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                INDArray bias = gloveWeightLookupTable.getBias();
                double score = vocabWordTuple2Tuple2._2()._2();
                double xMax = gloveWeightLookupTable.getxMax();
                double maxCount = gloveWeightLookupTable.getMaxCount();

                // w1 * w2 + bias
                double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);
                double fDiff =
                    score > xMax ? prediction : weight * (prediction - Math.log(score));
                if (Double.isNaN(fDiff)) {
                  fDiff = Nd4j.EPS_THRESHOLD;
                }

                // amount of change
                double gradient = fDiff;

                Pair<INDArray, Double> w1Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w1,
                        w1Vector,
                        w2Vector,
                        gradient);
                Pair<INDArray, Double> w2Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w2,
                        w2Vector,
                        w1Vector,
                        gradient);
                return new GloveChange(
                    w1,
                    w2,
                    w1Update.getFirst(),
                    w2Update.getFirst(),
                    w1Update.getSecond(),
                    w2Update.getSecond(),
                    fDiff);
              }
            });

    JavaRDD<Double> error =
        change.map(
            new Function<GloveChange, Double>() {
              @Override
              public Double call(GloveChange gloveChange) throws Exception {
                gloveChange.apply(gloveWeightLookupTable);
                return gloveChange.getError();
              }
            });

    // accumulate the per-pair error on the driver
    final Accumulator<Double> d = sc.accumulator(0.0);
    error.foreach(
        new VoidFunction<Double>() {
          @Override
          public void call(Double aDouble) throws Exception {
            d.add(aDouble);
          }
        });

    log.info("Error at iteration " + i + " was " + d.value());
  }

  return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
}
private void setup() {
  // Set up accumulators and broadcast stopwords
  this.sc = new JavaSparkContext(corpusRDD.context());
  this.wordFreqAcc = sc.accumulator(new Counter<String>(), new WordFreqAccumulator());
  this.stopWordBroadCast = sc.broadcast(stopWords);
}
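The WordFreqAccumulator above merges library-specific Counter<String> instances and is not shown here. A minimal, hypothetical stand-in that uses a plain Map<String, Long> accumulator illustrates the same word-frequency pattern; the names WordFreqMapAccumulator and countTokens are illustrative, not part of the library's API.

// Hypothetical sketch only: a word-frequency accumulator backed by Map<String, Long>,
// standing in for the Counter<String>-based WordFreqAccumulator above.
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.Accumulator;
import org.apache.spark.AccumulatorParam;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

public class WordFreqSketch {

  // Merges partial word-frequency maps by summing counts per token.
  public static class WordFreqMapAccumulator implements AccumulatorParam<Map<String, Long>> {
    @Override
    public Map<String, Long> zero(Map<String, Long> initial) {
      return new HashMap<String, Long>();
    }

    @Override
    public Map<String, Long> addInPlace(Map<String, Long> left, Map<String, Long> right) {
      for (Map.Entry<String, Long> e : right.entrySet()) {
        Long current = left.get(e.getKey());
        left.put(e.getKey(), current == null ? e.getValue() : current + e.getValue());
      }
      return left;
    }

    @Override
    public Map<String, Long> addAccumulator(Map<String, Long> value, Map<String, Long> delta) {
      return addInPlace(value, delta);
    }
  }

  // Counts whitespace-separated tokens; the accumulator value is only reliable on the
  // driver after the foreach action has completed.
  public static Map<String, Long> countTokens(JavaSparkContext sc, JavaRDD<String> lines) {
    final Accumulator<Map<String, Long>> wordFreq =
        sc.accumulator(new HashMap<String, Long>(), new WordFreqMapAccumulator());
    lines.foreach(
        new VoidFunction<String>() {
          @Override
          public void call(String line) throws Exception {
            Map<String, Long> delta = new HashMap<String, Long>();
            for (String token : line.split(" ")) {
              Long current = delta.get(token);
              delta.put(token, current == null ? 1L : current + 1L);
            }
            wordFreq.add(delta);
          }
        });
    return wordFreq.value();
  }
}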