/**
 * @param pfid
 * @param itervar
 * @param matrixvar
 * @param program
 * @param resultFile
 * @param input
 * @param ec
 * @param dpf
 * @param oi
 * @param tSparseCol
 * @param enableCPCaching
 * @param numReducers
 * @return
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
public static RemoteParForJobReturn runJob(
    long pfid,
    String itervar,
    String matrixvar,
    String program,
    String resultFile,
    MatrixObject input,
    ExecutionContext ec,
    PDataPartitionFormat dpf,
    OutputInfo oi,
    boolean tSparseCol, // config params
    boolean enableCPCaching,
    int numReducers) // opt params
    throws DMLRuntimeException, DMLUnsupportedOperationException {
  String jobname = "ParFor-DPESP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  SparkExecutionContext sec = (SparkExecutionContext) ec;
  JavaSparkContext sc = sec.getSparkContext();

  // prepare input parameters
  MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
  MatrixCharacteristics mc = md.getMatrixCharacteristics();
  InputInfo ii = InputInfo.BinaryBlockInputInfo;

  // initialize accumulators for tasks/iterations
  Accumulator<Integer> aTasks = sc.accumulator(0);
  Accumulator<Integer> aIters = sc.accumulator(0);

  JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
  DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
  RemoteDPParForSparkWorker efun =
      new RemoteDPParForSparkWorker(
          program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
  List<Tuple2<Long, String>> out =
      in.flatMapToPair(dpfun)       // partition the input blocks
          .groupByKey(numReducers)  // group partition blocks
          .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup
          .collect();               // get output handles

  // de-serialize results
  LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
  int numTasks = aTasks.value(); // get accumulator value
  int numIters = aIters.value(); // get accumulator value

  // create output symbol table entries
  RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

  // maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if (DMLScript.STATISTICS) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }

  return ret;
}
@Test
public void testForeach() {
  final Accumulator<Integer> accum = jsc.accumulator(0);
  List<String> data = Arrays.asList("a", "b", "c");
  Dataset<String> ds = context.createDataset(data, Encoders.STRING());

  ds.foreach(
      new ForeachFunction<String>() {
        @Override
        public void call(String s) throws Exception {
          accum.add(1);
        }
      });
  Assert.assertEquals(3, accum.value().intValue());
}
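If the build targets Java 8, the same check can be written with a lambda; the explicit ForeachFunction cast disambiguates the overloaded Dataset.foreach. A hypothetical variant (testForeachLambda is not part of the suite above; it reuses jsc and context from the same test class):

@Test
public void testForeachLambda() {
  final Accumulator<Integer> accum = jsc.accumulator(0);
  Dataset<String> ds = context.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING());

  // the cast selects the Java ForeachFunction overload over the Scala Function1 one
  ds.foreach((ForeachFunction<String>) s -> accum.add(1));
  Assert.assertEquals(3, accum.value().intValue());
}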
/**
 * @param pfid
 * @param program
 * @param tasks
 * @param ec
 * @param cpCaching
 * @param numMappers
 * @return
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
public static RemoteParForJobReturn runJob(
    long pfid,
    String program,
    List<Task> tasks,
    ExecutionContext ec,
    boolean cpCaching,
    int numMappers)
    throws DMLRuntimeException, DMLUnsupportedOperationException {
  String jobname = "ParFor-ESP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  SparkExecutionContext sec = (SparkExecutionContext) ec;
  JavaSparkContext sc = sec.getSparkContext();

  // initialize accumulators for tasks/iterations
  Accumulator<Integer> aTasks = sc.accumulator(0);
  Accumulator<Integer> aIters = sc.accumulator(0);

  // run remote_spark parfor job
  // (w/o lazy evaluation to fit existing parfor framework, e.g., result merge)
  RemoteParForSparkWorker func = new RemoteParForSparkWorker(program, cpCaching, aTasks, aIters);
  List<Tuple2<Long, String>> out =
      sc.parallelize(tasks, numMappers) // create rdd of parfor tasks
          .flatMapToPair(func)          // execute parfor tasks
          .collect();                   // get output handles

  // de-serialize results
  LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
  int numTasks = aTasks.value(); // get accumulator value
  int numIters = aIters.value(); // get accumulator value

  // create output symbol table entries
  RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

  // maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if (DMLScript.STATISTICS) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }

  return ret;
}
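The two accumulators above are incremented on the executors and only read on the driver after collect() has forced the job. A hypothetical, simplified worker sketch illustrating that wiring (Spark 1.x PairFlatMapFunction signature, with a plain String standing in for the project's Task type; this is not the actual RemoteParForSparkWorker):

import java.util.Collections;
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

public class ParForWorkerSketch implements PairFlatMapFunction<String, Long, String> {
  private static final long serialVersionUID = 1L;

  private final Accumulator<Integer> _aTasks;
  private final Accumulator<Integer> _aIters;

  // the accumulators created on the driver are captured via the constructor
  public ParForWorkerSketch(Accumulator<Integer> aTasks, Accumulator<Integer> aIters) {
    _aTasks = aTasks;
    _aIters = aIters;
  }

  @Override
  public Iterable<Tuple2<Long, String>> call(String serializedTask) throws Exception {
    _aTasks.add(1); // one task executed
    _aIters.add(1); // placeholder: the real worker adds the task's iteration count
    // the real worker would execute the task body and return serialized result variables
    return Collections.emptyList();
  }
}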
@Override
public MatrixBlock call(MatrixBlock arg0) throws Exception {
  _aNnz.add((double) arg0.getNonZeros());
  return arg0;
}
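For context, a sketch of the enclosing class, reconstructed from how it is used in the write instruction below (new ComputeBinaryBlockNnzFunction(aNnz) inside mapValues on a JavaPairRDD of MatrixBlock values); the constructor and imports are assumptions, and the project-specific MatrixBlock import is omitted:

import org.apache.spark.Accumulator;
import org.apache.spark.api.java.function.Function;

// pass-through mapValues function that piggybacks the nnz count on another pass over the data
public class ComputeBinaryBlockNnzFunction implements Function<MatrixBlock, MatrixBlock> {
  private static final long serialVersionUID = 1L;

  private final Accumulator<Double> _aNnz;

  public ComputeBinaryBlockNnzFunction(Accumulator<Double> aNnz) {
    _aNnz = aNnz;
  }

  @Override
  public MatrixBlock call(MatrixBlock arg0) throws Exception {
    _aNnz.add((double) arg0.getNonZeros());
    return arg0;
  }
}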
/**
 * Train on the corpus
 *
 * @param rdd the rdd to train
 * @return the vocab and weights
 */
public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
  TextPipeline pipeline = new TextPipeline(rdd);
  final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
  SparkConf conf = rdd.context().getConf();
  JavaSparkContext sc = new JavaSparkContext(rdd.context());
  vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

  final GloveWeightLookupTable gloveWeightLookupTable =
      new GloveWeightLookupTable.Builder()
          .cache(vocabAndNumWords.getFirst())
          .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
          .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
          .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
          .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
          .build();
  gloveWeightLookupTable.resetWeights();

  gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
      Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
  gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
      Nd4j.create(gloveWeightLookupTable.getSyn0().shape());

  log.info(
      "Created lookup table of size " + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));

  // count co-occurrences over the tokenized corpus
  CounterMap<String, String> coOccurrenceCounts =
      rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
          .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
          .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

  List<Triple<String, String, Double>> counts = new ArrayList<>();
  Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
  while (pairIter.hasNext()) {
    Pair<String, String> pair = pairIter.next();
    counts.add(
        new Triple<>(
            pair.getFirst(),
            pair.getSecond(),
            coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
  }

  log.info("Calculated co occurrences");

  JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
  // (word1, (word2, co-occurrence count))
  JavaPairRDD<String, Tuple2<String, Double>> pairs =
      parallel.mapToPair(
          new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
            @Override
            public Tuple2<String, Tuple2<String, Double>> call(
                Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
              return new Tuple2<>(
                  stringStringDoubleTriple.getFirst(),
                  new Tuple2<>(
                      stringStringDoubleTriple.getSecond(), stringStringDoubleTriple.getThird()));
            }
          });

  JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
      pairs.mapToPair(
          new PairFunction<
              Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
            @Override
            public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
              return new Tuple2<>(
                  vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                  new Tuple2<>(
                      vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                      stringTuple2Tuple2._2()._2()));
            }
          });

  for (int i = 0; i < iterations; i++) {
    JavaRDD<GloveChange> change =
        pairsVocab.map(
            new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
              @Override
              public GloveChange call(
                  Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                  throws Exception {
                VocabWord w1 = vocabWordTuple2Tuple2._1();
                VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                INDArray bias = gloveWeightLookupTable.getBias();
                double score = vocabWordTuple2Tuple2._2()._2();
                double xMax = gloveWeightLookupTable.getxMax();
                double maxCount = gloveWeightLookupTable.getMaxCount();

                // w1 * w2 + bias
                double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);
                double fDiff =
                    score > xMax ? prediction : weight * (prediction - Math.log(score));
                if (Double.isNaN(fDiff)) fDiff = Nd4j.EPS_THRESHOLD;

                // amount of change
                double gradient = fDiff;

                // update(w1,w1Vector,w2Vector,gradient);
                // update(w2,w2Vector,w1Vector,gradient);
                Pair<INDArray, Double> w1Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w1,
                        w1Vector,
                        w2Vector,
                        gradient);
                Pair<INDArray, Double> w2Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w2,
                        w2Vector,
                        w1Vector,
                        gradient);
                return new GloveChange(
                    w1,
                    w2,
                    w1Update.getFirst(),
                    w2Update.getFirst(),
                    w1Update.getSecond(),
                    w2Update.getSecond(),
                    fDiff);
              }
            });

    JavaRDD<Double> error =
        change.map(
            new Function<GloveChange, Double>() {
              @Override
              public Double call(GloveChange gloveChange) throws Exception {
                gloveChange.apply(gloveWeightLookupTable);
                return gloveChange.getError();
              }
            });

    final Accumulator<Double> d = sc.accumulator(0.0);
    error.foreach(
        new VoidFunction<Double>() {
          @Override
          public void call(Double aDouble) throws Exception {
            d.add(aDouble); // equivalent to the Scala += on the accumulator
          }
        });

    log.info("Error at iteration " + i + " was " + d.value());
  }

  return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
}
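For reference, fDiff computed inside the map above is the weighted residual of the GloVe objective; in this code's naming, maxCount plays the role of the cutoff x_max and xMax plays the role of the exponent alpha:

f(X_{ij}) = \min\!\left(1, \frac{X_{ij}}{x_{\max}}\right)^{\alpha},
\qquad
J = \sum_{i,j} f(X_{ij})\,\left(w_i^{\top} w_j + b_i + b_j - \log X_{ij}\right)^{2}

so gradient = fDiff = f(X_{ij}) (w_i^{\top} w_j + b_i + b_j - \log X_{ij}) is the signal passed to both AdaGrad updates.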
@Override
public void processInstruction(ExecutionContext ec)
    throws DMLRuntimeException, DMLUnsupportedOperationException {
  SparkExecutionContext sec = (SparkExecutionContext) ec;

  // get filename (literal or variable expression)
  String fname =
      ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();

  try {
    // if the file already exists on HDFS, remove it.
    MapReduceTool.deleteFileIfExistOnHDFS(fname);

    // prepare output info according to meta data
    String outFmt = input3.getName();
    OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);

    // get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 =
        sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
      // recompute nnz if necessary (required for header if matrix market)
      if (isInputMatrixBlock && !mc.nnzKnown())
        mc.setNonZeros(SparkUtils.computeNNZFromBlocks(in1));

      JavaRDD<String> header = null;
      if (outFmt.equalsIgnoreCase("matrixmarket")) {
        ArrayList<String> headerContainer = new ArrayList<String>(1);
        // First output MM header
        String headerStr =
            "%%MatrixMarket matrix coordinate real general\n"
                // output number of rows, number of columns and number of nnz
                + mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
        headerContainer.add(headerStr);
        header = sec.getSparkContext().parallelize(headerContainer);
      }

      JavaRDD<String> ijv =
          in1.flatMap(
              new ConvertMatrixBlockToIJVLines(mc.getRowsPerBlock(), mc.getColsPerBlock()));
      if (header != null) customSaveTextFile(header.union(ijv), fname, true);
      else customSaveTextFile(ijv, fname, false);
    } else if (oi == OutputInfo.CSVOutputInfo) {
      JavaRDD<String> out = null;
      Accumulator<Double> aNnz = null;

      if (isInputMatrixBlock) {
        // piggyback nnz computation on actual write
        if (!mc.nnzKnown()) {
          aNnz = sec.getSparkContext().accumulator(0L);
          in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        out =
            RDDConverterUtils.binaryBlockToCsv(
                in1, mc, (CSVFileFormatProperties) formatProperties, true);
      } else {
        // This case is applicable when the CSV output from transform() is written out
        @SuppressWarnings("unchecked")
        JavaPairRDD<Long, String> rdd =
            (JavaPairRDD<Long, String>)
                ((MatrixObject) sec.getVariable(input1.getName())).getRDDHandle().getRDD();
        out = rdd.values();

        String sep = ",";
        boolean hasHeader = false;
        if (formatProperties != null) {
          sep = ((CSVFileFormatProperties) formatProperties).getDelim();
          hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
        }

        if (hasHeader) {
          StringBuffer buf = new StringBuffer();
          for (int j = 1; j <= mc.getCols(); j++) { // one column name per column: C1..Cn
            if (j != 1) {
              buf.append(sep);
            }
            buf.append("C" + j);
          }
          ArrayList<String> headerContainer = new ArrayList<String>(1);
          headerContainer.add(0, buf.toString());
          JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
          out = header.union(out);
        }
      }

      customSaveTextFile(out, fname, false);

      if (isInputMatrixBlock && !mc.nnzKnown()) mc.setNonZeros(aNnz.value().longValue());
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
      // piggyback nnz computation on actual write
      Accumulator<Double> aNnz = null;
      if (!mc.nnzKnown()) {
        aNnz = sec.getSparkContext().accumulator(0L);
        in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
      }

      // save binary block rdd on hdfs
      in1.saveAsHadoopFile(
          fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

      if (!mc.nnzKnown()) mc.setNonZeros(aNnz.value().longValue());
    } else {
      // unsupported formats: binarycell (not externalized)
      throw new DMLRuntimeException("Unexpected data format: " + outFmt);
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
  } catch (IOException ex) {
    throw new DMLRuntimeException("Failed to process write instruction", ex);
  }
}
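The nnz accumulator above is read only after the save has run, because updates made inside lazy transformations are guaranteed only once an action forces the computation. A minimal, self-contained sketch of that pattern (hypothetical class name and toy data, Spark 1.x accumulator API):

import java.util.Arrays;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class NnzPiggybackSketch {
  public static void main(String[] args) {
    JavaSparkContext sc =
        new JavaSparkContext(new SparkConf().setAppName("nnz-sketch").setMaster("local[*]"));

    final Accumulator<Double> aNnz = sc.accumulator(0.0);
    JavaRDD<double[]> blocks =
        sc.parallelize(Arrays.asList(new double[] {1, 0, 2}, new double[] {0, 0, 3}));

    // pass-through map that piggybacks the nnz count, mirroring ComputeBinaryBlockNnzFunction
    JavaRDD<double[]> counted =
        blocks.map(
            new Function<double[], double[]>() {
              @Override
              public double[] call(double[] block) throws Exception {
                long nnz = 0;
                for (double v : block) if (v != 0) nnz++;
                aNnz.add((double) nnz);
                return block;
              }
            });

    counted.count(); // the action forces the map, and with it the accumulator updates
    System.out.println("nnz = " + aNnz.value().longValue()); // safe to read only now
    sc.stop();
  }
}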