/**
   * Runs a data-partitioned parfor job on Spark.
   *
   * <p>The binary-block RDD registered under {@code matrixvar} is passed through a
   * {@link DataPartitionerRemoteSparkMapper}, grouped by key into {@code numReducers} partitions
   * and handed to a {@link RemoteDPParForSparkWorker}. The collected output handles are then
   * de-serialized into local variable maps and returned together with the task and iteration
   * counts gathered in two Spark accumulators.
   *
   * @param pfid parfor id (not referenced in this method body)
   * @param itervar name of the iteration variable, forwarded to the worker
   * @param matrixvar name of the partitioned matrix variable; used to look up the input RDD and
   *     forwarded to the worker
   * @param program serialized parfor program, forwarded to the worker
   * @param resultFile result file name (not referenced in this method body)
   * @param input matrix object whose meta data supplies the matrix characteristics
   * @param ec execution context; must be a {@link SparkExecutionContext}
   * @param dpf data partition format, forwarded to the partitioner and the worker
   * @param oi output info, forwarded to the partitioner and the worker
   * @param tSparseCol flag forwarded to the worker
   * @param enableCPCaching flag forwarded to the worker
   * @param numReducers number of partitions used by the group-by-key step
   * @return job return object flagged successful, holding task count, iteration count and the
   *     de-serialized results
   * @throws DMLRuntimeException declared by the signature
   * @throws DMLUnsupportedOperationException declared by the signature
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String itervar,
      String matrixvar,
      String program,
      String resultFile,
      MatrixObject input,
      ExecutionContext ec,
      PDataPartitionFormat dpf,
      OutputInfo oi,
      boolean tSparseCol, // config params
      boolean enableCPCaching,
      int numReducers) // opt params
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    // a start timestamp is only taken when statistics are being collected
    final long startNanos;
    if (DMLScript.STATISTICS) {
      startNanos = System.nanoTime();
    } else {
      startNanos = 0;
    }

    SparkExecutionContext sparkEc = (SparkExecutionContext) ec;
    JavaSparkContext sparkCtx = sparkEc.getSparkContext();

    // characteristics of the input matrix and the input format handed to the partitioner
    MatrixCharacteristics inputDims =
        ((MatrixDimensionsMetaData) input.getMetaData()).getMatrixCharacteristics();
    InputInfo inputFormat = InputInfo.BinaryBlockInputInfo;

    // counters for executed tasks and iterations
    Accumulator<Integer> taskCounter = sparkCtx.accumulator(0);
    Accumulator<Integer> iterCounter = sparkCtx.accumulator(0);

    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
        sparkEc.getBinaryBlockRDDHandleForVariable(matrixvar);
    DataPartitionerRemoteSparkMapper partitioner =
        new DataPartitionerRemoteSparkMapper(inputDims, inputFormat, oi, dpf);
    RemoteDPParForSparkWorker worker =
        new RemoteDPParForSparkWorker(
            program,
            matrixvar,
            itervar,
            enableCPCaching,
            inputDims,
            tSparseCol,
            dpf,
            oi,
            taskCounter,
            iterCounter);

    // partition the input blocks, group them, run the parfor tasks, fetch the output handles
    List<Tuple2<Long, String>> handles =
        blocks
            .flatMapToPair(partitioner)
            .groupByKey(numReducers)
            .mapPartitionsToPair(worker)
            .collect();

    // turn the output handles back into variable maps and read the counters
    LocalVariableMap[] results = RemoteParForUtils.getResults(handles, LOG);
    int taskCount = taskCounter.value();
    int iterCount = iterCounter.value();

    RemoteParForJobReturn jobReturn =
        new RemoteParForJobReturn(true, taskCount, iterCount, results);

    // bookkeeping: instruction counters always, timing only with statistics enabled
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters("ParFor-DPESP", System.nanoTime() - startNanos);
    }

    return jobReturn;
  }
Esempio n. 2
0
  /**
   * Runs a parfor job on Spark over an explicit list of tasks.
   *
   * <p>The tasks are parallelized into {@code numMappers} slices and executed by a
   * {@link RemoteParForSparkWorker}. The job is collected eagerly; the resulting output handles
   * are de-serialized into local variable maps and returned together with the task and iteration
   * counts gathered in two Spark accumulators.
   *
   * @param pfid parfor id (not referenced in this method body)
   * @param program serialized parfor program, forwarded to the worker
   * @param tasks parfor tasks to distribute
   * @param ec execution context; must be a {@link SparkExecutionContext}
   * @param cpCaching flag forwarded to the worker
   * @param numMappers number of slices used when parallelizing the task list
   * @return job return object flagged successful, holding task count, iteration count and the
   *     de-serialized results
   * @throws DMLRuntimeException declared by the signature
   * @throws DMLUnsupportedOperationException declared by the signature
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      List<Task> tasks,
      ExecutionContext ec,
      boolean cpCaching,
      int numMappers)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    // a start timestamp is only taken when statistics are being collected
    final long startNanos;
    if (DMLScript.STATISTICS) {
      startNanos = System.nanoTime();
    } else {
      startNanos = 0;
    }

    JavaSparkContext sparkCtx = ((SparkExecutionContext) ec).getSparkContext();

    // counters for executed tasks and iterations
    Accumulator<Integer> taskCounter = sparkCtx.accumulator(0);
    Accumulator<Integer> iterCounter = sparkCtx.accumulator(0);

    // execute eagerly (no lazy evaluation) so the existing parfor framework,
    // e.g. result merge, can consume the output right away
    RemoteParForSparkWorker worker =
        new RemoteParForSparkWorker(program, cpCaching, taskCounter, iterCounter);
    List<Tuple2<Long, String>> handles =
        sparkCtx.parallelize(tasks, numMappers).flatMapToPair(worker).collect();

    // turn the output handles back into variable maps and read the counters
    LocalVariableMap[] results = RemoteParForUtils.getResults(handles, LOG);
    int taskCount = taskCounter.value();
    int iterCount = iterCounter.value();

    RemoteParForJobReturn jobReturn =
        new RemoteParForJobReturn(true, taskCount, iterCount, results);

    // bookkeeping: instruction counters always, timing only with statistics enabled
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters("ParFor-ESP", System.nanoTime() - startNanos);
    }

    return jobReturn;
  }
Esempio n. 3
0
  /**
   * Checks that {@code Dataset.foreach} calls the supplied function once per element by counting
   * the calls in an accumulator and comparing the total with the number of input elements.
   */
  @Test
  public void testForeach() {
    List<String> letters = Arrays.asList("a", "b", "c");
    final Accumulator<Integer> visited = jsc.accumulator(0);
    Dataset<String> letterSet = context.createDataset(letters, Encoders.STRING());

    // the function ignores the element itself and only bumps the counter
    ForeachFunction<String> countElement =
        new ForeachFunction<String>() {
          @Override
          public void call(String element) throws Exception {
            visited.add(1);
          }
        };
    letterSet.foreach(countElement);

    int observed = visited.value().intValue();
    Assert.assertEquals(3, observed);
  }
Esempio n. 4
0
  /**
   * Trains GloVe on the given corpus.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Build the vocabulary with a {@link TextPipeline} and broadcast it.
   *   <li>Build the weight lookup table from the Spark configuration, falling back to the
   *       defaults {@code 0.025} (learning rate), {@code 100} (max count), {@code 300} (vector
   *       length) and {@code 0.75} (x max), then reset its weights and AdaGrad history.
   *   <li>Tokenize the corpus and fold the per-document co-occurrence counts into one
   *       {@link CounterMap}.
   *   <li>Turn every counted {@code (word1, word2, count)} triple into a pair of vocabulary
   *       words with its count.
   *   <li>For {@code iterations} rounds, compute an update for every pair, apply it to the
   *       lookup table and log the summed error.
   * </ol>
   *
   * @param rdd the corpus to train on, one text per element
   * @return the vocabulary and the trained lookup table
   */
  public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
    TextPipeline pipeline = new TextPipeline(rdd);
    final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
    SparkConf conf = rdd.context().getConf();
    JavaSparkContext sc = new JavaSparkContext(rdd.context());
    vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

    final GloveWeightLookupTable gloveWeightLookupTable =
        new GloveWeightLookupTable.Builder()
            .cache(vocabAndNumWords.getFirst())
            .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
            .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
            .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
            .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
            .build();
    gloveWeightLookupTable.resetWeights();

    // start AdaGrad with empty gradient history: one entry per row for the bias,
    // one entry per weight for syn0
    gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
        Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
    gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
        Nd4j.create(gloveWeightLookupTable.getSyn0().shape());

    log.info(
        "Created lookup table of size "
            + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));

    // tokenize, count co-occurrences per element, and merge all counts into a single map
    CounterMap<String, String> coOccurrenceCounts =
        rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
            .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
            .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

    // flatten the counter map into (word1, word2, count) triples
    List<Triple<String, String, Double>> counts = new ArrayList<>();
    Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
    while (pairIter.hasNext()) {
      Pair<String, String> pair = pairIter.next();
      counts.add(
          new Triple<>(
              pair.getFirst(),
              pair.getSecond(),
              coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
    }

    log.info("Calculated co occurrences");

    JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
    JavaPairRDD<String, Tuple2<String, Double>> pairs =
        parallel.mapToPair(
            new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
              @Override
              public Tuple2<String, Tuple2<String, Double>> call(
                  Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
                // key: first word; value: (second word, co-occurrence count).
                // The second word must come from getSecond(); using the first word twice
                // would pair every word with itself and discard the co-occurring word.
                return new Tuple2<>(
                    stringStringDoubleTriple.getFirst(),
                    new Tuple2<>(
                        stringStringDoubleTriple.getSecond(), stringStringDoubleTriple.getThird()));
              }
            });

    // resolve both words of every pair to their vocabulary entries
    JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
        pairs.mapToPair(
            new PairFunction<
                Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
              @Override
              public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                  Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
                return new Tuple2<>(
                    vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                    new Tuple2<>(
                        vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                        stringTuple2Tuple2._2()._2()));
              }
            });

    for (int i = 0; i < iterations; i++) {

      // compute the update for every (word1, word2, count) sample
      JavaRDD<GloveChange> change =
          pairsVocab.map(
              new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
                @Override
                public GloveChange call(
                    Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                    throws Exception {
                  VocabWord w1 = vocabWordTuple2Tuple2._1();
                  VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                  INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                  INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                  INDArray bias = gloveWeightLookupTable.getBias();
                  double score = vocabWordTuple2Tuple2._2()._2();
                  double xMax = gloveWeightLookupTable.getxMax();
                  double maxCount = gloveWeightLookupTable.getMaxCount();
                  // prediction = w1 . w2 + bias(w1) + bias(w2)
                  double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                  prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                  double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);

                  // NOTE(review): the branch compares the count against xMax (used as the
                  // exponent above) and skips the log term when it is larger - kept as is,
                  // confirm against the intended GloVe cost function.
                  double fDiff =
                      score > xMax ? prediction : weight * (prediction - Math.log(score));
                  // guard against NaN, e.g. from the logarithm
                  if (Double.isNaN(fDiff)) {
                    fDiff = Nd4j.EPS_THRESHOLD;
                  }

                  // fDiff is both the gradient passed to the updates and the reported error
                  Pair<INDArray, Double> w1Update =
                      update(
                          gloveWeightLookupTable.getWeightAdaGrad(),
                          gloveWeightLookupTable.getBiasAdaGrad(),
                          gloveWeightLookupTable.getSyn0(),
                          gloveWeightLookupTable.getBias(),
                          w1,
                          w1Vector,
                          w2Vector,
                          fDiff);
                  Pair<INDArray, Double> w2Update =
                      update(
                          gloveWeightLookupTable.getWeightAdaGrad(),
                          gloveWeightLookupTable.getBiasAdaGrad(),
                          gloveWeightLookupTable.getSyn0(),
                          gloveWeightLookupTable.getBias(),
                          w2,
                          w2Vector,
                          w1Vector,
                          fDiff);
                  return new GloveChange(
                      w1,
                      w2,
                      w1Update.getFirst(),
                      w2Update.getFirst(),
                      w1Update.getSecond(),
                      w2Update.getSecond(),
                      fDiff);
                }
              });

      // apply every change to the lookup table and keep its error.
      // NOTE(review): the table is captured by the closure and mutated inside a Spark task;
      // presumably this only reaches the driver's copy when running in local mode - confirm.
      JavaRDD<Double> error =
          change.map(
              new Function<GloveChange, Double>() {
                @Override
                public Double call(GloveChange gloveChange) throws Exception {
                  gloveChange.apply(gloveWeightLookupTable);
                  return gloveChange.getError();
                }
              });

      // sum the per-sample errors of this iteration in an accumulator
      final Accumulator<Double> d = sc.accumulator(0.0);
      error.foreach(
          new VoidFunction<Double>() {
            @Override
            public void call(Double aDouble) throws Exception {
              d.add(aDouble);
            }
          });

      log.info("Error at iteration " + i + " was " + d.value());
    }

    return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
  }
  /**
   * Writes the matrix bound to the first input operand to the file named by the second operand,
   * using the output format named by the third operand, and then writes the accompanying
   * {@code .mtd} meta data file.
   *
   * <p>An existing file of the same name is deleted first. Supported formats are matrix market
   * and text cell (written as IJV lines, matrix market with a leading header), CSV, and binary
   * block. Where the number of non-zeros is unknown it is either computed up front (matrix
   * market / text cell) or collected in an accumulator while the data is written (CSV from
   * matrix blocks, binary block).
   *
   * @param ec execution context; must be a {@link SparkExecutionContext}
   * @throws DMLRuntimeException if the output format is not supported or an {@link IOException}
   *     occurs while writing
   * @throws DMLUnsupportedOperationException declared by the signature
   */
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get filename (literal or variable expression)
    String fname =
        ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();

    try {
      // if the file already exists on HDFS, remove it.
      MapReduceTool.deleteFileIfExistOnHDFS(fname);

      // prepare output info according to meta data
      String outFmt = input3.getName();
      OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);

      // get input rdd
      JavaPairRDD<MatrixIndexes, MatrixBlock> in1 =
          sec.getBinaryBlockRDDHandleForVariable(input1.getName());
      MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

      if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        // recompute nnz if necessary (required for header if matrix market)
        if (isInputMatrixBlock && !mc.nnzKnown()) {
          mc.setNonZeros(SparkUtils.computeNNZFromBlocks(in1));
        }

        JavaRDD<String> header = null;
        if (outFmt.equalsIgnoreCase("matrixmarket")) {
          ArrayList<String> headerContainer = new ArrayList<String>(1);
          // First output MM header
          String headerStr =
              "%%MatrixMarket matrix coordinate real general\n"
                  +
                  // output number of rows, number of columns and number of nnz
                  mc.getRows()
                  + " "
                  + mc.getCols()
                  + " "
                  + mc.getNonZeros();
          headerContainer.add(headerStr);
          header = sec.getSparkContext().parallelize(headerContainer);
        }

        // one "i j v" text line per cell
        JavaRDD<String> ijv =
            in1.flatMap(
                new ConvertMatrixBlockToIJVLines(mc.getRowsPerBlock(), mc.getColsPerBlock()));
        if (header != null) {
          customSaveTextFile(header.union(ijv), fname, true);
        } else {
          customSaveTextFile(ijv, fname, false);
        }
      } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        Accumulator<Double> aNnz = null;

        if (isInputMatrixBlock) {
          // piggyback nnz computation on actual write
          if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().accumulator(0L);
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
          }

          out =
              RDDConverterUtils.binaryBlockToCsv(
                  in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
          // This case is applicable when the CSV output from transform() is written out
          @SuppressWarnings("unchecked")
          JavaPairRDD<Long, String> rdd =
              (JavaPairRDD<Long, String>)
                  ((MatrixObject) sec.getVariable(input1.getName())).getRDDHandle().getRDD();
          out = rdd.values();

          String sep = ",";
          boolean hasHeader = false;
          if (formatProperties != null) {
            sep = ((CSVFileFormatProperties) formatProperties).getDelim();
            hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
          }

          if (hasHeader) {
            // header line "C1<sep>C2<sep>...<sep>Cn"; the bound is inclusive so that the
            // last column gets a label as well
            StringBuilder buf = new StringBuilder();
            for (int j = 1; j <= mc.getCols(); j++) {
              if (j != 1) {
                buf.append(sep);
              }
              buf.append("C").append(j);
            }
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            headerContainer.add(0, buf.toString());
            JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
            out = header.union(out);
          }
        }

        customSaveTextFile(out, fname, false);

        // the accumulator is only populated once the write above has run
        if (isInputMatrixBlock && !mc.nnzKnown()) {
          mc.setNonZeros(aNnz.value().longValue());
        }
      } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        // piggyback nnz computation on actual write
        Accumulator<Double> aNnz = null;
        if (!mc.nnzKnown()) {
          aNnz = sec.getSparkContext().accumulator(0L);
          in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        // save binary block rdd on hdfs
        in1.saveAsHadoopFile(
            fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

        // the accumulator is only populated once the write above has run
        if (!mc.nnzKnown()) {
          mc.setNonZeros(aNnz.value().longValue());
        }
      } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + outFmt);
      }

      // write meta data file
      MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
    } catch (IOException ex) {
      throw new DMLRuntimeException("Failed to process write instruction", ex);
    }
  }