/**
   * Saves an RDD of text lines under {@code fname}.
   *
   * <p>If {@code inSingleFile} is {@code false}, the RDD is saved directly under {@code fname}.
   * Otherwise the RDD is first saved under a temporary name ({@code fname} plus two random
   * suffixes, re-drawn until the name does not exist yet), then merged into {@code fname} via
   * {@code MapReduceTool.mergeIntoSingleFile}, and finally the temporary output is deleted.
   *
   * @param rdd text lines to write
   * @param fname name of the final output file
   * @param inSingleFile whether the output has to be merged into one single file
   * @throws DMLRuntimeException if checking, merging, or cleaning up the temporary output fails
   *     with an {@code IOException}; that exception is attached as the cause
   */
  private void customSaveTextFile(JavaRDD<String> rdd, String fname, boolean inSingleFile)
      throws DMLRuntimeException {
    if (!inSingleFile) {
      rdd.saveAsTextFile(fname);
      return;
    }

    Random rand = new Random();
    String randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong();
    try {
      // re-draw the temporary name until it does not collide with an existing file
      while (MapReduceTool.existsFileOnHDFS(randFName)) {
        randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong();
      }

      // Save the parts under the temporary name and merge them afterwards; the original author
      // marked this as the faster alternative to coalescing the RDD into one partition.
      rdd.saveAsTextFile(randFName);
      MapReduceTool.mergeIntoSingleFile(randFName, fname);
    } catch (IOException e) {
      // keep the IOException as cause so the stack trace of the real failure is not lost
      throw new DMLRuntimeException(
          "Cannot merge the output into single file: " + e.getMessage(), e);
    } finally {
      try {
        // make sure no randomly named temporary files are left behind
        // NOTE(review): an exception thrown here replaces any exception from the try block above
        MapReduceTool.deleteFileIfExistOnHDFS(randFName);
      } catch (IOException e) {
        throw new DMLRuntimeException(
            "Cannot merge the output into single file: " + e.getMessage(), e);
      }
    }
  }
// Example no. 2
// 0
  /**
   * Writes {@code src} to {@code fname} by delegating to {@code writeTextCellMatrixToHDFS}; a file
   * already present under {@code fname} is deleted first.
   *
   * <p>{@code brlen}, {@code bclen} and {@code nnz} are part of the signature but are not used by
   * this implementation.
   *
   * @param src matrix block to write
   * @param fname name of the output file
   * @param rlen row count passed on to the writer
   * @param clen column count passed on to the writer
   * @param brlen unused here
   * @param bclen unused here
   * @param nnz unused here
   */
  @Override
  public void writeMatrixToHDFS(
      MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
      throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    // set up job configuration and target path
    final JobConf conf = new JobConf(ConfigurationManager.getCachedJobConf());
    final Path target = new Path(fname);

    // an already existing output file is removed before writing
    MapReduceTool.deleteFileIfExistOnHDFS(fname);

    // actual write
    writeTextCellMatrixToHDFS(target, conf, src, rlen, clen);
  }
// Example no. 3
// 0
  /**
   * Configures this instance from the given job: delegates to the superclass, rejects more than
   * one result index, sets up the value buffers, and sizes the output cache.
   *
   * @param job job configuration
   * @throws RuntimeException if more than one result index is configured, or if setting up the
   *     buffers fails
   */
  public void configure(JobConf job) {
    super.configure(job);
    if (resultIndexes.length > 1) {
      throw new RuntimeException("MMCJMR only outputs one result");
    }

    // dummy records are emitted only when the unique task key equals "0"
    outputDummyRecords = MapReduceTool.getUniqueKeyPerTask(job, false).equals("0");

    try {
      // the value buffer aliases the existing buffer instead of a fresh instance
      valueBuffer = buffer;
      remainingbuffer = new RemainIndexValue(valueClass);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    final int rowsPerBlock = dim1.getRowsPerBlock();
    final int colsPerBlock = dim2.getColsPerBlock();

    // NOTE(review): 77, 20 and 12 look like fixed per-entry overheads and 0.75 like a hash map
    // load factor -- confirm against the cache implementation
    final int rawEntrySize = 77 + 8 * rowsPerBlock * colsPerBlock + 20 + 12;
    final int elementSize = (int) Math.ceil((double) rawEntrySize / 0.75);

    // current jvm max mem, reduced by the MMCJ cache size, divided by the per-entry size
    final long localBudget = (long) OptimizerUtils.getLocalMemBudget();
    OUT_CACHE_SIZE = (localBudget - MRJobConfiguration.getMMCJCacheSize(job)) / elementSize;
    outCache = new HashMap<MatrixIndexes, MatrixValue>(1024);
  }
  /**
   * Executes the write instruction: writes the matrix bound to {@code input1} to the file named by
   * {@code input2}, in the output format named by {@code input3}, and then writes the
   * accompanying {@code .mtd} metadata file.
   *
   * <p>Handled formats:
   *
   * <ul>
   *   <li>matrix market / text cell: blocks are converted to IJV lines; for matrix market a header
   *       line is prepended and the output is merged into a single file;
   *   <li>CSV: binary blocks are converted to CSV lines, or, when the input is not a matrix block,
   *       the already textual values of the RDD are written (optionally with a generated header
   *       {@code C1,...,Cn});
   *   <li>binary block: the block RDD is saved as a sequence file.
   * </ul>
   *
   * Where the number of non-zeros is unknown it is computed (up front for matrix market / text
   * cell, piggybacked on the write otherwise) and stored in the matrix characteristics.
   *
   * @param ec execution context; cast to {@code SparkExecutionContext}
   * @throws DMLRuntimeException on an unsupported output format or if an {@code IOException}
   *     occurs while writing
   */
  @Override
  public void processInstruction(ExecutionContext ec)
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get filename (literal or variable expression)
    String fname =
        ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();

    try {
      // if the file already exists on HDFS, remove it.
      MapReduceTool.deleteFileIfExistOnHDFS(fname);

      // prepare output info according to meta data
      String outFmt = input3.getName();
      OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);

      // get input rdd
      JavaPairRDD<MatrixIndexes, MatrixBlock> in1 =
          sec.getBinaryBlockRDDHandleForVariable(input1.getName());
      MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

      if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        // recompute nnz if necessary (required for header if matrix market)
        if (isInputMatrixBlock && !mc.nnzKnown()) {
          mc.setNonZeros(SparkUtils.computeNNZFromBlocks(in1));
        }

        JavaRDD<String> header = null;
        if (outFmt.equalsIgnoreCase("matrixmarket")) {
          ArrayList<String> headerContainer = new ArrayList<String>(1);
          // MM header: banner line, then number of rows, number of columns and number of nnz
          String headerStr =
              "%%MatrixMarket matrix coordinate real general\n"
                  + mc.getRows()
                  + " "
                  + mc.getCols()
                  + " "
                  + mc.getNonZeros();
          headerContainer.add(headerStr);
          header = sec.getSparkContext().parallelize(headerContainer);
        }

        JavaRDD<String> ijv =
            in1.flatMap(
                new ConvertMatrixBlockToIJVLines(mc.getRowsPerBlock(), mc.getColsPerBlock()));
        if (header != null) {
          // with a header the output is merged into one single file
          customSaveTextFile(header.union(ijv), fname, true);
        } else {
          customSaveTextFile(ijv, fname, false);
        }
      } else if (oi == OutputInfo.CSVOutputInfo) {
        JavaRDD<String> out = null;
        Accumulator<Double> aNnz = null;

        if (isInputMatrixBlock) {
          // piggyback nnz computation on actual write
          if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().accumulator(0L);
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
          }

          out =
              RDDConverterUtils.binaryBlockToCsv(
                  in1, mc, (CSVFileFormatProperties) formatProperties, true);
        } else {
          // This case is applicable when the CSV output from transform() is written out
          @SuppressWarnings("unchecked")
          JavaPairRDD<Long, String> rdd =
              (JavaPairRDD<Long, String>)
                  ((MatrixObject) sec.getVariable(input1.getName())).getRDDHandle().getRDD();
          out = rdd.values();

          String sep = ",";
          boolean hasHeader = false;
          if (formatProperties != null) {
            sep = ((CSVFileFormatProperties) formatProperties).getDelim();
            hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
          }

          if (hasHeader) {
            // Generate one label per column: C1, C2, ..., C<cols>. The upper bound is inclusive
            // because the labels are 1-based; an exclusive bound would drop the last column.
            final long numCols = mc.getCols();
            StringBuilder buf = new StringBuilder();
            for (long j = 1; j <= numCols; j++) {
              if (j != 1) {
                buf.append(sep);
              }
              buf.append("C").append(j);
            }
            ArrayList<String> headerContainer = new ArrayList<String>(1);
            headerContainer.add(0, buf.toString());
            JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
            out = header.union(out);
          }
        }

        customSaveTextFile(out, fname, false);

        // the accumulator exists only if nnz had to be computed during the write
        if (aNnz != null) {
          mc.setNonZeros(aNnz.value().longValue());
        }
      } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        // piggyback nnz computation on actual write
        Accumulator<Double> aNnz = null;
        if (!mc.nnzKnown()) {
          aNnz = sec.getSparkContext().accumulator(0L);
          in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }

        // save binary block rdd on hdfs
        in1.saveAsHadoopFile(
            fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);

        // the accumulator exists only if nnz had to be computed during the write
        if (aNnz != null) {
          mc.setNonZeros(aNnz.value().longValue());
        }
      } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + outFmt);
      }

      // write meta data file
      MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
    } catch (IOException ex) {
      throw new DMLRuntimeException("Failed to process write instruction", ex);
    }
  }
  /**
   * Runs one min/max comparison test: generates the inputs A and B, executes the DML script and
   * the R script, and compares their results for output C. The runtime platform is switched for
   * the duration of the run and restored afterwards.
   *
   * @param type operation under test; {@code OpType.MIN} is passed to the scripts as flag 1, any
   *     other value as 0
   * @param dtM1 data type of the first input; a matrix uses {@code rows} x {@code cols}, anything
   *     else 1 x 1
   * @param dtM2 data type of the second input, handled like {@code dtM1}
   * @param sparseM1 if set, the first input is generated with {@code sparsity2} instead of
   *     {@code sparsity1}
   * @param sparseM2 if set, the second input is generated with {@code sparsity2} instead of
   *     {@code sparsity1}
   * @param instType {@code ExecType.MR} selects the HADOOP platform, anything else HYBRID
   */
  private void runMinMaxComparisonTest(
      OpType type,
      DataType dtM1,
      DataType dtM2,
      boolean sparseM1,
      boolean sparseM2,
      ExecType instType) {
    // remember the current platform and switch to the one requested for this run
    final RUNTIME_PLATFORM previousPlatform = rtplatform;
    if (instType == ExecType.MR) {
      rtplatform = RUNTIME_PLATFORM.HADOOP;
    } else {
      rtplatform = RUNTIME_PLATFORM.HYBRID;
    }

    final int minFlag = (type == OpType.MIN) ? 1 : 0;
    final boolean leftIsScalar = (dtM1 == DataType.SCALAR);
    final boolean rightIsScalar = (dtM2 == DataType.SCALAR);

    // the script is picked by which of the two operands are scalars
    final String testName =
        leftIsScalar
            ? (rightIsScalar ? TEST_NAME4 : TEST_NAME2)
            : (rightIsScalar ? TEST_NAME3 : TEST_NAME1);

    String cacheDir = "";
    if (TEST_CACHE_ENABLED) {
      final int cachedRows1 = (dtM1 == DataType.MATRIX) ? rows : 1;
      final int cachedRows2 = (dtM2 == DataType.MATRIX) ? rows : 1;
      final double leftSparsity = sparseM1 ? sparsity2 : sparsity1;
      final double rightSparsity = sparseM2 ? sparsity2 : sparsity1;

      cacheDir =
          minFlag
              + "_"
              + cachedRows1
              + "_"
              + cachedRows2
              + "_"
              + leftSparsity
              + "_"
              + rightSparsity
              + "/";
    }

    try {
      loadTestConfiguration(getTestConfiguration(testName), cacheDir);

      // junit test the new way: the script arguments are constructed directly
      final String home = SCRIPT_DIR + TEST_DIR;
      fullDMLScriptName = home + testName + ".dml";
      programArgs =
          new String[] {
            "-explain", "-args", input("A"), input("B"), String.valueOf(minFlag), output("C")
          };

      fullRScriptName = home + TEST_NAME_R + ".R";
      rCmd =
          "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + minFlag + " "
              + expectedDir();

      // dimensions of both operands: full size for matrices, 1 x 1 otherwise
      final int numRows1 = (dtM1 == DataType.MATRIX) ? rows : 1;
      final int numCols1 = (dtM1 == DataType.MATRIX) ? cols : 1;
      final int numRows2 = (dtM2 == DataType.MATRIX) ? rows : 1;
      final int numCols2 = (dtM2 == DataType.MATRIX) ? cols : 1;

      // first operand: data plus meta data file
      final double[][] dataA =
          getRandomMatrix(numRows1, numCols1, -1, 1, sparseM1 ? sparsity2 : sparsity1, 7);
      writeInputMatrix("A", dataA, true);
      final MatrixCharacteristics metaA =
          new MatrixCharacteristics(numRows1, numCols1, 1000, 1000);
      MapReduceTool.writeMetaDataFile(
          input("A.mtd"), ValueType.DOUBLE, metaA, OutputInfo.TextCellOutputInfo);

      // second operand: data plus meta data file
      final double[][] dataB =
          getRandomMatrix(numRows2, numCols2, -1, 1, sparseM2 ? sparsity2 : sparsity1, 3);
      writeInputMatrix("B", dataB, true);
      final MatrixCharacteristics metaB =
          new MatrixCharacteristics(numRows2, numCols2, 1000, 1000);
      MapReduceTool.writeMetaDataFile(
          input("B.mtd"), ValueType.DOUBLE, metaB, OutputInfo.TextCellOutputInfo);

      // execute the DML script and the R script
      runTest(true, false, null, -1);
      runRScript(true);

      // both results have to agree within eps
      final HashMap<CellIndex, Double> dmlResult = readDMLMatrixFromHDFS("C");
      final HashMap<CellIndex, Double> rResult = readRMatrixFromFS("C");
      TestUtils.compareMatrices(dmlResult, rResult, eps, "Stat-DML", "Stat-R");
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    } finally {
      // restore the platform that was active before this test
      rtplatform = previousPlatform;
    }
  }