@Override
  public final void writeMatrixToHDFS(
      MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
      throws IOException, DMLRuntimeException {
    // validity check of matrix dimensions against metadata
    if (src.getNumRows() != rlen || src.getNumColumns() != clen) {
      throw new IOException(
          "Matrix dimensions mismatch with metadata: "
              + src.getNumRows()
              + "x"
              + src.getNumColumns()
              + " vs "
              + rlen
              + "x"
              + clen
              + ".");
    }

    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    // if the file already exists on HDFS, remove it.
    MapReduceTool.deleteFileIfExistOnHDFS(fname);

    // core write (sequential/parallel)
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
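  // Hypothetical usage sketch (writer instantiation is assumed; note that the
  // CSV write path ignores the block sizes brlen/bclen, which are not passed on):
  //   MatrixBlock mb = ...;
  //   writer.writeMatrixToHDFS(mb, "hdfs:/tmp/A.csv", mb.getNumRows(),
  //       mb.getNumColumns(), -1, -1, mb.getNonZeros());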
  /**
   * @param path source path on HDFS
   * @param job job configuration
   * @param fs file system handle
   * @param dest output collection of indexed matrix values
   * @param rlen number of rows in the overall matrix
   * @param clen number of columns in the overall matrix
   * @param brlen block row length
   * @param bclen block column length
   * @throws IOException if the read fails or a block is out of range
   */
  @SuppressWarnings("deprecation")
  private void readBinaryBlockMatrixBlocksFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      Collection<IndexedMatrixValue> dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException {
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
    {
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        while (reader.next(key, value)) {
          int row_offset = (int) (key.getRowIndex() - 1) * brlen;
          int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
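          // e.g., with brlen=bclen=1000, the 1-based block index (2,3) yields
          // row_offset=1000 and col_offset=2000 (0-based offsets)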
          int rows = value.getNumRows();
          int cols = value.getNumColumns();

          // bound check per block
          if (row_offset + rows < 0
              || row_offset + rows > rlen
              || col_offset + cols < 0
              || col_offset + cols > clen) {
            throw new IOException(
                "Matrix block ["
                    + (row_offset + 1)
                    + ":"
                    + (row_offset + rows)
                    + ","
                    + (col_offset + 1)
                    + ":"
                    + (col_offset + cols)
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // copy block to result
          dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }
  }
  @Override
  public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
      throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    MatrixBlock src = new MatrixBlock((int) rlen, 1, true);
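    // note: an rlen x 1 empty block (above) suffices, since the CSV writer
    // emits one output line per row of the block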
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
  /**
   * Adds a header line to a CSV file (or directory of CSV part files) on HDFS.
   *
   * @param srcFileName source file (or directory of part files) on HDFS
   * @param destFileName destination file on HDFS
   * @param rlen number of rows in the matrix
   * @param clen number of columns in the matrix
   * @throws IOException if the header cannot be added
   */
  public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
      // simply move srcFile to destFile

      /*
       * TODO: Remove this roundabout way!
       * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
       *              & the only path that exists already on HDFS is /user/biadmin/csv/.
       * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
       * Simple hdfs.rename() does not seem to create this directory structure.
       */
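      // A possible simplification (untested sketch): create the parent
      // directory explicitly, then rename:
      //   hdfs.mkdirs(destFilePath.getParent());
      //   hdfs.rename(srcFilePath, destFilePath);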

      // delete the destination file, if exists already
      // boolean ret1 =
      hdfs.delete(destFilePath, true);

      // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
      // boolean ret2 =
      hdfs.createNewFile(destFilePath);

      // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
      // boolean ret3 =
      hdfs.delete(destFilePath, true);

      // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
      // boolean ret4 =
      hdfs.rename(srcFilePath, destFilePath);

      // System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3
      // + ", rename:" + ret4);
      return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1) sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

      // compute sorted order among part files
      ArrayList<Path> files = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
        files.add(stat.getPath());
      Collections.sort(files);

      // first part file path
      Path firstpart = files.get(0);

      // create a temp file, and add header and contents of first part
      Path tmp = new Path(firstpart.toString() + ".tmp");
      OutputStream out = hdfs.create(tmp, true);
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy rest of the data from firstpart
      InputStream in = null;
      try {
        in = hdfs.open(firstpart);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }

      // rename tmp to firstpart
      hdfs.delete(firstpart, true);
      hdfs.rename(tmp, firstpart);

      // rename srcfile to destFile
      hdfs.delete(destFilePath, true);
      hdfs.createNewFile(destFilePath); // force the creation of directory structure
      hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
      hdfs.rename(srcFilePath, destFilePath); // move the data

    } else if (hdfs.isFile(srcFilePath)) {
      // create destination file
      OutputStream out = hdfs.create(destFilePath, true);

      // write header
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy the data from srcFile
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
  /**
   * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS. The part files
   * are created by the CSV_WRITE MR job.
   *
   * <p>This method is invoked from the CP-write instruction.
   *
   * @param srcFileName source directory (or file) of part files on HDFS
   * @param destFileName destination file for the merged output
   * @param csvprop CSV format properties (header, delimiter, etc.)
   * @param rlen number of rows in the overall matrix
   * @param clen number of columns in the overall matrix
   * @throws IOException if the merge fails
   */
  public final void mergeCSVPartFiles(
      String srcFileName,
      String destFileName,
      CSVFileFormatProperties csvprop,
      long rlen,
      long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
      hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1) sb.append(csvprop.getDelim());
      }
      sb.append('\n');
      out.write(sb.toString().getBytes());
      sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
      try {
        FileStatus[] contents = hdfs.listStatus(srcFilePath);
        Path[] partPaths = new Path[contents.length];
        int numPartFiles = 0;
        for (int i = 0; i < contents.length; i++) {
          if (!contents[i].isDirectory()) {
            partPaths[numPartFiles++] = contents[i].getPath();
          }
        }
        // sort only the collected part files (entries beyond numPartFiles stay
        // null; sorting the full array would throw NPE if subdirectories exist)
        Arrays.sort(partPaths, 0, numPartFiles);

        for (int i = 0; i < numPartFiles; i++) {
          InputStream in = hdfs.open(partPaths[i]);
          try {
            IOUtils.copyBytes(in, out, conf, false);
            if (i < numPartFiles - 1) out.write('\n');
          } finally {
            IOUtilFunctions.closeSilently(in);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(out);
      }
    } else if (hdfs.isFile(srcFilePath)) {
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
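  // Hypothetical usage sketch (paths and dimensions are assumed):
  //   CSVFileFormatProperties props = new CSVFileFormatProperties();
  //   writer.mergeCSVPartFiles("hdfs:/tmp/out.csv-parts", "hdfs:/tmp/out.csv",
  //       props, rlen, clen);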
  /**
   * @param path output path on HDFS
   * @param job job configuration
   * @param fs file system handle
   * @param src source matrix block
   * @param rl lower row index (inclusive) of the row range to write
   * @param ru upper row index (exclusive) of the row range to write
   * @param props CSV format properties (may be null for defaults)
   * @throws IOException if the write fails
   */
  protected final void writeCSVMatrixToFile(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock src,
      int rl,
      int ru,
      CSVFileFormatProperties props)
      throws IOException {
    boolean sparse = src.isInSparseFormat();
    int clen = src.getNumColumns();

    // create buffered writer
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    try {
      // for obj reuse and preventing repeated buffer re-allocations
      StringBuilder sb = new StringBuilder();

      props = (props == null) ? new CSVFileFormatProperties() : props;
      String delim = props.getDelim();
      boolean csvsparse = props.isSparse();

      // Write header line, if needed
      if (props.hasHeader() && rl == 0) {
        // write row chunk-wise to prevent OOM on large number of columns
        for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
          for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
            sb.append("C" + (j + 1));
            if (j < clen - 1) sb.append(delim);
          }
          br.write(sb.toString());
          sb.setLength(0);
        }
        sb.append('\n');
        br.write(sb.toString());
        sb.setLength(0);
      }

      // Write data lines
      if (sparse) // SPARSE
      {
        SparseBlock sblock = src.getSparseBlock();
        for (int i = rl; i < ru; i++) {
          // write row chunk-wise to prevent OOM on large number of columns
          int prev_jix = -1;
          if (sblock != null && i < sblock.numRows() && !sblock.isEmpty(i)) {
            int pos = sblock.pos(i);
            int alen = sblock.size(i);
            int[] aix = sblock.indexes(i);
            double[] avals = sblock.values(i);

            for (int j = pos; j < pos + alen; j++) {
              int jix = aix[j];

              // output empty fields, if needed
              for (int j2 = prev_jix; j2 < jix - 1; j2++) {
                if (!csvsparse) sb.append('0');
                sb.append(delim);

                // flush buffered string
                if (j2 % BLOCKSIZE_J == 0) {
                  br.write(sb.toString());
                  sb.setLength(0);
                }
              }

              // output the value (non-zero) and flush the buffer
              sb.append(avals[j]);
              if (jix < clen - 1) sb.append(delim);
              br.write(sb.toString());
              sb.setLength(0);

              prev_jix = jix;
            }
          }

          // Output the remaining empty fields at the end of the row
          // (for an entirely empty row, this emits all clen fields).
          for (int bj = prev_jix + 1; bj < clen; bj += BLOCKSIZE_J) {
            for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
              if (!csvsparse) sb.append('0');
              if (j < clen - 1) sb.append(delim);
            }
            br.write(sb.toString());
            sb.setLength(0);
          }

          sb.append('\n');
          br.write(sb.toString());
          sb.setLength(0);
        }
      } else // DENSE
      {
        for (int i = rl; i < ru; i++) {
          // write row chunk-wise to prevent OOM on large number of columns
          for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
            for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
              double lvalue = src.getValueDenseUnsafe(i, j);
              if (lvalue != 0) // for nnz
                sb.append(lvalue);
              else if (!csvsparse) sb.append('0');

              if (j != clen - 1) sb.append(delim);
            }
            br.write(sb.toString());
            sb.setLength(0);
          }

          sb.append('\n');
          br.write(sb.toString()); // same as append
          sb.setLength(0);
        }
      }
    } finally {
      IOUtilFunctions.closeSilently(br);
    }
  }
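  // Illustration of the CSV sparse property for a single row [7, 0, 9]:
  //   props.isSparse() == false  ->  "7.0,0,9.0"
  //   props.isSparse() == true   ->  "7.0,,9.0"
  // (zeros are omitted but delimiters are kept, so column positions are stable)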
  /**
   * @param path source path on HDFS
   * @param job job configuration
   * @param fs file system handle
   * @param dest destination matrix block
   * @param rlen number of rows in the overall matrix
   * @param clen number of columns in the overall matrix
   * @param brlen block row length
   * @param bclen block column length
   * @throws IOException if the read fails or a cell is out of range
   */
  @SuppressWarnings("deprecation")
  private void readBinaryCellMatrixFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException {
    boolean sparse = dest.isInSparseFormat();
    MatrixIndexes key = new MatrixIndexes();
    MatrixCell value = new MatrixCell();
    int row = -1;
    int col = -1;

    try {
      for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
      {
        // directly read from sequence files (individual partfiles)
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

        try {
          // sparse and dense destinations are handled uniformly here,
          // since appendValue supports both representations
          while (reader.next(key, value)) {
            row = (int) key.getRowIndex() - 1;
            col = (int) key.getColumnIndex() - 1;
            dest.appendValue(row, col, value.getValue());
          }
        } finally {
          IOUtilFunctions.closeSilently(reader);
        }
      }

      if (sparse) dest.sortSparseRows();
    } catch (Exception ex) {
      // post-mortem error handling and bounds checking
      if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
        throw new IOException(
            "Matrix cell ["
                + (row + 1)
                + ","
                + (col + 1)
                + "] "
                + "out of overall matrix range [1:"
                + rlen
                + ",1:"
                + clen
                + "].");
      } else {
        throw new IOException("Unable to read matrix in binary cell format.", ex);
      }
    }
  }
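  // Note: binary cell inputs are SequenceFiles of (MatrixIndexes, MatrixCell)
  // pairs with 1-based indexes, hence the -1 conversion above.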
  /**
   * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat-
   * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the
   * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is
   * advantageous if the actual sequence files are larger than the file splits created by
   * informat.getSplits (which is usually aligned to the HDFS block size) because the record reader
   * would then incur overhead for locating the actual split boundaries between our 1k-1k blocks.
   * This case happens if the read matrix was created by CP or when jobs directly write to large
   * output files (e.g., parfor matrix partitioning).
   *
   * @param path source path on HDFS
   * @param job job configuration
   * @param fs file system handle
   * @param dest destination matrix block
   * @param rlen number of rows in the overall matrix
   * @param clen number of columns in the overall matrix
   * @param brlen block row length
   * @param bclen block column length
   * @throws IOException if the read fails or a block is out of range
   * @throws DMLRuntimeException if the block copy fails
   */
  @SuppressWarnings("deprecation")
  private static void readBinaryBlockMatrixFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException, DMLRuntimeException {
    boolean sparse = dest.isInSparseFormat();
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
    {
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        // note: next(key, value) does not yet exploit the given serialization classes, record
        // reader does but is generally slower.
        while (reader.next(key, value)) {
          // empty block filter (skip entire block)
          if (value.isEmptyBlock(false)) continue;

          int row_offset = (int) (key.getRowIndex() - 1) * brlen;
          int col_offset = (int) (key.getColumnIndex() - 1) * bclen;

          int rows = value.getNumRows();
          int cols = value.getNumColumns();

          // bound check per block
          if (row_offset + rows < 0
              || row_offset + rows > rlen
              || col_offset + cols < 0
              || col_offset + cols > clen) {
            throw new IOException(
                "Matrix block ["
                    + (row_offset + 1)
                    + ":"
                    + (row_offset + rows)
                    + ","
                    + (col_offset + 1)
                    + ":"
                    + (col_offset + cols)
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // copy block to result
          if (sparse) {
            dest.appendToSparse(value, row_offset, col_offset);
            // note: append requires final sort
          } else {
            dest.copy(
                row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }

    if (sparse && clen > bclen) {
      // no need to sort if 1 column block since always sorted
      dest.sortSparseRows();
    }
  }
  /**
   * @param path output path on HDFS
   * @param job job configuration
   * @param src source matrix block
   * @param rlen number of rows in the overall matrix
   * @param clen number of columns in the overall matrix
   * @throws IOException if the write fails or the block is out of range
   */
  protected void writeTextCellMatrixToHDFS(
      Path path, JobConf job, MatrixBlock src, long rlen, long clen) throws IOException {
    boolean sparse = src.isInSparseFormat();
    boolean entriesWritten = false;
    FileSystem fs = FileSystem.get(job);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    int rows = src.getNumRows();
    int cols = src.getNumColumns();

    // bound check for the entire block
    if (rows > rlen || cols > clen) {
      throw new IOException(
          "Matrix block [1:"
              + rows
              + ",1:"
              + cols
              + "] "
              + "out of overall matrix range [1:"
              + rlen
              + ",1:"
              + clen
              + "].");
    }

    try {
      // for obj reuse and preventing repeated buffer re-allocations
      StringBuilder sb = new StringBuilder();

      if (sparse) // SPARSE
      {
        SparseRowsIterator iter = src.getSparseRowsIterator();
        while (iter.hasNext()) {
          IJV cell = iter.next();

          sb.append(cell.i + 1);
          sb.append(' ');
          sb.append(cell.j + 1);
          sb.append(' ');
          sb.append(cell.v);
          sb.append('\n');
          br.write(sb.toString()); // same as append
          sb.setLength(0);
          entriesWritten = true;
        }
      } else // DENSE
      {
        for (int i = 0; i < rows; i++) {
          String rowIndex = Integer.toString(i + 1);
          for (int j = 0; j < cols; j++) {
            double lvalue = src.getValueDenseUnsafe(i, j);
            if (lvalue != 0) // for nnz
            {
              sb.append(rowIndex);
              sb.append(' ');
              sb.append(j + 1);
              sb.append(' ');
              sb.append(lvalue);
              sb.append('\n');
              br.write(sb.toString()); // same as append
              sb.setLength(0);
              entriesWritten = true;
            }
          }
        }
      }

      // handle empty result
      if (!entriesWritten) {
        br.write("1 1 0\n");
      }
    } finally {
      IOUtilFunctions.closeSilently(br);
    }
  }
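  // Illustration: the 2x2 matrix [[1,0],[0,2]] is written in "i j v" text cell
  // format as the lines "1 1 1.0" and "2 2 2.0" (zeros are omitted); an
  // entirely empty matrix is written as the single line "1 1 0".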