  @Override
  public final void writeMatrixToHDFS(
      MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
      throws IOException, DMLRuntimeException {
    // validity check of matrix dimensions against metadata
    if (src.getNumRows() != rlen || src.getNumColumns() != clen) {
      throw new IOException(
          "Matrix dimensions mismatch with metadata: "
              + src.getNumRows()
              + "x"
              + src.getNumColumns()
              + " vs "
              + rlen
              + "x"
              + clen
              + ".");
    }

    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    // if the file already exists on HDFS, remove it.
    MapReduceTool.deleteFileIfExistOnHDFS(fname);

    // core write (sequential/parallel)
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
  @Override
  public MatrixBlock readMatrixFromHDFS(
      String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
      throws IOException, DMLRuntimeException {
    // allocate output matrix block
    MatrixBlock ret =
        createOutputMatrixBlock(rlen, clen, (int) rlen, (int) clen, estnnz, true, false);

    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // core read
    readBinaryCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

    // finally check if change of sparse/dense block representation required
    // (nnz maintained via append during read for both dense/sparse)
    ret.examSparsity();

    return ret;
  }
  @Override
  public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
      throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);

    MatrixBlock src = new MatrixBlock((int) rlen, 1, true);
    writeCSVMatrixToHDFS(path, job, fs, src, _props);

    IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fs, path);
  }
  private static void writeCleanupTasksToFile(Path path, int numTasks)
      throws DMLRuntimeException, IOException {
    BufferedWriter br = null;
    try {
      FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
      br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

      for (int i = 1; i <= numTasks; i++) br.write("CLEANUP TASK " + i + "\n");
    } catch (Exception ex) {
      throw new DMLRuntimeException(
          "Error writing cleanup tasks to taskfile " + path.toString(), ex);
    } finally {
      if (br != null) br.close();
    }
  }
  /**
   * Adds a generated column header (C1, C2, ..., Cn) to an existing CSV file or directory of CSV
   * part files on HDFS, writing the result to the destination path. A small worked example of the
   * generated header follows this method.
   *
   * @param srcFileName source CSV file or directory of part files on HDFS (without header)
   * @param destFileName destination CSV file on HDFS (with header)
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException if reading or writing on HDFS fails
   */
  @SuppressWarnings("unchecked")
  public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
      // simply move srcFile to destFile

      /*
       * TODO: Remove this roundabout way!
       * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
       *              and the only path that already exists on HDFS is /user/biadmin/csv/.
       * In this case, the directory structure /user/biadmin/csv/temp/out must be created first.
       * A plain hdfs.rename() does not seem to create this directory structure.
       */

      // delete the destination file, if it already exists
      hdfs.delete(destFilePath, true);

      // create /user/biadmin/csv/temp/out/file.csv so that .../temp/out/ is created
      hdfs.createNewFile(destFilePath);

      // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
      hdfs.delete(destFilePath, true);

      // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
      hdfs.rename(srcFilePath, destFilePath);
      return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1) sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

      // compute sorted order among part files
      ArrayList<Path> files = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
        files.add(stat.getPath());
      Collections.sort(files);

      // first part file path
      Path firstpart = files.get(0);

      // create a temp file, and add header and contents of first part
      Path tmp = new Path(firstpart.toString() + ".tmp");
      OutputStream out = hdfs.create(tmp, true);
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy rest of the data from firstpart
      InputStream in = null;
      try {
        in = hdfs.open(firstpart);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }

      // rename tmp to firstpart
      hdfs.delete(firstpart, true);
      hdfs.rename(tmp, firstpart);

      // rename srcfile to destFile
      hdfs.delete(destFilePath, true);
      hdfs.createNewFile(destFilePath); // force the creation of directory structure
      hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
      hdfs.rename(srcFilePath, destFilePath); // move the data

    } else if (hdfs.isFile(srcFilePath)) {
      // create destination file
      OutputStream out = hdfs.create(destFilePath, true);

      // write header
      out.write(sb.toString().getBytes());
      sb.setLength(0);

      // copy the data from srcFile
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
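
As a quick illustration of the header produced above, a small standalone sketch follows; the method name printExampleHeader, the column count of 3, and the comma delimiter are illustrative assumptions only.

  // Illustration only (not part of the original class): prints the header line that
  // addHeaderToCSV generates for clen = 3 columns and a comma delimiter: "C1,C2,C3".
  private static void printExampleHeader() {
    long clen = 3;
    String delim = ",";
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
      sb.append("C" + (i + 1));
      if (i < clen - 1) sb.append(delim);
    }
    System.out.println(sb.toString()); // prints: C1,C2,C3
  }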
  /**
   * Merges multiple CSV part files on HDFS into a single CSV file on HDFS. The part files are
   * created by a CSV_WRITE MR job, and this method is invoked from the CP write instruction. An
   * illustrative call sketch follows this method.
   *
   * @param srcFileName source file or directory of part files on HDFS
   * @param destFileName destination of the merged CSV file on HDFS
   * @param csvprop CSV format properties (header flag, delimiter)
   * @param rlen number of rows
   * @param clen number of columns
   * @throws IOException if reading or writing on HDFS fails
   */
  public final void mergeCSVPartFiles(
      String srcFileName,
      String destFileName,
      CSVFileFormatProperties csvprop,
      long rlen,
      long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
      hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1) sb.append(csvprop.getDelim());
      }
      sb.append('\n');
      out.write(sb.toString().getBytes());
      sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
      try {
        FileStatus[] contents = hdfs.listStatus(srcFilePath);
        Path[] partPaths = new Path[contents.length];
        int numPartFiles = 0;
        for (int i = 0; i < contents.length; i++) {
          // collect part files only; pack them at the front of the array so that
          // directory entries do not leave null gaps before sorting
          if (!contents[i].isDirectory()) {
            partPaths[numPartFiles++] = contents[i].getPath();
          }
        }
        Arrays.sort(partPaths, 0, numPartFiles);

        for (int i = 0; i < numPartFiles; i++) {
          InputStream in = hdfs.open(partPaths[i]);
          try {
            IOUtils.copyBytes(in, out, conf, false);
            if (i < numPartFiles - 1) out.write('\n');
          } finally {
            IOUtilFunctions.closeSilently(in);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(out);
      }
    } else if (hdfs.isFile(srcFilePath)) {
      InputStream in = null;
      try {
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }
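
A possible call site for mergeCSVPartFiles, shown only as a sketch: the helper name mergeExample, the HDFS paths, and the 1000 x 10 dimensions are illustrative assumptions and do not come from the code above.

  // Illustration only: merge the part files left by a CSV_WRITE MR job under a
  // temporary output directory into a single CSV file, reusing the CSV properties
  // (header flag, delimiter) the parts were written with.
  private void mergeExample(CSVFileFormatProperties csvprop) throws IOException {
    mergeCSVPartFiles("/user/out/result.csv.tmp", "/user/out/result.csv", csvprop, 1000, 10);
  }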