コード例 #1
0
  /**
   * Merges CSV part files on HDFS into a single CSV file on HDFS. The part files are created by
   * CSV_WRITE MR job.
   *
   * <p>This method is invoked from CP-write instruction.
   *
   * <p>An existing destination is deleted recursively before the merged file is created. If
   * {@code csvprop.hasHeader()} is true, a header line {@code C1<delim>C2<delim>...} with
   * {@code clen} labels is written first. If the source is a directory, its direct non-directory
   * children are appended in sorted path order, with a newline written between consecutive parts
   * (sub-directories are skipped). If the source is a plain file, it is copied as is.
   *
   * @param srcFileName path of the source file, or of the directory holding the part files
   * @param destFileName path of the merged output file
   * @param csvprop CSV format properties; supplies the header flag and the delimiter
   * @param rlen number of rows (not used by this method)
   * @param clen number of columns; determines how many header labels are written
   * @throws IOException if the source is neither a file nor a directory, or if any read, write or
   *     close of the merged file fails
   */
  public final void mergeCSVPartFiles(
      String srcFileName,
      String destFileName,
      CSVFileFormatProperties csvprop,
      long rlen,
      long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
      hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // Everything that touches 'out' runs inside this try so that the stream is released on every
    // exit path, including a failed header write and the "no such file" error below.
    try {
      // write out the header, if needed
      if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (long i = 0; i < clen; i++) {
          sb.append('C').append(i + 1);
          if (i < clen - 1) sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
      }

      if (hdfs.isDirectory(srcFilePath)) {
        // Collect the part files densely at the front of the array. Indexing by the position in
        // 'contents' would leave null holes whenever a sub-directory is present, which breaks
        // both the sort and the copy loop.
        FileStatus[] contents = hdfs.listStatus(srcFilePath);
        Path[] partPaths = new Path[contents.length];
        int numPartFiles = 0;
        for (FileStatus status : contents) {
          if (!status.isDirectory()) {
            partPaths[numPartFiles++] = status.getPath();
          }
        }
        Arrays.sort(partPaths, 0, numPartFiles);

        for (int i = 0; i < numPartFiles; i++) {
          InputStream in = hdfs.open(partPaths[i]);
          try {
            IOUtils.copyBytes(in, out, conf, false);
            if (i < numPartFiles - 1) out.write('\n');
          } finally {
            IOUtilFunctions.closeSilently(in);
          }
        }

        // Close explicitly on the success path so that a failure while flushing/finalizing the
        // merged file is reported instead of being swallowed by the silent close below.
        out.close();
      } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
          in = hdfs.open(srcFilePath);
          // close=true: copyBytes closes both streams and propagates close failures
          IOUtils.copyBytes(in, out, conf, true);
        } finally {
          IOUtilFunctions.closeSilently(in);
        }
      } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
      }
    } finally {
      IOUtilFunctions.closeSilently(out);
    }
  }
コード例 #2
0
  /**
   * Adds a CSV header line to the data at {@code srcFileName} and makes the result available at
   * {@code destFileName}.
   *
   * <p>Behavior by case:
   *
   * <ul>
   *   <li>If {@code _props.hasHeader()} is false, the source is simply moved to the destination.
   *   <li>If the source is a directory, the header is prepended to the first part file (in sorted
   *       path order, using {@code CSVReblockMR.hiddenFileFilter} for the listing) and the whole
   *       directory is then moved to the destination.
   *   <li>If the source is a plain file, the destination is written as header followed by the
   *       contents of the source file; the source file itself is left in place.
   * </ul>
   *
   * <p>The header has the form {@code C1<delim>C2<delim>...} with {@code clen} labels, using the
   * delimiter from {@code _props}.
   *
   * @param srcFileName path of the source file or part-file directory
   * @param destFileName path of the destination
   * @param rlen number of rows (not used by this method)
   * @param clen number of columns; determines how many header labels are written
   * @throws IOException if the source is neither a file nor a directory, if a source directory
   *     contains no part files, if a rename fails, or on any other I/O failure
   */
  @SuppressWarnings("unchecked")
  public final void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen)
      throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
      // no header requested: simply move srcFile to destFile
      moveToDestination(hdfs, srcFilePath, destFilePath);
      return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (long i = 0; i < clen; i++) {
      sb.append('C').append(i + 1);
      if (i < clen - 1) sb.append(_props.getDelim());
    }
    sb.append('\n');
    byte[] header = sb.toString().getBytes();

    if (hdfs.isDirectory(srcFilePath)) {

      // compute sorted order among part files
      ArrayList<Path> files = new ArrayList<Path>();
      for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
        files.add(stat.getPath());
      if (files.isEmpty()) {
        // there is no first part to prepend the header to; fail with a descriptive error
        // instead of an IndexOutOfBoundsException
        throw new IOException(srcFilePath.toString() + ": directory contains no part files");
      }
      Collections.sort(files);

      // first part file path
      Path firstpart = files.get(0);

      // create a temp file, and add header and contents of first part
      Path tmp = new Path(firstpart.toString() + ".tmp");
      OutputStream out = hdfs.create(tmp, true);
      InputStream in = null;
      try {
        out.write(header);
        in = hdfs.open(firstpart);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }

      // replace firstpart with tmp; a failed rename here would silently drop the first part
      hdfs.delete(firstpart, true);
      if (!hdfs.rename(tmp, firstpart)) {
        throw new IOException("Failed to rename " + tmp + " to " + firstpart);
      }

      // move the whole source directory to the destination
      moveToDestination(hdfs, srcFilePath, destFilePath);

    } else if (hdfs.isFile(srcFilePath)) {
      // create destination file, write the header, then append the data from srcFile
      OutputStream out = hdfs.create(destFilePath, true);
      InputStream in = null;
      try {
        out.write(header);
        in = hdfs.open(srcFilePath);
        IOUtils.copyBytes(in, out, conf, true);
      } finally {
        IOUtilFunctions.closeSilently(in);
        IOUtilFunctions.closeSilently(out);
      }
    } else {
      throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
  }

  /**
   * Moves {@code src} to {@code dest}, replacing any existing destination and creating the
   * destination's parent directories first, since a plain rename does not create them.
   *
   * @param fs file system holding both paths
   * @param src existing source path
   * @param dest destination path; deleted recursively if it already exists
   * @throws IOException if the rename reports failure or any file system call fails
   */
  private static void moveToDestination(FileSystem fs, Path src, Path dest) throws IOException {
    // delete the destination, if it exists already
    fs.delete(dest, true);

    // Make sure the parent directory structure exists. The result is deliberately not checked
    // here: if the directories could not be created, the rename below fails and is reported.
    Path parent = dest.getParent();
    if (parent != null) {
      fs.mkdirs(parent);
    }

    if (!fs.rename(src, dest)) {
      throw new IOException("Failed to rename " + src + " to " + dest);
    }
  }
コード例 #3
0
  /**
   * Writes rows {@code [rl, ru)} of a matrix block as CSV text to {@code path}, overwriting any
   * existing file.
   *
   * <p>A header line {@code C1<delim>C2<delim>...} is written only if {@code props.hasHeader()}
   * is true and {@code rl == 0}. Each row is terminated by a newline. Zero cells are written as
   * {@code 0}, or as empty fields when {@code props.isSparse()} is true. Output is assembled in
   * chunks of {@code BLOCKSIZE_J} columns so that a single row with very many columns is never
   * held in memory as one string.
   *
   * @param path output file path
   * @param job job configuration (not used by this method)
   * @param fs file system on which the file is created
   * @param src matrix block to write; may be in sparse or dense format
   * @param rl index of the first row to write (inclusive)
   * @param ru index of the row to stop at (exclusive)
   * @param props CSV format properties; if null, a default {@code CSVFileFormatProperties} is
   *     used
   * @throws IOException if creating, writing or closing the file fails
   */
  protected final void writeCSVMatrixToFile(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock src,
      int rl,
      int ru,
      CSVFileFormatProperties props)
      throws IOException {
    boolean sparse = src.isInSparseFormat();
    int clen = src.getNumColumns();

    // create buffered writer
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

    try {
      // for obj reuse and preventing repeated buffer re-allocations
      StringBuilder sb = new StringBuilder();

      props = (props == null) ? new CSVFileFormatProperties() : props;
      String delim = props.getDelim();
      boolean csvsparse = props.isSparse();

      // Write header line, if needed
      if (props.hasHeader() && rl == 0) {
        // write row chunk-wise to prevent OOM on large number of columns
        for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
          for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
            sb.append('C').append(j + 1);
            if (j < clen - 1) sb.append(delim);
          }
          br.write(sb.toString());
          sb.setLength(0);
        }
        sb.append('\n');
        br.write(sb.toString());
        sb.setLength(0);
      }

      // Write data lines
      if (sparse) // SPARSE
      {
        SparseBlock sblock = src.getSparseBlock();
        for (int i = rl; i < ru; i++) {
          // column index of the last value written in this row (-1: nothing written yet)
          int prev_jix = -1;
          if (sblock != null && i < sblock.numRows() && !sblock.isEmpty(i)) {
            int pos = sblock.pos(i);
            int alen = sblock.size(i);
            int[] aix = sblock.indexes(i);
            double[] avals = sblock.values(i);

            for (int j = pos; j < pos + alen; j++) {
              int jix = aix[j];

              // output empty fields for the columns skipped between prev_jix and jix
              for (int j2 = prev_jix; j2 < jix - 1; j2++) {
                if (!csvsparse) sb.append('0');
                sb.append(delim);

                // flush buffered string periodically to bound the buffer size
                if (j2 % BLOCKSIZE_J == 0) {
                  br.write(sb.toString());
                  sb.setLength(0);
                }
              }

              // output the stored value and flush; the buffer is empty afterwards, so no
              // further chunk-boundary flush is needed here
              sb.append(avals[j]);
              if (jix < clen - 1) sb.append(delim);
              br.write(sb.toString());
              sb.setLength(0);

              prev_jix = jix;
            }
          }

          // Output empty fields at the end of the row.
          // In case of an empty row, output (clen-1) empty fields
          for (int bj = prev_jix + 1; bj < clen; bj += BLOCKSIZE_J) {
            for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
              if (!csvsparse) sb.append('0');
              if (j < clen - 1) sb.append(delim);
            }
            br.write(sb.toString());
            sb.setLength(0);
          }

          sb.append('\n');
          br.write(sb.toString());
          sb.setLength(0);
        }
      } else // DENSE
      {
        for (int i = rl; i < ru; i++) {
          // write row chunk-wise to prevent OOM on large number of columns
          for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
            for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
              double lvalue = src.getValueDenseUnsafe(i, j);
              if (lvalue != 0) { // for nnz
                sb.append(lvalue);
              } else if (!csvsparse) {
                sb.append('0');
              }

              if (j != clen - 1) sb.append(delim);
            }
            br.write(sb.toString());
            sb.setLength(0);
          }

          sb.append('\n');
          br.write(sb.toString());
          sb.setLength(0);
        }
      }

      // Close explicitly on the success path: closing flushes the buffered output, and a
      // failure there must reach the caller rather than be swallowed by the silent close below.
      br.close();
    } finally {
      IOUtilFunctions.closeSilently(br);
    }
  }