/**
   * Reads every block of a binary-block matrix from the sequence file(s) found for {@code path}
   * and appends a copy of each (block index, block) pair to {@code dest}.
   *
   * <p>The cell position of a block is derived from its 1-based block index and the block sizes.
   * It is validated against the overall matrix dimensions before the block is added. All position
   * arithmetic is done in {@code long} so that very large block indexes cannot wrap around and
   * slip past the bound check.
   *
   * @param path location of the sequence file(s) to read
   * @param job job configuration handed to the sequence file reader
   * @param fs file system used to resolve and open the files
   * @param dest collection that receives a copy of every block read
   * @param rlen number of rows of the overall matrix
   * @param clen number of columns of the overall matrix
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @throws IOException if reading fails or a block lies outside the overall matrix range
   */
  @SuppressWarnings("deprecation")
  private void readBinaryBlockMatrixBlocksFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      Collection<IndexedMatrixValue> dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException {
    // key and value are reused for every record, hence the copies made when adding to dest
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }

    for (Path lpath : getSequenceFilePaths(fs, path)) { // 1..N files
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        while (reader.next(key, value)) {
          // long arithmetic: (index - 1) * blocksize can exceed the int range
          long rowOffset = (key.getRowIndex() - 1L) * brlen;
          long colOffset = (key.getColumnIndex() - 1L) * bclen;
          long rowEnd = rowOffset + value.getNumRows();
          long colEnd = colOffset + value.getNumColumns();

          // bound check per block
          if (rowEnd < 0 || rowEnd > rlen || colEnd < 0 || colEnd > clen) {
            throw new IOException(
                "Matrix block ["
                    + (rowOffset + 1)
                    + ":"
                    + rowEnd
                    + ","
                    + (colOffset + 1)
                    + ":"
                    + colEnd
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // copy block to result
          dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }
  }
Exemple #2
0
    /**
     * Initializes this instance from the job configuration: runs the superclass configuration,
     * loads the combine instructions, records the block sizes of every result matrix and the
     * output index mapping of every combine instruction.
     *
     * @param job the job configuration to read from
     * @throws RuntimeException wrapping any exception raised while loading the combine instructions
     */
    public void configure(JobConf job) {
      super.configure(job);

      // load the combine instructions; a failure is rethrown unchecked with its cause attached
      try {
        comb_instructions = MRJobConfiguration.getCombineInstruction(job);
      } catch (Exception ex) {
        throw new RuntimeException(ex);
      }

      // remember the (rows, cols) block size of each result matrix
      for (int pos = 0; pos < resultIndexes.length; pos++) {
        MatrixCharacteristics outputStats =
            MRJobConfiguration.getMatrixCharacteristicsForOutput(job, resultIndexes[pos]);
        Pair<Integer, Integer> blockSize =
            new Pair<Integer, Integer>(
                outputStats.getRowsPerBlock(), outputStats.getColsPerBlock());
        outputBlockSizes.put(resultIndexes[pos], blockSize);
      }

      // map the output of each combine instruction to its output indexes
      for (MRInstruction combineInst : comb_instructions) {
        outputIndexesMapping.put(combineInst.output, getOutputIndexes(combineInst.output));
      }
    }
Exemple #3
0
  /**
   * Configures and synchronously runs the combine MapReduce job ("Standalone-MR").
   *
   * <p>The mapper, aggregate and reducer instruction strings are all set to empty; only the given
   * combine instructions are registered. Inputs are numbered 0..n-1 in the order given.
   *
   * @param inst the MR job instruction; only used to print the complete instruction at trace level
   * @param inputs input file names, one per input matrix
   * @param inputInfos format information per input
   * @param rlens number of rows per input
   * @param clens number of columns per input
   * @param brlens rows per block per input
   * @param bclens columns per block per input
   * @param combineInstructions the combine instructions to execute
   * @param numReducers requested number of reducers
   * @param replication value written to {@code dfs.replication} for the results
   * @param resultIndexes indexes of the result matrices
   * @param outputs output file names
   * @param outputInfos format information per output
   * @return the computed output matrix characteristics together with the job's success flag
   * @throws Exception if job setup or execution fails
   */
  public static JobReturn runJob(
      MRJobInstruction inst,
      String[] inputs,
      InputInfo[] inputInfos,
      long[] rlens,
      long[] clens,
      int[] brlens,
      int[] bclens,
      String combineInstructions,
      int numReducers,
      int replication,
      byte[] resultIndexes,
      String[] outputs,
      OutputInfo[] outputInfos)
      throws Exception {
    JobConf job = new JobConf(CombineMR.class);
    job.setJobName("Standalone-MR");

    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    // whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    // NOTE(review): the counter is a byte, so this loop only terminates normally for at most 127
    // inputs; beyond that the counter wraps to a negative array index.
    byte[] inputIndexes = new byte[inputs.length];
    for (byte b = 0; b < inputs.length; b++) {
      inputIndexes[b] = b;
    }

    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(
        job,
        inputIndexes,
        inputs,
        inputInfos,
        brlens,
        bclens,
        true,
        inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);

    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);

    // set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, "");

    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, "");

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, "");

    MRJobConfiguration.setCombineInstructions(job, combineInstructions);

    // set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes =
        MRJobConfiguration.setUpOutputIndexesForMapper(
            job, inputIndexes, null, null, combineInstructions, resultIndexes);

    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(
        job, resultIndexes, null, outputs, outputInfos, inBlockRepresentation);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);

    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation) {
      job.setMapOutputValueClass(TaggedMatrixBlock.class);
    } else {
      job.setMapOutputValueClass(TaggedMatrixCell.class);
    }

    // configure reducer
    job.setReducerClass(InnerReducer.class);

    MatrixChar_N_ReducerGroups ret =
        MRJobConfiguration.computeMatrixCharacteristics(
            job,
            inputIndexes,
            null,
            null,
            null,
            combineInstructions,
            resultIndexes,
            mapoutputIndexes,
            false);
    MatrixCharacteristics[] stats = ret.stats;

    // set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled()) {
      inst.printCompleteMRJobInstruction(stats);
    }

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // blocks until the job has finished
    RunningJob runjob = JobClient.runJob(job);

    return new JobReturn(stats, runjob.isSuccessful());
  }
  /**
   * Reads a binary-block matrix from the sequence file(s) found for {@code path} and places every
   * non-empty block at its cell position inside {@code dest}.
   *
   * <p>Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat-
   * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the
   * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is
   * advantageous if the actual sequence files are larger than the file splits created by
   * informat.getSplits (which is usually aligned to the HDFS block size) because then there is
   * overhead for finding the actual split between our 1k-1k blocks. This case happens if the read
   * matrix was created by CP or when jobs directly write to large output files (e.g., parfor matrix
   * partitioning).
   *
   * <p>Block positions are computed and validated in {@code long} so that very large block indexes
   * cannot wrap around and slip past the bound check.
   *
   * @param path location of the sequence file(s) to read
   * @param job job configuration handed to the sequence file reader
   * @param fs file system used to resolve and open the files
   * @param dest target matrix block; its current sparse/dense format decides how blocks are merged
   * @param rlen number of rows of the overall matrix
   * @param clen number of columns of the overall matrix
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @throws IOException if reading fails or a block lies outside the overall matrix range
   * @throws DMLRuntimeException presumably raised by the copy/append/sort operations on {@code
   *     dest}
   */
  @SuppressWarnings("deprecation")
  private static void readBinaryBlockMatrixFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException, DMLRuntimeException {
    boolean sparse = dest.isInSparseFormat();
    // key and value are reused for every record read
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }

    for (Path lpath : getSequenceFilePaths(fs, path)) { // 1..N files
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        // note: next(key, value) does not yet exploit the given serialization classes, record
        // reader does but is generally slower.
        while (reader.next(key, value)) {
          // empty block filter (skip entire block)
          if (value.isEmptyBlock(false)) {
            continue;
          }

          // long arithmetic: (index - 1) * blocksize can exceed the int range
          long rowOffset = (key.getRowIndex() - 1L) * brlen;
          long colOffset = (key.getColumnIndex() - 1L) * bclen;

          int rows = value.getNumRows();
          int cols = value.getNumColumns();
          long rowEnd = rowOffset + rows;
          long colEnd = colOffset + cols;

          // bound check per block
          if (rowEnd < 0 || rowEnd > rlen || colEnd < 0 || colEnd > clen) {
            throw new IOException(
                "Matrix block ["
                    + (rowOffset + 1)
                    + ":"
                    + rowEnd
                    + ","
                    + (colOffset + 1)
                    + ":"
                    + colEnd
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // dest is addressed with int positions; narrow only after the range was validated
          int rowStart = (int) rowOffset;
          int colStart = (int) colOffset;

          // copy block to result
          if (sparse) {
            dest.appendToSparse(value, rowStart, colStart);
            // note: append requires final sort
          } else {
            dest.copy(rowStart, rowStart + rows - 1, colStart, colStart + cols - 1, value, false);
          }
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }

    if (sparse && clen > bclen) {
      // no need to sort if 1 column block since always sorted
      dest.sortSparseRows();
    }
  }