@Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg0)
        throws Exception {
      // Multiplies one streamed input block with the single matching block of the
      // broadcast operand (_pbc); _type tells on which side the broadcast operand sits.
      final MatrixIndexes inIx = arg0._1();
      final MatrixBlock inBlk = arg0._2();

      // containers filled in by performAggregateBinary
      final MatrixIndexes outIx = new MatrixIndexes();
      final MatrixBlock outBlk = new MatrixBlock();

      if (_type != CacheType.LEFT) {
        // every type other than LEFT is treated as CacheType.RIGHT:
        // the broadcast block is the right-hand operand, selected by the input column index
        final long colIx = inIx.getColumnIndex();
        final MatrixBlock rhs = _pbc.getMatrixBlock((int) colIx, 1);

        OperationsOnMatrixValues.performAggregateBinary(
            inIx, inBlk, new MatrixIndexes(colIx, 1), rhs, outIx, outBlk, _op);
      } else {
        // the broadcast block is the left-hand operand, selected by the input row index
        final long rowIx = inIx.getRowIndex();
        final MatrixBlock lhs = _pbc.getMatrixBlock(1, (int) rowIx);

        OperationsOnMatrixValues.performAggregateBinary(
            new MatrixIndexes(1, rowIx), lhs, inIx, inBlk, outIx, outBlk, _op);
      }

      // hand back the computed index/block pair
      return new Tuple2<MatrixIndexes, MatrixBlock>(outIx, outBlk);
    }
  /**
   * Runs an aggregate-binary operation on two indexed matrix values.
   *
   * <p>The output index is set to (row index of the first operand, column index of the second
   * operand); the value-level computation is delegated to {@code value1}.
   *
   * @param indexes1 block index of the first operand
   * @param value1 first operand; also the object the operation is invoked on
   * @param indexes2 block index of the second operand
   * @param value2 second operand
   * @param indexesOut receives the output block index
   * @param valueOut receives the output value
   * @param op aggregate-binary operator to apply
   * @throws DMLRuntimeException if the underlying value operation fails
   */
  public static void performAggregateBinary(
      MatrixIndexes indexes1,
      MatrixValue value1,
      MatrixIndexes indexes2,
      MatrixValue value2,
      MatrixIndexes indexesOut,
      MatrixValue valueOut,
      AggregateBinaryOperator op)
      throws DMLRuntimeException {
    // derive the position of the result block first
    final long outRow = indexes1.getRowIndex();
    final long outCol = indexes2.getColumnIndex();
    indexesOut.setIndexes(outRow, outCol);

    // then compute the result block itself
    value1.aggregateBinaryOperations(indexes1, value1, indexes2, value2, valueOut, op);
  }
  /**
   * Tests whether the block at index {@code ix} overlaps the cell range
   * {@code [rl:ru, cl:cu]}.
   *
   * <p>The block is taken to cover rows {@code (rowIndex-1)*brlen+1 .. rowIndex*brlen} and
   * columns {@code (colIndex-1)*bclen+1 .. colIndex*bclen}.
   *
   * @param ix block index to test
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @param rl lower row bound of the range
   * @param ru upper row bound of the range
   * @param cl lower column bound of the range
   * @param cu upper column bound of the range
   * @return {@code true} if block and range overlap in both rows and columns
   */
  public static boolean isInBlockRange(
      MatrixIndexes ix, int brlen, int bclen, long rl, long ru, long cl, long cu) {
    final long blkRow = ix.getRowIndex();
    final long blkCol = ix.getColumnIndex();

    // cell span covered by this block
    final long firstRow = (blkRow - 1) * brlen + 1;
    final long lastRow = blkRow * brlen;
    final long firstCol = (blkCol - 1) * bclen + 1;
    final long lastCol = blkCol * bclen;

    final boolean rowsOverlap = rl <= lastRow && ru >= firstRow;
    final boolean colsOverlap = cl <= lastCol && cu >= firstCol;
    return rowsOverlap && colsOverlap;
  }
    /**
     * Applies the permutation-matrix multiplication for one input block and emits one or two
     * output blocks, depending on whether the selected target rows span two row blocks.
     *
     * @param arg0 input pair of block index and block
     * @return the produced (index, block) pairs; empty if no row is selected
     */
    @Override
    public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(
        Tuple2<MatrixIndexes, MatrixBlock> arg0) throws Exception {
      ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> results =
          new ArrayList<Tuple2<MatrixIndexes, MatrixBlock>>();
      MatrixIndexes inIx = arg0._1();
      MatrixBlock dataBlk = arg0._2();

      // block of the selection vector that belongs to this row block
      MatrixBlock selBlk = _pmV.getMatrixBlock((int) inIx.getRowIndex(), 1);

      // smallest and largest target position determine the affected row blocks
      long firstPos = UtilFunctions.toLong(selBlk.minNonZero());
      long lastPos = UtilFunctions.toLong(selBlk.max());
      long firstRowBlk = (firstPos - 1) / _brlen + 1;
      long lastRowBlk = (lastPos - 1) / _brlen + 1;

      // nothing to emit unless at least one row is selected
      if (firstPos < 1) {
        return results;
      }

      boolean spansTwoBlocks = (firstRowBlk != lastRowBlk);
      int numCols = dataBlk.getNumColumns();

      // estimate the output sparsity to pick the output format
      double selSparsity = OptimizerUtils.getSparsity(selBlk.getNumRows(), 1, selBlk.getNonZeros());
      long estNnz = (long) (selSparsity * dataBlk.getNonZeros());
      boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(_brlen, numCols, estNnz);

      // allocate the first output with default block size, the second only if needed
      MatrixBlock firstOut = new MatrixBlock();
      firstOut.reset(_brlen, numCols, sparseOut);
      MatrixBlock secondOut = null;
      if (spansTwoBlocks) {
        secondOut = new MatrixBlock();
        secondOut.reset(
            UtilFunctions.computeBlockSize(_rlen, lastRowBlk, _brlen), numCols, sparseOut);
      }

      // core permutation; the row count of the first output is corrected afterwards
      // because it was allocated with the default block size
      selBlk.permutationMatrixMultOperations(dataBlk, firstOut, secondOut);
      firstOut.setNumRows(UtilFunctions.computeBlockSize(_rlen, firstRowBlk, _brlen));

      results.add(
          new Tuple2<MatrixIndexes, MatrixBlock>(
              new MatrixIndexes(firstRowBlk, inIx.getColumnIndex()), firstOut));
      if (secondOut != null) {
        results.add(
            new Tuple2<MatrixIndexes, MatrixBlock>(
                new MatrixIndexes(lastRowBlk, inIx.getColumnIndex()), secondOut));
      }

      return results;
    }
  /**
   * Applies the aggregate-unary operator of this instruction to every cached block of the
   * input and stores the results under the output index.
   *
   * <p>For diagonal reductions, blocks whose row and column index differ are not aggregated;
   * their output value is only reset.
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLUnsupportedOperationException, DMLRuntimeException {

    ArrayList<IndexedMatrixValue> inputBlocks = cachedValues.get(input);
    if (inputBlocks == null) {
      return;
    }

    for (IndexedMatrixValue inBlk : inputBlocks) {
      if (inBlk == null) {
        continue;
      }

      // in-place instructions write into the temp value, all others reserve a cache slot
      IndexedMatrixValue outBlk =
          (input == output) ? tempValue : cachedValues.holdPlace(output, valueClass);

      MatrixIndexes inIx = inBlk.getIndexes();
      AggregateUnaryOperator aggUnaryOp = (AggregateUnaryOperator) optr;

      // off-diagonal blocks are irrelevant for a diagonal reduction
      boolean prunedOffDiagonal =
          aggUnaryOp.indexFn instanceof ReduceDiag
              && inIx.getColumnIndex() != inIx.getRowIndex();

      if (!prunedOffDiagonal) {
        // general case: run the aggregation
        OperationsOnMatrixValues.performAggregateUnary(
            inIx,
            inBlk.getValue(),
            outBlk.getIndexes(),
            outBlk.getValue(),
            aggUnaryOp,
            blockRowFactor,
            blockColFactor);
        if (_dropCorr) {
          ((MatrixBlock) outBlk.getValue())
              .dropLastRowsOrColums(aggUnaryOp.aggOp.correctionLocation);
        }
      } else {
        // nothing to compute, but the output value still has to be reset
        outBlk.getValue().reset();
      }

      // a temp-value result must be copied into the cache explicitly
      if (outBlk == tempValue) {
        cachedValues.add(output, outBlk);
      }
    }
  }
    /**
     * Runs the matrix-multiplication chain (chain type XtwXv) for one input block, using the
     * broadcast vector block at (1,1) and the weight block matching the input row index.
     *
     * @param arg0 input pair of block index and block
     * @return the chain result, always placed at block index (1,1)
     */
    @Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg0)
        throws Exception {
      final MatrixBlock vecBlk = _pmV.getMatrixBlock(1, 1);

      final MatrixIndexes inIx = arg0._1();
      final MatrixBlock inBlk = arg0._2();
      final int rowBlkIx = (int) inIx.getRowIndex();

      final MatrixIndexes outIx = new MatrixIndexes(1, 1);
      final MatrixBlock outBlk = new MatrixBlock();

      // weight block for the same row block as the input
      final MatrixBlock weightBlk = _pmW.getMatrixBlock(rowBlkIx, 1);
      inBlk.chainMatrixMultOperations(vecBlk, weightBlk, outBlk, ChainType.XtwXv);

      return new Tuple2<MatrixIndexes, MatrixBlock>(outIx, outBlk);
    }
    /**
     * Multiplies one streamed input block with every matching block of the broadcast operand
     * and emits one output pair per broadcast block.
     *
     * @param arg0 input pair of block index and block
     * @return all produced (index, block) pairs
     */
    @Override
    public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(
        Tuple2<MatrixIndexes, MatrixBlock> arg0) throws Exception {
      ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> results =
          new ArrayList<Tuple2<MatrixIndexes, MatrixBlock>>();

      MatrixIndexes inIx = arg0._1();
      MatrixBlock inBlk = arg0._2();

      if (_type != CacheType.LEFT) {
        // every type other than LEFT is treated as CacheType.RIGHT:
        // walk over all column blocks of the broadcast right-hand operand
        int numColBlocks = _pbc.getNumColumnBlocks();
        for (int c = 1; c <= numColBlocks; c++) {
          MatrixBlock rhs = _pbc.getMatrixBlock((int) inIx.getColumnIndex(), c);
          MatrixIndexes outIx = new MatrixIndexes();
          MatrixBlock outBlk = new MatrixBlock();

          OperationsOnMatrixValues.performAggregateBinary(
              inIx, inBlk, new MatrixIndexes(inIx.getColumnIndex(), c), rhs, outIx, outBlk, _op);

          results.add(new Tuple2<MatrixIndexes, MatrixBlock>(outIx, outBlk));
        }
      } else {
        // walk over all row blocks of the broadcast left-hand operand
        int numRowBlocks = _pbc.getNumRowBlocks();
        for (int r = 1; r <= numRowBlocks; r++) {
          MatrixBlock lhs = _pbc.getMatrixBlock(r, (int) inIx.getRowIndex());
          MatrixIndexes outIx = new MatrixIndexes();
          MatrixBlock outBlk = new MatrixBlock();

          OperationsOnMatrixValues.performAggregateBinary(
              new MatrixIndexes(r, inIx.getRowIndex()), lhs, inIx, inBlk, outIx, outBlk, _op);

          results.add(new Tuple2<MatrixIndexes, MatrixBlock>(outIx, outBlk));
        }
      }

      return results;
    }
 /**
  * Runs the zero-out operation of {@code valueIn} for the given range and copies the input
  * block index to the output index.
  *
  * @param indexesIn block index of the input value
  * @param valueIn input value the zero-out operation is invoked on
  * @param indexesOut receives a copy of {@code indexesIn}
  * @param valueOut receives the result of the zero-out operation
  * @param range index range handed to the zero-out operation
  * @param complementary flag handed through to the zero-out operation
  * @throws DMLRuntimeException if the zero-out operation fails
  */
 public static void performZeroOut(
     MatrixIndexes indexesIn,
     MatrixValue valueIn,
     MatrixIndexes indexesOut,
     MatrixValue valueOut,
     IndexRange range,
     boolean complementary)
     throws DMLRuntimeException {
   // value first, index second: the block position itself never changes
   valueIn.zeroOutOperations(valueOut, range, complementary);

   indexesOut.setIndexes(indexesIn);
 }
  /**
   * Runs a grouped aggregate of one target block against the given groups block and appends
   * the result to {@code outlist}, split into row blocks of at most {@code brlen} rows.
   *
   * @param op aggregation operator
   * @param inTarget indexed target block; its column index is reused for every output block
   * @param groups block holding the group assignments; the aggregation is invoked on it
   * @param ngroups number of groups handed to the aggregation
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @param outlist list the output blocks are appended to
   * @throws DMLRuntimeException if the aggregation or the slicing fails
   */
  public static void performMapGroupedAggregate(
      Operator op,
      IndexedMatrixValue inTarget,
      MatrixBlock groups,
      int ngroups,
      int brlen,
      int bclen,
      ArrayList<IndexedMatrixValue> outlist)
      throws DMLRuntimeException {
    MatrixIndexes inIx = inTarget.getIndexes();
    MatrixBlock targetBlk = (MatrixBlock) inTarget.getValue();

    // grouped aggregation over the whole target block
    MatrixBlock aggResult =
        groups.groupedAggOperations(targetBlk, null, new MatrixBlock(), ngroups, op);

    boolean fitsSingleBlock =
        aggResult.getNumRows() <= brlen && aggResult.getNumColumns() <= bclen;
    if (fitsSingleBlock) {
      outlist.add(new IndexedMatrixValue(new MatrixIndexes(1, inIx.getColumnIndex()), aggResult));
      return;
    }

    // several row blocks (by definition of the operation only a single column block)
    int numRowBlocks = (int) Math.ceil(aggResult.getNumRows() / (double) brlen);
    for (int b = 0; b < numRowBlocks; b++) {
      int rowStart = b * brlen;
      int rowsInBlock =
          (rowStart + brlen < aggResult.getNumRows())
              ? brlen
              : aggResult.getNumRows() - rowStart;

      // cut the rows of this block out of the aggregate result
      MatrixBlock slice =
          aggResult.sliceOperations(
              rowStart,
              rowStart + rowsInBlock - 1,
              0,
              aggResult.getNumColumns() - 1,
              new MatrixBlock());

      outlist.add(new IndexedMatrixValue(new MatrixIndexes(b + 1, inIx.getColumnIndex()), slice));
    }
  }
    /**
     * Computes the partial aggregate of one input block and places it as a single row inside
     * an otherwise empty output block of the aggregate matrix.
     *
     * @param arg0 input pair of block index and block
     * @return the expanded partial aggregate with its block index
     */
    @Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg0)
        throws Exception {
      final MatrixIndexes inIx = arg0._1();
      final MatrixBlock inBlk = arg0._2();

      final MatrixIndexes outIx = new MatrixIndexes();
      final MatrixBlock aggBlk = new MatrixBlock();

      // partial aggregate of this block, without correction rows/columns
      final AggregateUnaryOperator aggUnaryOp = (AggregateUnaryOperator) _op;
      OperationsOnMatrixValues.performAggregateUnary(
          inIx, inBlk, outIx, aggBlk, aggUnaryOp, _brlen, _bclen);
      if (aggUnaryOp.aggOp.correctionExists) {
        aggBlk.dropLastRowsOrColums(aggUnaryOp.aggOp.correctionLocation);
      }

      // the aggregate matrix has one row per input row block; locate this block's row in it
      final long aggRows = (long) Math.ceil((double) _rlen / _brlen);
      final long aggRowBlk = (long) Math.ceil((double) inIx.getRowIndex() / _brlen);
      final int rowsInAggBlk = (int) Math.min(aggRows - (aggRowBlk - 1) * _brlen, _brlen);
      final int numCols = aggBlk.getNumColumns();
      final int rowInAggBlk = (int) ((inIx.getRowIndex() - 1) % _brlen);

      // copy the aggregate into its row of a dense output block
      final MatrixBlock expanded = new MatrixBlock(rowsInAggBlk, numCols, false);
      expanded.copy(rowInAggBlk, rowInAggBlk, 0, numCols - 1, aggBlk, true);
      outIx.setIndexes(aggRowBlk, outIx.getColumnIndex());

      return new Tuple2<MatrixIndexes, MatrixBlock>(outIx, expanded);
    }
      /**
       * Multiplies one input block with the matching broadcast block while keeping the input
       * block index for the result.
       *
       * @param arg input pair of block index and block
       * @return the input index paired with the product block
       */
      @Override
      protected Tuple2<MatrixIndexes, MatrixBlock> computeNext(
          Tuple2<MatrixIndexes, MatrixBlock> arg) throws Exception {
        final MatrixIndexes inIx = arg._1();
        final MatrixBlock inBlk = arg._2();
        final MatrixBlock product = new MatrixBlock();

        if (_type != CacheType.LEFT) {
          // every type other than LEFT is treated as CacheType.RIGHT:
          // broadcast block is the right-hand operand, selected by the input column index
          final MatrixBlock rhs = _pbc.getMatrixBlock((int) inIx.getColumnIndex(), 1);
          inBlk.aggregateBinaryOperations(inBlk, rhs, product, _op);
        } else {
          // broadcast block is the left-hand operand, selected by the input row index
          final MatrixBlock lhs = _pbc.getMatrixBlock(1, (int) inIx.getRowIndex());
          lhs.aggregateBinaryOperations(lhs, inBlk, product, _op);
        }

        return new Tuple2<MatrixIndexes, MatrixBlock>(inIx, product);
      }
  /**
   * Splits every cached input block into single-row output blocks whose row index is shifted
   * down by one.
   *
   * <p>For the first row block an additional row at index 1 is emitted (filled with the init
   * value if that is non-zero); the very last row of the last row block is not emitted.
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    ArrayList<IndexedMatrixValue> inputBlocks = cachedValues.get(input);
    if (inputBlocks == null) {
      return;
    }

    for (IndexedMatrixValue entry : inputBlocks) {
      if (entry == null) {
        continue;
      }

      MatrixIndexes inIx = entry.getIndexes();
      MatrixBlock inBlk = (MatrixBlock) entry.getValue();
      long rowOffset = (inIx.getRowIndex() - 1) * blockRowFactor;
      boolean isFirstRowBlk = (inIx.getRowIndex() == 1);
      boolean isLastRowBlk = (inIx.getRowIndex() == _lastRowBlockIndex);

      // the first row block additionally contributes the initial offset row
      if (isFirstRowBlk) {
        IndexedMatrixValue initOut = cachedValues.holdPlace(output, valueClass);
        MatrixBlock initBlk = (MatrixBlock) initOut.getValue();
        initBlk.reset(1, inBlk.getNumColumns());
        if (_initValue != 0) {
          for (int c = 0; c < inBlk.getNumColumns(); c++) {
            initBlk.appendValue(0, c, _initValue);
          }
        }
        initOut.getIndexes().setIndexes(1, inIx.getColumnIndex());
      }

      // one output per row, shifted by one; each row is the offset of the following block
      for (int r = 0; r < inBlk.getNumRows(); r++) {
        if (isLastRowBlk && r == (inBlk.getNumRows() - 1)) {
          continue; // the final row has no successor
        }

        IndexedMatrixValue rowOut = cachedValues.holdPlace(output, valueClass);
        MatrixBlock rowBlk = (MatrixBlock) rowOut.getValue();
        rowBlk.reset(1, inBlk.getNumColumns());
        inBlk.sliceOperations(r, r, 0, inBlk.getNumColumns() - 1, rowBlk);
        rowOut.getIndexes().setIndexes(rowOffset + r + 2, inIx.getColumnIndex());
      }
    }
  }
  /**
   * Reads a matrix in binary cell format from all sequence files under {@code path} and
   * appends every cell to {@code dest}.
   *
   * <p>Cell indexes in the files are converted from 1-based to 0-based before they are
   * appended. If {@code dest} is in sparse format, its sparse rows are sorted once all files
   * have been read.
   *
   * @param path location of the matrix; expanded into its individual sequence files
   * @param job job configuration handed to the sequence file reader
   * @param fs file system the files are read from
   * @param dest block that receives the cells
   * @param rlen number of rows of the overall matrix; used only for error reporting
   * @param clen number of columns of the overall matrix; used only for error reporting
   * @param brlen number of rows per block (not used by this method)
   * @param bclen number of columns per block (not used by this method)
   * @throws IOException if reading fails or the last cell read lies outside the matrix range;
   *     the original exception is always attached as the cause
   */
  @SuppressWarnings("deprecation")
  private void readBinaryCellMatrixFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      MatrixBlock dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException {
    boolean sparse = dest.isInSparseFormat();
    MatrixIndexes key = new MatrixIndexes();
    MatrixCell value = new MatrixCell();
    // position of the most recently read cell; declared here so the catch block can report it
    int row = -1;
    int col = -1;

    try {
      for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
      {
        // directly read from sequence files (individual partfiles)
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

        try {
          // sparse and dense targets are filled the same way
          while (reader.next(key, value)) {
            row = (int) key.getRowIndex() - 1;
            col = (int) key.getColumnIndex() - 1;
            dest.appendValue(row, col, value.getValue());
          }
        } finally {
          IOUtilFunctions.closeSilently(reader);
        }
      }

      if (sparse) dest.sortSparseRows();
    } catch (Exception ex) {
      // post-mortem error handling and bounds checking; the cause is kept in both cases
      // because this branch is also taken when the failure happened before any cell was read
      // (row and col still -1), where the range message alone would hide the real error
      if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
        throw new IOException(
            "Matrix cell ["
                + (row + 1)
                + ","
                + (col + 1)
                + "] "
                + "out of overall matrix range [1:"
                + rlen
                + ",1:"
                + clen
                + "].",
            ex);
      } else {
        throw new IOException("Unable to read matrix in binary cell format.", ex);
      }
    }
  }
  /**
   * Applies the quaternary operator of this instruction to every cached block of the first
   * input and stores the result under the output index.
   *
   * <p>Per input block Xij the method gathers the remaining operands (the optional fourth
   * input Wij, and the factors Ui and Vj, each either from the cached values or from the
   * distributed-cache values), invokes the quaternary operation on Xij, and finally sets the
   * output block index depending on which weights type of the operator is set.
   *
   * @param valueClass class used when reserving a new output slot in the cache
   * @param cachedValues cache holding the input blocks and receiving the outputs
   * @param tempValue temporary value used as output when output and first input share an index
   * @param zeroInput not used by this method
   * @param blockRowFactor not used by this method
   * @param blockColFactor not used by this method
   * @throws DMLRuntimeException if the instruction cannot be processed
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    QuaternaryOperator qop = (QuaternaryOperator) optr;

    ArrayList<IndexedMatrixValue> blkList = cachedValues.get(_input1);
    if (blkList != null)
      for (IndexedMatrixValue imv : blkList) {
        // Step 1: prepare inputs and output
        if (imv == null) continue;
        MatrixIndexes inIx = imv.getIndexes();
        MatrixValue inVal = imv.getValue();

        // allocate space for the output value; when output and first input share the same
        // index the result goes to the temp value and is added to the cache at the end
        IndexedMatrixValue iout = null;
        if (output == _input1) iout = tempValue;
        else iout = cachedValues.holdPlace(output, valueClass);

        MatrixIndexes outIx = iout.getIndexes();
        MatrixValue outVal = iout.getValue();

        // Step 2: get remaining inputs: Wij, Ui, Vj
        MatrixValue Xij = inVal;

        // get Wij if existing (null of WeightsType.NONE or WSigmoid any type)
        IndexedMatrixValue iWij = (_input4 != -1) ? cachedValues.getFirst(_input4) : null;
        MatrixValue Wij = (iWij != null) ? iWij.getValue() : null;
        // no cached fourth input although the operator expects four inputs: build a dense
        // 1x1 block from the value found in part 4 of the instruction string
        if (null == Wij && qop.hasFourInputs()) {
          MatrixBlock mb = new MatrixBlock(1, 1, false);
          String[] parts = InstructionUtils.getInstructionParts(instString);
          mb.quickSetValue(0, 0, Double.valueOf(parts[4]));
          Wij = mb;
        }

        // get Ui and Vj, potentially through distributed cache; the distributed-cache block
        // is selected by the row index (Ui) or the column index (Vj) of the input block
        MatrixValue Ui =
            (!_cacheU)
                ? cachedValues.getFirst(_input2).getValue() // U
                : MRBaseForCommonInstructions.dcValues
                    .get(_input2)
                    .getDataBlock((int) inIx.getRowIndex(), 1)
                    .getValue();
        MatrixValue Vj =
            (!_cacheV)
                ? cachedValues.getFirst(_input3).getValue() // t(V)
                : MRBaseForCommonInstructions.dcValues
                    .get(_input3)
                    .getDataBlock((int) inIx.getColumnIndex(), 1)
                    .getValue();
        // handle special input case: //V through shuffle -> t(V)
        // (column counts of Ui and Vj differ, so Vj is reorganized with swapped indexes into
        // a block with exchanged row/column dimensions)
        if (Ui.getNumColumns() != Vj.getNumColumns()) {
          Vj =
              LibMatrixReorg.reorg(
                  (MatrixBlock) Vj,
                  new MatrixBlock(Vj.getNumColumns(), Vj.getNumRows(), Vj.isInSparseFormat()),
                  new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
        }

        // Step 3: process instruction
        Xij.quaternaryOperations(qop, Ui, Vj, Wij, outVal);

        // set output indexes

        if (qop.wtype1 != null || qop.wtype4 != null) outIx.setIndexes(1, 1); // wsloss
        else if (qop.wtype2 != null
            || qop.wtype5 != null
            || qop.wtype3 != null && qop.wtype3.isBasic())
          outIx.setIndexes(inIx); // wsigmoid/wdivmm-basic
        else { // wdivmm
          // left variant is keyed by the input column index, otherwise by the input row index
          boolean left = qop.wtype3.isLeft();
          outIx.setIndexes(left ? inIx.getColumnIndex() : inIx.getRowIndex(), 1);
        }

        // put the output value in the cache
        if (iout == tempValue) cachedValues.add(output, iout);
      }
  }
 @Override
 public int compare(MatrixIndexes m1, MatrixIndexes m2) {
   // the ordering is exactly the one defined by MatrixIndexes.compareTo
   final int ordering = m1.compareTo(m2);
   return ordering;
 }
  /**
   * Shifts one input block to the position given by the start of {@code ixrange} and appends
   * the resulting target block(s) to {@code outlist}.
   *
   * <p>The shifted block may straddle several target blocks. For every affected target block
   * the overlapping part is sliced out of the input block and written into a new dense block
   * of the target block's size.
   *
   * @param in indexed input block to shift
   * @param ixrange index range; only {@code rowStart} and {@code colStart} are read here
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @param rlen number of rows of the target matrix (used to size the target blocks)
   * @param clen number of columns of the target matrix (used to size the target blocks)
   * @param outlist list the produced target blocks are appended to
   * @throws DMLRuntimeException if slicing or left indexing fails
   */
  public static void performShift(
      IndexedMatrixValue in,
      IndexRange ixrange,
      int brlen,
      int bclen,
      long rlen,
      long clen,
      ArrayList<IndexedMatrixValue> outlist)
      throws DMLRuntimeException {
    MatrixIndexes ix = in.getIndexes();
    MatrixBlock mb = (MatrixBlock) in.getValue();

    // global cell range that the input block occupies in the target matrix after the shift
    long start_lhs_globalRowIndex = ixrange.rowStart + (ix.getRowIndex() - 1) * brlen;
    long start_lhs_globalColIndex = ixrange.colStart + (ix.getColumnIndex() - 1) * bclen;
    long end_lhs_globalRowIndex = start_lhs_globalRowIndex + mb.getNumRows() - 1;
    long end_lhs_globalColIndex = start_lhs_globalColIndex + mb.getNumColumns() - 1;

    // range of target block indexes touched by that cell range
    // (presumably computeBlockIndex maps a 1-based global index to its 1-based block index;
    // verify against UtilFunctions)
    long start_lhs_rowIndex = UtilFunctions.computeBlockIndex(start_lhs_globalRowIndex, brlen);
    long end_lhs_rowIndex = UtilFunctions.computeBlockIndex(end_lhs_globalRowIndex, brlen);
    long start_lhs_colIndex = UtilFunctions.computeBlockIndex(start_lhs_globalColIndex, bclen);
    long end_lhs_colIndex = UtilFunctions.computeBlockIndex(end_lhs_globalColIndex, bclen);

    for (long leftRowIndex = start_lhs_rowIndex; leftRowIndex <= end_lhs_rowIndex; leftRowIndex++) {
      for (long leftColIndex = start_lhs_colIndex;
          leftColIndex <= end_lhs_colIndex;
          leftColIndex++) {

        // global cell range of the current target block, clipped to the shifted input range
        long lhs_rl = Math.max((leftRowIndex - 1) * brlen + 1, start_lhs_globalRowIndex);
        long lhs_ru = Math.min(leftRowIndex * brlen, end_lhs_globalRowIndex);
        long lhs_cl = Math.max((leftColIndex - 1) * bclen + 1, start_lhs_globalColIndex);
        long lhs_cu = Math.min(leftColIndex * bclen, end_lhs_globalColIndex);

        // the same range as positions inside the target block
        // (presumably 0-based cell offsets; verify against UtilFunctions.computeCellInBlock)
        int lhs_lrl = UtilFunctions.computeCellInBlock(lhs_rl, brlen);
        int lhs_lru = UtilFunctions.computeCellInBlock(lhs_ru, brlen);
        int lhs_lcl = UtilFunctions.computeCellInBlock(lhs_cl, bclen);
        int lhs_lcu = UtilFunctions.computeCellInBlock(lhs_cu, bclen);

        // undo the shift to obtain the matching source range (same extent as the target range)
        long rhs_rl = lhs_rl - ixrange.rowStart + 1;
        long rhs_ru = rhs_rl + (lhs_ru - lhs_rl);
        long rhs_cl = lhs_cl - ixrange.colStart + 1;
        long rhs_cu = rhs_cl + (lhs_cu - lhs_cl);

        // the source range as positions inside the input block
        int rhs_lrl = UtilFunctions.computeCellInBlock(rhs_rl, brlen);
        int rhs_lru = UtilFunctions.computeCellInBlock(rhs_ru, brlen);
        int rhs_lcl = UtilFunctions.computeCellInBlock(rhs_cl, bclen);
        int rhs_lcu = UtilFunctions.computeCellInBlock(rhs_cu, bclen);

        // cut the overlapping part out of the input block
        MatrixBlock slicedRHSBlk =
            mb.sliceOperations(rhs_lrl, rhs_lru, rhs_lcl, rhs_lcu, new MatrixBlock());

        // write the slice into a new dense block sized like the target block
        int lbrlen = UtilFunctions.computeBlockSize(rlen, leftRowIndex, brlen);
        int lbclen = UtilFunctions.computeBlockSize(clen, leftColIndex, bclen);
        MatrixBlock resultBlock = new MatrixBlock(lbrlen, lbclen, false);
        resultBlock =
            resultBlock.leftIndexingOperations(
                slicedRHSBlk, lhs_lrl, lhs_lru, lhs_lcl, lhs_lcu, null, UpdateType.COPY);
        outlist.add(
            new IndexedMatrixValue(new MatrixIndexes(leftRowIndex, leftColIndex), resultBlock));
      }
    }
  }
  /**
   * Writes all buffered ctable results to the output collector and then clears the buffer.
   *
   * <p>Exactly one of the two buffers is used: with a map buffer every entry is emitted as an
   * individual cell; with a block buffer every matrix block is emitted either as a single block
   * or, if it exceeds the block size, split into sub-blocks. Non-zero counts (and, for the map
   * buffer, maximum dimensions of results with unknown dimensions) are maintained along the way.
   *
   * @param reporter reporter handed through to the output collector
   * @throws RuntimeException if both buffers are missing or collecting the output fails; the
   *     original exception is attached as the cause
   */
  @SuppressWarnings("deprecation")
  public void flushBuffer(Reporter reporter) throws RuntimeException {
    try {
      if (_mapBuffer != null) {
        MatrixIndexes key;
        MatrixCell value = new MatrixCell(); // reused for every emitted cell
        for (Entry<Byte, CTableMap> ctable : _mapBuffer.entrySet()) {
          ArrayList<Integer> resultIDs =
              ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
          CTableMap resultMap = ctable.getValue();

          // maintain result dims and nonzeros
          for (Integer i : resultIDs) {
            _resultNonZeros[i] += resultMap.size();
            if (_resultDimsUnknown[i] == (byte) 1) {
              _resultMaxRowDims[i] = Math.max(resultMap.getMaxRow(), _resultMaxRowDims[i]);
              _resultMaxColDims[i] = Math.max(resultMap.getMaxColumn(), _resultMaxColDims[i]);
            }
          }

          // output result data
          for (LLDoubleEntry e : resultMap.entrySet()) {
            key = new MatrixIndexes(e.key1, e.key2);
            value.setValue(e.value);
            for (Integer i : resultIDs) {
              _collector.collectOutput(key, value, i, reporter);
            }
          }
        }
      } else if (_blockBuffer != null) {
        MatrixIndexes key = new MatrixIndexes(1, 1);
        for (Entry<Byte, MatrixBlock> ctable : _blockBuffer.entrySet()) {
          ArrayList<Integer> resultIDs =
              ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
          MatrixBlock outBlock = ctable.getValue();
          outBlock.recomputeNonZeros();

          // TODO: change hard coding of 1000
          int brlen = 1000, bclen = 1000;
          int rlen = outBlock.getNumRows();
          int clen = outBlock.getNumColumns();

          // final output matrix is smaller than a single block
          // (columns are compared against the column block size, not the row block size)
          if (rlen <= brlen && clen <= bclen) {
            key = new MatrixIndexes(1, 1);
            for (Integer i : resultIDs) {
              _collector.collectOutput(key, outBlock, i, reporter);
              _resultNonZeros[i] += outBlock.getNonZeros();
            }
          } else {
            // initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks =
                MatrixWriter.createMatrixBlocksForReuse(
                    rlen, clen, brlen, bclen, true, outBlock.getNonZeros());

            // create and write subblocks of matrix
            for (int blockRow = 0; blockRow < (int) Math.ceil(rlen / (double) brlen); blockRow++) {
              for (int blockCol = 0;
                  blockCol < (int) Math.ceil(clen / (double) bclen);
                  blockCol++) {
                // boundary blocks may be smaller than the regular block size
                int maxRow = (blockRow * brlen + brlen < rlen) ? brlen : rlen - blockRow * brlen;
                int maxCol = (blockCol * bclen + bclen < clen) ? bclen : clen - blockCol * bclen;

                int row_offset = blockRow * brlen;
                int col_offset = blockCol * bclen;

                // get reuse matrix block
                MatrixBlock block =
                    MatrixWriter.getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                // copy submatrix to block
                outBlock.sliceOperations(
                    row_offset,
                    row_offset + maxRow - 1,
                    col_offset,
                    col_offset + maxCol - 1,
                    block);

                // TODO: skip empty "block"

                // append block to sequence file
                key.setIndexes(blockRow + 1, blockCol + 1);
                for (Integer i : resultIDs) {
                  _collector.collectOutput(key, block, i, reporter);
                  _resultNonZeros[i] += block.getNonZeros();
                }

                // reset block for later reuse
                block.reset();
              }
            }
          }
        }
      } else {
        throw new DMLRuntimeException("Unexpected.. both ctable buffers are empty.");
      }
    } catch (Exception ex) {
      throw new RuntimeException("Failed to flush ctable buffer.", ex);
    }
    // remove existing partial ctables; one of the two buffers is non-null at this point,
    // otherwise the exception above would already have been thrown
    if (_mapBuffer != null) _mapBuffer.clear();
    else _blockBuffer.clear();
  }