@Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg0)
        throws Exception {
      // Overview: runs the configured unary aggregate on one input block, strips the
      // correction rows/columns if the aggregate carries any, and then places the
      // aggregated values at a single row position of a freshly allocated output
      // block whose row-block index is derived from the input block's row index.
      final MatrixIndexes srcIx = arg0._1();
      final MatrixBlock srcBlk = arg0._2();

      // containers filled by the aggregate call below
      final MatrixIndexes aggIx = new MatrixIndexes();
      final MatrixBlock aggBlk = new MatrixBlock();

      // execute the aggregate; the operator is cast once and reused
      final AggregateUnaryOperator aggUnaryOp = (AggregateUnaryOperator) _op;
      OperationsOnMatrixValues.performAggregateUnary(
          srcIx, srcBlk, aggIx, aggBlk, aggUnaryOp, _brlen, _bclen);
      if (aggUnaryOp.aggOp.correctionExists) {
        aggBlk.dropLastRowsOrColums(aggUnaryOp.aggOp.correctionLocation);
      }

      // expand partial aggregates: the overall output has ceil(_rlen/_brlen) rows,
      // and this input block maps to output row-block ceil(rowIndex/_brlen)
      final long srcRowIx = srcIx.getRowIndex();
      final long totalOutRows = (long) Math.ceil((double) _rlen / _brlen);
      final long targetRowIx = (long) Math.ceil((double) srcRowIx / _brlen);

      // the target block holds at most _brlen rows; the last one may be shorter
      final int targetRows = (int) Math.min(totalOutRows - (targetRowIx - 1) * _brlen, _brlen);
      final int targetCols = aggBlk.getNumColumns();

      // row offset of this partial aggregate inside the target block
      final int rowInTarget = (int) ((srcRowIx - 1) % _brlen);

      // NOTE(review): the 'false' flag presumably requests a dense block and the
      // copy arguments presumably are (rowLow, rowUp, colLow, colUp, src, flag) --
      // confirm against MatrixBlock.
      final MatrixBlock expanded = new MatrixBlock(targetRows, targetCols, false);
      expanded.copy(rowInTarget, rowInTarget, 0, targetCols - 1, aggBlk, true);

      // keep the column index produced by the aggregate, replace only the row index
      aggIx.setIndexes(targetRowIx, aggIx.getColumnIndex());

      // emit the re-keyed tuple
      return new Tuple2<MatrixIndexes, MatrixBlock>(aggIx, expanded);
    }
  /**
   * Adds the offset vector (second input) to the first row of the data block (first input)
   * using {@code _bop}, then applies {@code _uop} to the adjusted data and stores the result
   * in the block reserved for {@code output}. The output indexes are set to those of the
   * data input.
   *
   * <p>The parameters {@code tempValue}, {@code zeroInput}, {@code blockRowFactor} and
   * {@code blockColFactor} are not read by this implementation.
   *
   * @param valueClass value class used when reserving the output entry
   * @param cachedValues cache from which both inputs are read and in which the output is held
   * @param tempValue unused here
   * @param zeroInput unused here
   * @param blockRowFactor unused here
   * @param blockColFactor unused here
   * @throws DMLRuntimeException if either input is absent from {@code cachedValues}
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    final IndexedMatrixValue dataEntry = cachedValues.getFirst(input1); // original data
    final IndexedMatrixValue offsetEntry = cachedValues.getFirst(input2); // offset row vector

    // guard: both inputs are required
    if (dataEntry == null || offsetEntry == null) {
      final Object leftIx = (dataEntry == null) ? "null" : dataEntry.getIndexes();
      final Object rightIx = (offsetEntry == null) ? "null" : offsetEntry.getIndexes();
      throw new DMLRuntimeException(
          "Unexpected empty input (left=" + leftIx + ", right=" + rightIx + ").");
    }

    // reserve the output entry first, then unwrap the blocks
    final IndexedMatrixValue outEntry = cachedValues.holdPlace(output, valueClass);
    final MatrixBlock dataBlk = (MatrixBlock) dataEntry.getValue();
    final MatrixBlock offsetBlk = (MatrixBlock) offsetEntry.getValue();
    final MatrixBlock resultBlk = (MatrixBlock) outEntry.getValue();
    resultBlk.reset(dataBlk.getNumRows(), dataBlk.getNumColumns());

    // work on a copy so the cached input block is not modified
    final MatrixBlock work = new MatrixBlock(dataBlk);
    final int lastCol = work.getNumColumns() - 1;

    // extract the first row, fold the offset into it, and write it back
    // NOTE(review): the slice call was annotated "1-based" but receives the same
    // 0..0 / 0..lastCol range as the copy call annotated "0-based" -- confirm.
    final MatrixBlock firstRow = work.sliceOperations(0, 0, 0, lastCol, new MatrixBlock());
    firstRow.binaryOperationsInPlace(_bop, offsetBlk); // apply offset to first row
    work.copy(0, 0, 0, lastCol, firstRow, true);

    // columnwise prefix sums/prod/min/max over the offset-adjusted data
    work.unaryOperations(_uop, resultBlk);

    // output carries the indexes of the data input
    outEntry.getIndexes().setIndexes(dataEntry.getIndexes());
  }