/**
   * Appends the first cached block of {@code input2} to the first cached block of {@code input1}
   * and registers the result in the cache under {@code output}.
   *
   * <p>The result is produced by {@code MatrixBlock.appendOperations}, to which the {@code _cbind}
   * flag is passed through, and it carries the indexes of the left input.
   *
   * @param valueClass not used by this instruction
   * @param cachedValues cache from which both inputs are read and to which the result is added
   * @param tempValue not used by this instruction
   * @param zeroInput not used by this instruction
   * @param brlen not used by this instruction
   * @param bclen not used by this instruction
   * @throws DMLRuntimeException if either of the two input blocks is absent from the cache
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int brlen,
      int bclen)
      throws DMLUnsupportedOperationException, DMLRuntimeException {
    // fetch the two operands of the append
    final IndexedMatrixValue lhs = cachedValues.getFirst(input1);
    final IndexedMatrixValue rhs = cachedValues.getFirst(input2);

    // both operands are mandatory; the message reports which one is absent
    final boolean lhsMissing = (lhs == null);
    final boolean rhsMissing = (rhs == null);
    if (lhsMissing || rhsMissing) {
      throw new DMLRuntimeException(
          "Missing append input: isNull(left): " + lhsMissing + ", isNull(right): " + rhsMissing);
    }

    // append right to left into a freshly allocated block
    final MatrixBlock lhsBlock = (MatrixBlock) lhs.getValue();
    final MatrixBlock rhsBlock = (MatrixBlock) rhs.getValue();
    final MatrixBlock appended = lhsBlock.appendOperations(rhsBlock, new MatrixBlock(), _cbind);

    // the result is registered under the indexes of the left operand
    cachedValues.add(output, new IndexedMatrixValue(lhs.getIndexes(), appended));
  }
  /**
   * Splits every cached block of {@code input} into single-row output blocks whose row index is
   * shifted down by one.
   *
   * <p>Row {@code i} of a block with row-block index {@code r} is emitted at row index {@code (r -
   * 1) * blockRowFactor + i + 2}. The block with row-block index 1 additionally emits a row at
   * index 1 that is filled with {@code _initValue} (left empty when that value is 0). The last row
   * of the block whose row-block index equals {@code _lastRowBlockIndex} is not emitted.
   *
   * @param valueClass value class used when reserving output slots in the cache
   * @param cachedValues cache holding the input blocks and receiving the output blocks
   * @param tempValue not used by this instruction
   * @param zeroInput not used by this instruction
   * @param blockRowFactor number of rows per block, used to compute the global row offset
   * @param blockColFactor not used by this instruction
   * @throws DMLRuntimeException declared by the instruction contract
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    final ArrayList<IndexedMatrixValue> inputBlocks = cachedValues.get(input);
    if (inputBlocks == null) {
      return;
    }

    for (IndexedMatrixValue entry : inputBlocks) {
      if (entry == null) {
        continue;
      }

      final MatrixIndexes srcIx = entry.getIndexes();
      final MatrixBlock src = (MatrixBlock) entry.getValue();
      final long rowBlockIx = srcIx.getRowIndex();
      final long baseRow = (rowBlockIx - 1) * blockRowFactor;
      final int numRows = src.getNumRows();
      final int numCols = src.getNumColumns();

      // the very first row block contributes the initial offset row at row index 1
      if (rowBlockIx == 1) {
        final IndexedMatrixValue initOut = cachedValues.holdPlace(output, valueClass);
        final MatrixBlock initRow = (MatrixBlock) initOut.getValue();
        initRow.reset(1, numCols);
        if (_initValue != 0) {
          for (int col = 0; col < numCols; col++) {
            initRow.appendValue(0, col, _initValue);
          }
        }
        initOut.getIndexes().setIndexes(1, srcIx.getColumnIndex());
      }

      // the last row of the last row block has no successor and is therefore skipped
      final boolean isLastRowBlock = (rowBlockIx == _lastRowBlockIndex);
      final int rowsToEmit = isLastRowBlock ? numRows - 1 : numRows;

      // emit each remaining row as its own 1 x numCols block, shifted down by one row
      for (int row = 0; row < rowsToEmit; row++) {
        final IndexedMatrixValue rowOut = cachedValues.holdPlace(output, valueClass);
        final MatrixBlock rowBlock = (MatrixBlock) rowOut.getValue();
        rowBlock.reset(1, numCols);
        src.sliceOperations(row, row, 0, numCols - 1, rowBlock);
        rowOut.getIndexes().setIndexes(baseRow + row + 2, srcIx.getColumnIndex());
      }
    }
  }
  /**
   * Runs a grouped aggregate for one target block and appends the result to {@code outlist},
   * split into row blocks of at most {@code brlen} rows.
   *
   * <p>If the aggregate fits into a single {@code brlen x bclen} block it is added as-is with row
   * index 1; otherwise it is sliced row-wise and the pieces are added with row indexes 1, 2, ...
   * All output blocks reuse the column index of {@code inTarget}.
   *
   * @param op operator handed to {@code groupedAggOperations}
   * @param inTarget indexed target block to aggregate
   * @param groups block on which {@code groupedAggOperations} is invoked
   * @param ngroups number of groups handed to {@code groupedAggOperations}
   * @param brlen maximum number of rows per output block
   * @param bclen maximum number of columns for the single-output-block case
   * @param outlist list receiving the output blocks
   * @throws DMLRuntimeException if the aggregate or slice operations fail
   */
  public static void performMapGroupedAggregate(
      Operator op,
      IndexedMatrixValue inTarget,
      MatrixBlock groups,
      int ngroups,
      int brlen,
      int bclen,
      ArrayList<IndexedMatrixValue> outlist)
      throws DMLRuntimeException {
    final MatrixIndexes ix = inTarget.getIndexes();
    final MatrixBlock target = (MatrixBlock) inTarget.getValue();

    // compute the full aggregate into a fresh block
    final MatrixBlock agg =
        groups.groupedAggOperations(target, null, new MatrixBlock(), ngroups, op);
    final int aggRows = agg.getNumRows();
    final int aggCols = agg.getNumColumns();

    // small result: a single output block is sufficient
    if (aggRows <= brlen && aggCols <= bclen) {
      outlist.add(new IndexedMatrixValue(new MatrixIndexes(1, ix.getColumnIndex()), agg));
      return;
    }

    // large result: cut into row blocks (by op def, single column block)
    final int numRowBlocks = (int) Math.ceil(aggRows / (double) brlen);
    for (int b = 0; b < numRowBlocks; b++) {
      final int firstRow = b * brlen;
      // full block, except for a possibly shorter trailing block
      final int rowsHere = Math.min(brlen, aggRows - firstRow);

      final MatrixBlock piece =
          agg.sliceOperations(
              firstRow, firstRow + rowsHere - 1, 0, aggCols - 1, new MatrixBlock());

      outlist.add(new IndexedMatrixValue(new MatrixIndexes(b + 1, ix.getColumnIndex()), piece));
    }
  }
  /**
   * Combines the first row of the data block ({@code input1}) with the offset row vector ({@code
   * input2}) via {@code _bop}, then applies the unary operator {@code _uop} to the adjusted data
   * and stores the result under {@code output} with the indexes of the data block.
   *
   * <p>The cached data block itself is left untouched; all modifications happen on a copy.
   *
   * @param valueClass value class used when reserving the output slot in the cache
   * @param cachedValues cache holding both inputs and receiving the output
   * @param tempValue not used by this instruction
   * @param zeroInput not used by this instruction
   * @param blockRowFactor not used by this instruction
   * @param blockColFactor not used by this instruction
   * @throws DMLRuntimeException if either input block is absent from the cache
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    final IndexedMatrixValue dataEntry = cachedValues.getFirst(input1); // original data
    final IndexedMatrixValue offsetEntry = cachedValues.getFirst(input2); // offset row vector

    if (dataEntry == null || offsetEntry == null) {
      final Object leftDesc = (dataEntry == null) ? "null" : dataEntry.getIndexes();
      final Object rightDesc = (offsetEntry == null) ? "null" : offsetEntry.getIndexes();
      throw new DMLRuntimeException(
          "Unexpected empty input (left=" + leftDesc + ", right=" + rightDesc + ").");
    }

    // reserve the output slot and size it like the data block
    final IndexedMatrixValue result = cachedValues.holdPlace(output, valueClass);
    final MatrixBlock data = (MatrixBlock) dataEntry.getValue();
    final MatrixBlock offset = (MatrixBlock) offsetEntry.getValue();
    final MatrixBlock resultBlock = (MatrixBlock) result.getValue();
    resultBlock.reset(data.getNumRows(), data.getNumColumns());

    // work on a copy so that the cached input stays unmodified
    final MatrixBlock work = new MatrixBlock(data);
    final int lastCol = work.getNumColumns() - 1;

    // fold the offset into the first row, then write that row back into the copy
    final MatrixBlock firstRow = work.sliceOperations(0, 0, 0, lastCol, new MatrixBlock());
    firstRow.binaryOperationsInPlace(_bop, offset);
    work.copy(0, 0, 0, lastCol, firstRow, true);

    // columnwise prefix sums/prod/min/max over the adjusted data
    work.unaryOperations(_uop, resultBlock);

    // the output inherits the position of the data block
    result.getIndexes().setIndexes(dataEntry.getIndexes());
  }
  /**
   * Applies the unary aggregate operator {@code optr} to every cached block of {@code input}.
   *
   * <p>When the operator's index function is a {@code ReduceDiag} and the block is not on the
   * diagonal (row index differs from column index), the output value is only reset. Otherwise the
   * aggregate is computed via {@code OperationsOnMatrixValues.performAggregateUnary} and, if
   * {@code _dropCorr} is set, the correction rows/columns are dropped from the result.
   *
   * @param valueClass value class used when reserving output slots in the cache
   * @param cachedValues cache holding the input blocks and receiving the output blocks
   * @param tempValue scratch value used as output when input and output share the same index
   * @param zeroInput not used by this instruction
   * @param blockRowFactor block row size handed to the aggregate
   * @param blockColFactor block column size handed to the aggregate
   * @throws DMLUnsupportedOperationException declared by the instruction contract
   * @throws DMLRuntimeException if the aggregate operation fails
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLUnsupportedOperationException, DMLRuntimeException {

    final ArrayList<IndexedMatrixValue> inputBlocks = cachedValues.get(input);
    if (inputBlocks == null) {
      return;
    }

    // in-place case: results go through the scratch value and are added to the cache afterwards
    final boolean sameSlot = (input == output);

    for (IndexedMatrixValue entry : inputBlocks) {
      if (entry == null) {
        continue;
      }

      // pick the output holder
      final IndexedMatrixValue result =
          sameSlot ? tempValue : cachedValues.holdPlace(output, valueClass);

      final MatrixIndexes srcIx = entry.getIndexes();
      final AggregateUnaryOperator auop = (AggregateUnaryOperator) optr;

      // for trace, blocks off the diagonal contribute nothing
      final boolean offDiagonalForTrace =
          auop.indexFn instanceof ReduceDiag && srcIx.getColumnIndex() != srcIx.getRowIndex();

      if (offDiagonalForTrace) {
        // nothing to compute, but the output must not carry stale content
        result.getValue().reset();
      } else {
        OperationsOnMatrixValues.performAggregateUnary(
            srcIx,
            entry.getValue(),
            result.getIndexes(),
            result.getValue(),
            auop,
            blockRowFactor,
            blockColFactor);
        if (_dropCorr) {
          ((MatrixBlock) result.getValue()).dropLastRowsOrColums(auop.aggOp.correctionLocation);
        }
      }

      // scratch results still need to be handed to the cache
      if (result == tempValue) {
        cachedValues.add(output, result);
      }
    }
  }
  /**
   * Determines which part of the given block lies inside {@code indexRange}, expressed in
   * in-block coordinates. Reused by both MR and Spark for performing zero out.
   *
   * <p>If the block does not overlap the block span of {@code indexRange}, a range of {@code (-1,
   * -1, -1, -1)} is returned. Otherwise the range starts out as the full block and is narrowed on
   * each side where the block is a boundary block of the requested range.
   *
   * @param in block (with indexes) to test against the range
   * @param blockRowFactor number of rows per block
   * @param blockColFactor number of columns per block
   * @param indexRange requested range in global cell coordinates
   * @return the selected in-block range, or {@code (-1, -1, -1, -1)} if there is no overlap
   */
  public static IndexRange getSelectedRangeForZeroOut(
      IndexedMatrixValue in, int blockRowFactor, int blockColFactor, IndexRange indexRange) {
    // starts out as the "no overlap" sentinel
    final IndexRange selected = new IndexRange(-1, -1, -1, -1);

    // block index and in-block position of each of the four range borders
    final long firstRowBlock = UtilFunctions.computeBlockIndex(indexRange.rowStart, blockRowFactor);
    final int firstRowInBlock =
        UtilFunctions.computeCellInBlock(indexRange.rowStart, blockRowFactor);
    final long lastRowBlock = UtilFunctions.computeBlockIndex(indexRange.rowEnd, blockRowFactor);
    final int lastRowInBlock = UtilFunctions.computeCellInBlock(indexRange.rowEnd, blockRowFactor);

    final long firstColBlock = UtilFunctions.computeBlockIndex(indexRange.colStart, blockColFactor);
    final int firstColInBlock =
        UtilFunctions.computeCellInBlock(indexRange.colStart, blockColFactor);
    final long lastColBlock = UtilFunctions.computeBlockIndex(indexRange.colEnd, blockColFactor);
    final int lastColInBlock = UtilFunctions.computeCellInBlock(indexRange.colEnd, blockColFactor);

    final long blockRow = in.getIndexes().getRowIndex();
    final long blockCol = in.getIndexes().getColumnIndex();

    final boolean overlaps =
        blockRow >= firstRowBlock
            && blockRow <= lastRowBlock
            && blockCol >= firstColBlock
            && blockCol <= lastColBlock;
    if (!overlaps) {
      return selected;
    }

    // begin with the whole block, then clip on every side that is a border of the range
    selected.set(0, in.getValue().getNumRows() - 1, 0, in.getValue().getNumColumns() - 1);
    if (blockRow == firstRowBlock) {
      selected.rowStart = firstRowInBlock;
    }
    if (blockRow == lastRowBlock) {
      selected.rowEnd = lastRowInBlock;
    }
    if (blockCol == firstColBlock) {
      selected.colStart = firstColInBlock;
    }
    if (blockCol == lastColBlock) {
      selected.colEnd = lastColInBlock;
    }

    return selected;
  }
  /**
   * Shifts one input block by the offset given in {@code ixrange} and emits, for every output
   * block the shifted data touches, a block that contains the corresponding piece of the input
   * placed at its new position.
   *
   * <p>Each emitted block is allocated with the size reported by {@code
   * UtilFunctions.computeBlockSize} for its position and is filled via {@code
   * leftIndexingOperations}; cells not covered by the shifted piece are not written here.
   *
   * @param in input block together with its block indexes
   * @param ixrange range whose {@code rowStart}/{@code colStart} give the target position of the
   *     shifted data
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @param rlen number of rows handed to {@code computeBlockSize} for sizing output blocks
   * @param clen number of columns handed to {@code computeBlockSize} for sizing output blocks
   * @param outlist list receiving the output blocks
   * @throws DMLRuntimeException if a slice or left-indexing operation fails
   */
  public static void performShift(
      IndexedMatrixValue in,
      IndexRange ixrange,
      int brlen,
      int bclen,
      long rlen,
      long clen,
      ArrayList<IndexedMatrixValue> outlist)
      throws DMLRuntimeException {
    final MatrixIndexes srcIx = in.getIndexes();
    final MatrixBlock src = (MatrixBlock) in.getValue();

    // global cell extent of the input block after the shift
    final long globalRowLo = ixrange.rowStart + (srcIx.getRowIndex() - 1) * brlen;
    final long globalColLo = ixrange.colStart + (srcIx.getColumnIndex() - 1) * bclen;
    final long globalRowHi = globalRowLo + src.getNumRows() - 1;
    final long globalColHi = globalColLo + src.getNumColumns() - 1;

    // span of output blocks touched by that extent
    final long firstBlockRow = UtilFunctions.computeBlockIndex(globalRowLo, brlen);
    final long lastBlockRow = UtilFunctions.computeBlockIndex(globalRowHi, brlen);
    final long firstBlockCol = UtilFunctions.computeBlockIndex(globalColLo, bclen);
    final long lastBlockCol = UtilFunctions.computeBlockIndex(globalColHi, bclen);

    for (long blockRow = firstBlockRow; blockRow <= lastBlockRow; blockRow++) {
      // row extent inside this output block row (independent of the column loop)
      final long dstRowLo = Math.max((blockRow - 1) * brlen + 1, globalRowLo);
      final long dstRowHi = Math.min(blockRow * brlen, globalRowHi);
      final int dstLocalRowLo = UtilFunctions.computeCellInBlock(dstRowLo, brlen);
      final int dstLocalRowHi = UtilFunctions.computeCellInBlock(dstRowHi, brlen);

      // matching rows of the input block
      final long srcRowLo = dstRowLo - ixrange.rowStart + 1;
      final long srcRowHi = srcRowLo + (dstRowHi - dstRowLo);
      final int srcLocalRowLo = UtilFunctions.computeCellInBlock(srcRowLo, brlen);
      final int srcLocalRowHi = UtilFunctions.computeCellInBlock(srcRowHi, brlen);

      final int dstBlockRows = UtilFunctions.computeBlockSize(rlen, blockRow, brlen);

      for (long blockCol = firstBlockCol; blockCol <= lastBlockCol; blockCol++) {
        // column extent inside this output block column
        final long dstColLo = Math.max((blockCol - 1) * bclen + 1, globalColLo);
        final long dstColHi = Math.min(blockCol * bclen, globalColHi);
        final int dstLocalColLo = UtilFunctions.computeCellInBlock(dstColLo, bclen);
        final int dstLocalColHi = UtilFunctions.computeCellInBlock(dstColHi, bclen);

        // matching columns of the input block
        final long srcColLo = dstColLo - ixrange.colStart + 1;
        final long srcColHi = srcColLo + (dstColHi - dstColLo);
        final int srcLocalColLo = UtilFunctions.computeCellInBlock(srcColLo, bclen);
        final int srcLocalColHi = UtilFunctions.computeCellInBlock(srcColHi, bclen);

        // cut the piece out of the input ...
        final MatrixBlock piece =
            src.sliceOperations(
                srcLocalRowLo, srcLocalRowHi, srcLocalColLo, srcLocalColHi, new MatrixBlock());

        // ... and place it into a fresh block of the proper size
        final int dstBlockCols = UtilFunctions.computeBlockSize(clen, blockCol, bclen);
        MatrixBlock shifted = new MatrixBlock(dstBlockRows, dstBlockCols, false);
        shifted =
            shifted.leftIndexingOperations(
                piece,
                dstLocalRowLo,
                dstLocalRowHi,
                dstLocalColLo,
                dstLocalColHi,
                null,
                UpdateType.COPY);

        outlist.add(new IndexedMatrixValue(new MatrixIndexes(blockRow, blockCol), shifted));
      }
    }
  }
  /**
   * Slices the part of one input block that falls inside {@code ixrange} and distributes it over
   * the output blocks it maps to.
   *
   * <p>If the block does not intersect the range, nothing is added to {@code outlist}. Otherwise
   * one empty output block is appended to {@code outlist} for every result block index touched,
   * and the multi-output {@code sliceOperations} of the input value is invoked to fill them.
   *
   * @param in input block together with its block indexes
   * @param ixrange requested range in global cell coordinates
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @param outlist list receiving the output blocks
   * @throws DMLRuntimeException if the slice operation fails
   */
  public static void performSlice(
      IndexedMatrixValue in,
      IndexRange ixrange,
      int brlen,
      int bclen,
      ArrayList<IndexedMatrixValue> outlist)
      throws DMLRuntimeException {
    final MatrixIndexes srcIx = in.getIndexes();

    // global cell extent of the input block
    final long blockTop = UtilFunctions.computeCellIndex(srcIx.getRowIndex(), brlen, 0);
    final long blockBottom =
        UtilFunctions.computeCellIndex(srcIx.getRowIndex(), brlen, in.getValue().getNumRows() - 1);
    final long blockLeft = UtilFunctions.computeCellIndex(srcIx.getColumnIndex(), bclen, 0);
    final long blockRight =
        UtilFunctions.computeCellIndex(
            srcIx.getColumnIndex(), bclen, in.getValue().getNumColumns() - 1);

    // intersection of the block with the requested range
    final long overlapTop = Math.max(blockTop, ixrange.rowStart);
    final long overlapBottom = Math.min(blockBottom, ixrange.rowEnd);
    final long overlapLeft = Math.max(blockLeft, ixrange.colStart);
    final long overlapRight = Math.min(blockRight, ixrange.colEnd);

    // empty intersection: the block contributes nothing
    if (overlapTop > overlapBottom || overlapLeft > overlapRight) {
      return;
    }

    // the intersection expressed in in-block coordinates of the input
    final IndexRange localRange =
        new IndexRange(
            UtilFunctions.computeCellInBlock(overlapTop, brlen),
            UtilFunctions.computeCellInBlock(overlapBottom, brlen),
            UtilFunctions.computeCellInBlock(overlapLeft, bclen),
            UtilFunctions.computeCellInBlock(overlapRight, bclen));

    final int rowCut = UtilFunctions.computeCellInBlock(ixrange.rowStart, brlen);
    final int colCut = UtilFunctions.computeCellInBlock(ixrange.colStart, bclen);

    // dimensions of the overall slice result
    final long sliceRows = ixrange.rowEnd - ixrange.rowStart + 1;
    final long sliceCols = ixrange.colEnd - ixrange.colStart + 1;

    // size of the trailing result block in each dimension (a full block if it divides evenly)
    int rowsInLastBlock = (int) (sliceRows % brlen);
    if (rowsInLastBlock == 0) {
      rowsInLastBlock = brlen;
    }
    int colsInLastBlock = (int) (sliceCols % bclen);
    if (colsInLastBlock == 0) {
      colsInLastBlock = bclen;
    }

    // span of result blocks that receive data from this input block
    final long outTop = UtilFunctions.computeBlockIndex(overlapTop - ixrange.rowStart + 1, brlen);
    final long outBottom =
        UtilFunctions.computeBlockIndex(overlapBottom - ixrange.rowStart + 1, brlen);
    final long outLeft = UtilFunctions.computeBlockIndex(overlapLeft - ixrange.colStart + 1, bclen);
    final long outRight =
        UtilFunctions.computeBlockIndex(overlapRight - ixrange.colStart + 1, bclen);

    // only when the span reaches the final result block do the boundary sizes shrink
    final long finalOutBottom = UtilFunctions.computeBlockIndex(sliceRows, brlen);
    final long finalOutRight = UtilFunctions.computeBlockIndex(sliceCols, bclen);
    final int boundaryRlen = (outBottom == finalOutBottom) ? rowsInLastBlock : brlen;
    final int boundaryClen = (outRight == finalOutRight) ? colsInLastBlock : bclen;

    // register one empty output block per touched result block index
    for (long r = outTop; r <= outBottom; r++) {
      for (long c = outLeft; c <= outRight; c++) {
        final IndexedMatrixValue slot =
            new IndexedMatrixValue(new MatrixIndexes(), new MatrixBlock());
        slot.getIndexes().setIndexes(r, c);
        outlist.add(slot);
      }
    }

    // let the input value fill the registered blocks
    in.getValue()
        .sliceOperations(
            outlist, localRange, rowCut, colCut, brlen, bclen, boundaryRlen, boundaryClen);
  }
  /**
   * Applies the binary operator {@code optr} to the first cached blocks of {@code input1} and
   * {@code input2}, ignoring their indexes, and stores the result under {@code output}.
   *
   * <p>If both inputs are absent nothing happens. If exactly one is absent, {@code zeroInput} is
   * reset to the dimensions of the present input and used in its place. The output takes the
   * indexes of the left input when present, otherwise those of the right input.
   *
   * @param valueClass value class used when reserving the output slot in the cache
   * @param cachedValues cache holding the inputs and receiving the output
   * @param tempValue scratch value used as output when the output index aliases a present input
   * @param zeroInput reusable value standing in for an absent input
   * @param blockRowFactor not used by this instruction
   * @param blockColFactor not used by this instruction
   * @throws DMLRuntimeException if the binary operation fails
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {

    IndexedMatrixValue lhs = cachedValues.getFirst(input1);
    IndexedMatrixValue rhs = cachedValues.getFirst(input2);
    if (lhs == null && rhs == null) {
      return;
    }

    // A fresh cache slot can be written directly unless the output index aliases an input that
    // is actually present; in that case the scratch value is used to avoid copying.
    final boolean writeDirectly =
        (output != input1 && output != input2)
            || (output == input1 && lhs == null)
            || (output == input2 && rhs == null);
    final IndexedMatrixValue result =
        writeDirectly ? cachedValues.holdPlace(output, valueClass) : tempValue;

    // an absent input represents an all-zero block of the same shape as the present one
    final MatrixIndexes resultIx;
    if (lhs == null) {
      lhs = zeroInput;
      lhs.getValue().reset(rhs.getValue().getNumRows(), rhs.getValue().getNumColumns());
      resultIx = rhs.getIndexes();
    } else {
      resultIx = lhs.getIndexes();
      if (rhs == null) {
        rhs = zeroInput;
        rhs.getValue().reset(lhs.getValue().getNumRows(), lhs.getValue().getNumColumns());
      }
    }

    // compute the result
    result.getIndexes().setIndexes(resultIx);
    OperationsOnMatrixValues.performBinaryIgnoreIndexes(
        lhs.getValue(), rhs.getValue(), result.getValue(), ((BinaryOperator) optr));

    // scratch results still need to be handed to the cache
    if (result == tempValue) {
      cachedValues.add(output, result);
    }
  }
  /**
   * Executes the quaternary operator {@code optr} for every cached block {@code Xij} of {@code
   * _input1}, using the factor blocks {@code Ui} ({@code _input2}) and {@code Vj} ({@code
   * _input3}) and, if applicable, a weight block or scalar {@code Wij} ({@code _input4}).
   *
   * <p>{@code Ui} and {@code Vj} are read either from the cache or, when {@code _cacheU} / {@code
   * _cacheV} is set, from the distributed cache using the row / column index of {@code Xij}. If
   * the column counts of {@code Ui} and {@code Vj} differ, {@code Vj} is transposed first.
   *
   * <p>Output indexes: {@code (1, 1)} for {@code wtype1}/{@code wtype4}; the input indexes for
   * {@code wtype2}, {@code wtype5} and basic {@code wtype3}; otherwise {@code (column index, 1)}
   * or {@code (row index, 1)} of the input depending on {@code wtype3.isLeft()}.
   *
   * @param valueClass value class used when reserving output slots in the cache
   * @param cachedValues cache holding the inputs and receiving the outputs
   * @param tempValue scratch value used as output when {@code output == _input1}
   * @param zeroInput not used by this instruction
   * @param blockRowFactor not used by this instruction
   * @param blockColFactor not used by this instruction
   * @throws DMLRuntimeException if a required U or V block is absent from the cache, or if the
   *     underlying matrix operations fail
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    QuaternaryOperator qop = (QuaternaryOperator) optr;

    ArrayList<IndexedMatrixValue> blkList = cachedValues.get(_input1);
    if (blkList != null)
      for (IndexedMatrixValue imv : blkList) {
        // Step 1: prepare inputs and output
        if (imv == null) continue;
        MatrixIndexes inIx = imv.getIndexes();
        MatrixValue inVal = imv.getValue();

        // allocate space for the output value
        IndexedMatrixValue iout = null;
        if (output == _input1) iout = tempValue;
        else iout = cachedValues.holdPlace(output, valueClass);

        MatrixIndexes outIx = iout.getIndexes();
        MatrixValue outVal = iout.getValue();

        // Step 2: get remaining inputs: Wij, Ui, Vj
        MatrixValue Xij = inVal;

        // get Wij if existing (null of WeightsType.NONE or WSigmoid any type)
        IndexedMatrixValue iWij = (_input4 != -1) ? cachedValues.getFirst(_input4) : null;
        MatrixValue Wij = (iWij != null) ? iWij.getValue() : null;
        if (null == Wij && qop.hasFourInputs()) {
          // no weight block available: wrap the scalar from the instruction string in a 1x1 block
          MatrixBlock mb = new MatrixBlock(1, 1, false);
          String[] parts = InstructionUtils.getInstructionParts(instString);
          mb.quickSetValue(0, 0, Double.parseDouble(parts[4])); // parse directly, no boxing
          Wij = mb;
        }

        // get Ui, potentially through distributed cache
        MatrixValue Ui;
        if (_cacheU) {
          Ui =
              MRBaseForCommonInstructions.dcValues
                  .get(_input2)
                  .getDataBlock((int) inIx.getRowIndex(), 1)
                  .getValue();
        } else {
          IndexedMatrixValue iUi = cachedValues.getFirst(_input2); // U
          // fail with a descriptive error instead of a bare NullPointerException
          if (iUi == null)
            throw new DMLRuntimeException(
                "Missing quaternary input U (input index "
                    + _input2
                    + ") for block "
                    + inIx
                    + ".");
          Ui = iUi.getValue();
        }

        // get Vj, potentially through distributed cache
        MatrixValue Vj;
        if (_cacheV) {
          Vj =
              MRBaseForCommonInstructions.dcValues
                  .get(_input3)
                  .getDataBlock((int) inIx.getColumnIndex(), 1)
                  .getValue();
        } else {
          IndexedMatrixValue iVj = cachedValues.getFirst(_input3); // t(V)
          // fail with a descriptive error instead of a bare NullPointerException
          if (iVj == null)
            throw new DMLRuntimeException(
                "Missing quaternary input V (input index "
                    + _input3
                    + ") for block "
                    + inIx
                    + ".");
          Vj = iVj.getValue();
        }

        // handle special input case: //V through shuffle -> t(V)
        if (Ui.getNumColumns() != Vj.getNumColumns()) {
          Vj =
              LibMatrixReorg.reorg(
                  (MatrixBlock) Vj,
                  new MatrixBlock(Vj.getNumColumns(), Vj.getNumRows(), Vj.isInSparseFormat()),
                  new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
        }

        // Step 3: process instruction
        Xij.quaternaryOperations(qop, Ui, Vj, Wij, outVal);

        // set output indexes

        if (qop.wtype1 != null || qop.wtype4 != null) outIx.setIndexes(1, 1); // wsloss
        else if (qop.wtype2 != null
            || qop.wtype5 != null
            || qop.wtype3 != null && qop.wtype3.isBasic())
          outIx.setIndexes(inIx); // wsigmoid/wdivmm-basic
        else { // wdivmm
          boolean left = qop.wtype3.isLeft();
          outIx.setIndexes(left ? inIx.getColumnIndex() : inIx.getRowIndex(), 1);
        }

        // put the output value in the cache
        if (iout == tempValue) cachedValues.add(output, iout);
      }
  }