@Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg0)
        throws Exception {
      // Runs the unary aggregate on one input block and places the aggregated values as a
      // single row inside a freshly allocated output block ("expanded" partial aggregate).
      final MatrixIndexes inIx = arg0._1();
      final MatrixBlock inBlk = arg0._2();

      // targets that are filled by the aggregate operation
      final MatrixIndexes aggIx = new MatrixIndexes();
      final MatrixBlock aggBlk = new MatrixBlock();

      // execute the aggregate; drop the correction rows/columns if the operator carries any
      final AggregateUnaryOperator uaggOp = (AggregateUnaryOperator) _op;
      OperationsOnMatrixValues.performAggregateUnary(
          inIx, inBlk, aggIx, aggBlk, uaggOp, _brlen, _bclen);
      if (uaggOp.aggOp.correctionExists) {
        aggBlk.dropLastRowsOrColums(uaggOp.aggOp.correctionLocation);
      }

      // geometry of the expanded output: one output row per input row-block
      final long inRowIx = inIx.getRowIndex();
      final long numOutRows = (long) Math.ceil((double) _rlen / _brlen);
      final long outRowIx = (long) Math.ceil((double) inRowIx / _brlen);
      final int outBlkRows = (int) Math.min(numOutRows - (outRowIx - 1) * _brlen, _brlen);
      final int outBlkCols = aggBlk.getNumColumns();
      final int rowInOutBlk = (int) ((inRowIx - 1) % _brlen);

      // copy the aggregate into its row of the otherwise empty output block
      final MatrixBlock expanded = new MatrixBlock(outBlkRows, outBlkCols, false);
      expanded.copy(rowInOutBlk, rowInOutBlk, 0, outBlkCols - 1, aggBlk, true);

      // keep the column index produced by the aggregate, replace only the row index
      aggIx.setIndexes(outRowIx, aggIx.getColumnIndex());

      return new Tuple2<MatrixIndexes, MatrixBlock>(aggIx, expanded);
    }
 /**
  * Runs a zero-out operation on {@code valueIn} and copies the input indexes to the output.
  *
  * <p>The work is delegated to {@code valueIn.zeroOutOperations(valueOut, range, complementary)};
  * afterwards {@code indexesOut} is set to {@code indexesIn}.
  *
  * @param indexesIn indexes of the input value
  * @param valueIn value on which the zero-out operation is invoked
  * @param indexesOut indexes object that receives a copy of {@code indexesIn}
  * @param valueOut passed to {@code zeroOutOperations} as its output argument
  * @param range index range handed to {@code zeroOutOperations}
  * @param complementary flag handed to {@code zeroOutOperations}; presumably selects whether the
  *     range or its complement is zeroed -- TODO confirm against the implementation
  * @throws DMLRuntimeException declared for failures of the delegated operation
  */
 public static void performZeroOut(
     MatrixIndexes indexesIn,
     MatrixValue valueIn,
     MatrixIndexes indexesOut,
     MatrixValue valueOut,
     IndexRange range,
     boolean complementary)
     throws DMLRuntimeException {
   // value first: if the operation throws, indexesOut is left untouched
   valueIn.zeroOutOperations(valueOut, range, complementary);
   indexesOut.setIndexes(indexesIn);
 }
  /**
   * Performs a binary aggregate of two indexed values.
   *
   * <p>The output indexes are set to the row index of the first operand and the column index of
   * the second operand; the computation itself is delegated to {@code
   * value1.aggregateBinaryOperations}.
   *
   * @param indexes1 indexes of the first operand; supplies the output row index
   * @param value1 first operand, also the receiver of the delegated call
   * @param indexes2 indexes of the second operand; supplies the output column index
   * @param value2 second operand
   * @param indexesOut indexes object updated with the output position
   * @param valueOut passed to the delegated call as its output argument
   * @param op the aggregate binary operator to apply
   * @throws DMLRuntimeException declared for failures of the delegated operation
   */
  public static void performAggregateBinary(
      MatrixIndexes indexes1,
      MatrixValue value1,
      MatrixIndexes indexes2,
      MatrixValue value2,
      MatrixIndexes indexesOut,
      MatrixValue valueOut,
      AggregateBinaryOperator op)
      throws DMLRuntimeException {
    // output position: row of the first operand, column of the second operand
    final long outRow = indexes1.getRowIndex();
    final long outCol = indexes2.getColumnIndex();
    indexesOut.setIndexes(outRow, outCol);

    // the value computation is handled entirely by the first operand
    value1.aggregateBinaryOperations(indexes1, value1, indexes2, value2, valueOut, op);
  }
 /**
  * Copy constructor: initializes this instance by passing the row and column of {@code indexes}
  * to {@code setIndexes}.
  *
  * @param indexes the indexes to copy; must not be {@code null} because its fields are read
  */
 public MatrixIndexes(MatrixIndexes indexes) {
   setIndexes(indexes._row, indexes._col);
 }
 /**
  * Creates indexes for the given position by passing both values to {@code setIndexes}.
  *
  * @param r row index
  * @param c column index
  */
 public MatrixIndexes(long r, long c) {
   setIndexes(r, c);
 }
  /**
   * Emits all buffered partial ctable results through the output collector and then clears the
   * buffer that was used.
   *
   * <p>If the map buffer is set, every entry of every buffered {@code CTableMap} is emitted as a
   * single cell. Otherwise, if the block buffer is set, every buffered {@code MatrixBlock} is
   * emitted either as one block at index (1,1) or, when it does not fit into a single block, as
   * a sequence of sub-blocks. Per-result non-zero counts are updated along the way; for the map
   * buffer the maximum row/column dimensions are also updated for results whose unknown-dims
   * flag equals 1.
   *
   * @param reporter reporter that is handed through to the output collector
   * @throws RuntimeException if both buffers are {@code null} or if emitting any output fails;
   *     the underlying exception is preserved as the cause
   */
  @SuppressWarnings("deprecation")
  public void flushBuffer(Reporter reporter) throws RuntimeException {
    try {
      if (_mapBuffer != null) {
        // a single cell object is reused for all emitted values
        MatrixCell value = new MatrixCell();
        for (Entry<Byte, CTableMap> ctable : _mapBuffer.entrySet()) {
          ArrayList<Integer> resultIDs =
              ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
          CTableMap resultMap = ctable.getValue();

          // maintain result dims and nonzeros
          for (Integer i : resultIDs) {
            _resultNonZeros[i] += resultMap.size();
            if (_resultDimsUnknown[i] == (byte) 1) {
              _resultMaxRowDims[i] = Math.max(resultMap.getMaxRow(), _resultMaxRowDims[i]);
              _resultMaxColDims[i] = Math.max(resultMap.getMaxColumn(), _resultMaxColDims[i]);
            }
          }

          // output result data (a fresh key object per cell, as before)
          for (LLDoubleEntry e : resultMap.entrySet()) {
            MatrixIndexes key = new MatrixIndexes(e.key1, e.key2);
            value.setValue(e.value);
            for (Integer i : resultIDs) {
              _collector.collectOutput(key, value, i, reporter);
            }
          }
        }
      } else if (_blockBuffer != null) {
        MatrixIndexes key = new MatrixIndexes(1, 1);
        for (Entry<Byte, MatrixBlock> ctable : _blockBuffer.entrySet()) {
          ArrayList<Integer> resultIDs =
              ReduceBase.getOutputIndexes(ctable.getKey(), _resultIndexes);
          MatrixBlock outBlock = ctable.getValue();
          outBlock.recomputeNonZeros();

          // TODO: change hard coding of 1000
          int brlen = 1000, bclen = 1000;
          int rlen = outBlock.getNumRows();
          int clen = outBlock.getNumColumns();

          // final output matrix is smaller than a single block
          // (columns are checked against the column block size, not the row block size)
          if (rlen <= brlen && clen <= bclen) {
            key = new MatrixIndexes(1, 1);
            for (Integer i : resultIDs) {
              _collector.collectOutput(key, outBlock, i, reporter);
              _resultNonZeros[i] += outBlock.getNonZeros();
            }
          } else {
            // Following code is similar to that in
            // DataConverter.DataConverter.writeBinaryBlockMatrixToHDFS
            // initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks =
                MatrixWriter.createMatrixBlocksForReuse(
                    rlen, clen, brlen, bclen, true, outBlock.getNonZeros());

            // number of blocks per dimension, computed once instead of per loop iteration
            int numBlockRows = (int) Math.ceil(rlen / (double) brlen);
            int numBlockCols = (int) Math.ceil(clen / (double) bclen);

            // create and write subblocks of matrix
            for (int blockRow = 0; blockRow < numBlockRows; blockRow++) {
              for (int blockCol = 0; blockCol < numBlockCols; blockCol++) {
                // boundary blocks only cover the remaining rows/columns
                int maxRow = (blockRow * brlen + brlen < rlen) ? brlen : rlen - blockRow * brlen;
                int maxCol = (blockCol * bclen + bclen < clen) ? bclen : clen - blockCol * bclen;

                int row_offset = blockRow * brlen;
                int col_offset = blockCol * bclen;

                // get reuse matrix block
                MatrixBlock block =
                    MatrixWriter.getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                // copy submatrix to block
                outBlock.sliceOperations(
                    row_offset,
                    row_offset + maxRow - 1,
                    col_offset,
                    col_offset + maxCol - 1,
                    block);

                // TODO: skip empty "block"

                // append block to sequence file (block indexes are 1-based)
                key.setIndexes(blockRow + 1, blockCol + 1);
                for (Integer i : resultIDs) {
                  _collector.collectOutput(key, block, i, reporter);
                  _resultNonZeros[i] += block.getNonZeros();
                }

                // reset block for later reuse
                block.reset();
              }
            }
          }
        }
      } else {
        throw new DMLRuntimeException("Unexpected.. both ctable buffers are empty.");
      }
    } catch (Exception ex) {
      throw new RuntimeException("Failed to flush ctable buffer.", ex);
    }

    // remove existing partial ctables; at least one buffer is non-null here because the
    // both-null case has already been rejected above
    if (_mapBuffer != null) {
      _mapBuffer.clear();
    } else {
      _blockBuffer.clear();
    }
  }
  /**
   * Executes the quaternary operation for every cached block of the first input.
   *
   * <p>For each non-null block Xij of {@code _input1} the remaining operands are looked up (an
   * optional Wij from {@code _input4} or a 1x1 block built from the instruction string, Ui from
   * {@code _input2}, Vj from {@code _input3}, either from the cached values or from the
   * distributed cache), {@code quaternaryOperations} is invoked, and the output indexes are set
   * according to the weight type of the operator.
   *
   * <p>{@code zeroInput}, {@code blockRowFactor} and {@code blockColFactor} are not used by this
   * implementation.
   *
   * @param valueClass value class used when reserving an output slot in {@code cachedValues}
   * @param cachedValues cache holding the input blocks and receiving the outputs
   * @param tempValue temporary holder used when the output overwrites the first input
   * @param zeroInput unused
   * @param blockRowFactor unused
   * @param blockColFactor unused
   * @throws DMLRuntimeException declared for failures of the invoked matrix operations
   */
  @Override
  public void processInstruction(
      Class<? extends MatrixValue> valueClass,
      CachedValueMap cachedValues,
      IndexedMatrixValue tempValue,
      IndexedMatrixValue zeroInput,
      int blockRowFactor,
      int blockColFactor)
      throws DMLRuntimeException {
    QuaternaryOperator qop = (QuaternaryOperator) optr;

    ArrayList<IndexedMatrixValue> inputBlocks = cachedValues.get(_input1);
    if (inputBlocks == null) {
      return; // nothing cached for the first input
    }

    for (IndexedMatrixValue inBlock : inputBlocks) {
      if (inBlock == null) {
        continue;
      }

      // Step 1: prepare inputs and output
      MatrixIndexes inIx = inBlock.getIndexes();
      MatrixValue Xij = inBlock.getValue();

      // the output goes to the temp holder when it would overwrite the first input,
      // otherwise a slot is reserved in the cache
      IndexedMatrixValue outHolder;
      if (output == _input1) {
        outHolder = tempValue;
      } else {
        outHolder = cachedValues.holdPlace(output, valueClass);
      }
      MatrixIndexes outIx = outHolder.getIndexes();
      MatrixValue outVal = outHolder.getValue();

      // Step 2: get remaining inputs: Wij, Ui, Vj

      // Wij is optional (null of WeightsType.NONE or WSigmoid any type)
      MatrixValue Wij = null;
      if (_input4 != -1) {
        IndexedMatrixValue cachedW = cachedValues.getFirst(_input4);
        if (cachedW != null) {
          Wij = cachedW.getValue();
        }
      }
      if (Wij == null && qop.hasFourInputs()) {
        // fourth operand is taken from the instruction string and wrapped in a 1x1 block
        MatrixBlock scalarW = new MatrixBlock(1, 1, false);
        String[] parts = InstructionUtils.getInstructionParts(instString);
        scalarW.quickSetValue(0, 0, Double.valueOf(parts[4]));
        Wij = scalarW;
      }

      // Ui and Vj come either from the cached values or from the distributed cache
      MatrixValue Ui;
      if (_cacheU) {
        Ui =
            MRBaseForCommonInstructions.dcValues
                .get(_input2)
                .getDataBlock((int) inIx.getRowIndex(), 1)
                .getValue();
      } else {
        Ui = cachedValues.getFirst(_input2).getValue(); // U
      }

      MatrixValue Vj;
      if (_cacheV) {
        Vj =
            MRBaseForCommonInstructions.dcValues
                .get(_input3)
                .getDataBlock((int) inIx.getColumnIndex(), 1)
                .getValue();
      } else {
        Vj = cachedValues.getFirst(_input3).getValue(); // t(V)
      }

      // special input case: V through shuffle -> t(V); swap rows and columns of Vj
      // when its column count does not match that of Ui
      if (Ui.getNumColumns() != Vj.getNumColumns()) {
        Vj =
            LibMatrixReorg.reorg(
                (MatrixBlock) Vj,
                new MatrixBlock(Vj.getNumColumns(), Vj.getNumRows(), Vj.isInSparseFormat()),
                new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
      }

      // Step 3: process instruction
      Xij.quaternaryOperations(qop, Ui, Vj, Wij, outVal);

      // set output indexes depending on the weight type of the operator
      if (qop.wtype1 != null || qop.wtype4 != null) {
        outIx.setIndexes(1, 1); // wsloss
      } else if (qop.wtype2 != null
          || qop.wtype5 != null
          || (qop.wtype3 != null && qop.wtype3.isBasic())) {
        outIx.setIndexes(inIx); // wsigmoid/wdivmm-basic
      } else if (qop.wtype3.isLeft()) {
        outIx.setIndexes(inIx.getColumnIndex(), 1); // wdivmm, left
      } else {
        outIx.setIndexes(inIx.getRowIndex(), 1); // wdivmm, right
      }

      // put the output value in the cache
      if (outHolder == tempValue) {
        cachedValues.add(output, outHolder);
      }
    }
  }