Code Example #1
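  /**
   * Parses a single serialized MR instruction string into the corresponding
   * MRInstruction object, dispatching on the given instruction type; returns
   * null for a null or empty string.
   */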
  public static MRInstruction parseSingleInstruction(MRINSTRUCTION_TYPE mrtype, String str)
      throws DMLUnsupportedOperationException, DMLRuntimeException {
    if (str == null || str.isEmpty()) return null;

    switch (mrtype) {
      case Aggregate:
        return AggregateInstruction.parseInstruction(str);

      case ArithmeticBinary:
        {
          String opcode = InstructionUtils.getOpCode(str);
          String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
          // extract datatypes of first and second input operands
          String dt1 =
              parts[1].split(Instruction.DATATYPE_PREFIX)[1].split(Instruction.VALUETYPE_PREFIX)[0];
          String dt2 =
              parts[2].split(Instruction.DATATYPE_PREFIX)[1].split(Instruction.VALUETYPE_PREFIX)[0];
          if (dt1.equalsIgnoreCase("SCALAR") || dt2.equalsIgnoreCase("SCALAR")) {
            return ScalarInstruction.parseInstruction(str);
          } else {
            if (BinaryM.isOpcode(opcode)) return BinaryMInstruction.parseInstruction(str);
            else return BinaryInstruction.parseInstruction(str);
          }
        }

      case AggregateBinary:
        return AggregateBinaryInstruction.parseInstruction(str);

      case AggregateUnary:
        return AggregateUnaryInstruction.parseInstruction(str);

      case Ternary:
        return TernaryInstruction.parseInstruction(str);

      case Quaternary:
        return QuaternaryInstruction.parseInstruction(str);

      case Rand:
        return RandInstruction.parseInstruction(str);

      case Seq:
        return SeqInstruction.parseInstruction(str);

      case Reblock:
        return ReblockInstruction.parseInstruction(str);

      case Append:
        return AppendInstruction.parseInstruction(str);

      case Reorg:
        return ReorgInstruction.parseInstruction(str);

      case Replicate:
        return ReplicateInstruction.parseInstruction(str);

      case Unary:
        {
          String opcode = InstructionUtils.getOpCode(str);
          String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
          if (parts.length == 4
              && (opcode.equalsIgnoreCase("log") || opcode.equalsIgnoreCase("log_nz")))
            return ScalarInstruction.parseInstruction(str);
          else // default case
            return UnaryInstruction.parseInstruction(str);
        }
      case MMTSJ:
        return MMTSJMRInstruction.parseInstruction(str);

      case PMMJ:
        return PMMJMRInstruction.parseInstruction(str);

      case MapMultChain:
        return MapMultChainInstruction.parseInstruction(str);

      case BinUaggChain:
        return BinUaggChainInstruction.parseInstruction(str);

      case UaggOuterChain:
        return UaggOuterChainInstruction.parseInstruction(str);

      case CombineTernary:
        return CombineTernaryInstruction.parseInstruction(str);

      case CombineBinary:
        return CombineBinaryInstruction.parseInstruction(str);

      case CombineUnary:
        return CombineUnaryInstruction.parseInstruction(str);

      case PickByCount:
        return PickByCountInstruction.parseInstruction(str);

      case CM_N_COV:
        return CM_N_COVInstruction.parseInstruction(str);

      case GroupedAggregate:
        return GroupedAggregateInstruction.parseInstruction(str);

      case MapGroupedAggregate:
        return GroupedAggregateMInstruction.parseInstruction(str);

      case RangeReIndex:
        return RangeBasedReIndexInstruction.parseInstruction(str);

      case ZeroOut:
        return ZeroOutInstruction.parseInstruction(str);

      case MatrixReshape:
        return MatrixReshapeMRInstruction.parseInstruction(str);

      case Sort: // workaround for dummy MR sort instruction
        return SortMR.parseSortInstruction(str);

      case CSVReblock:
        return CSVReblockInstruction.parseInstruction(str);

      case CSVWrite:
        return CSVWriteInstruction.parseInstruction(str);

      case ParameterizedBuiltin:
        return ParameterizedBuiltinMRInstruction.parseInstruction(str);

      case RemoveEmpty:
        return RemoveEmptyMRInstruction.parseInstruction(str);

      case Partition:
        return DataPartitionMRInstruction.parseInstruction(str);

      case CumsumAggregate:
        return CumulativeAggregateInstruction.parseInstruction(str);

      case CumsumSplit:
        return CumulativeSplitInstruction.parseInstruction(str);

      case CumsumOffset:
        return CumulativeOffsetInstruction.parseInstruction(str);

      case INVALID:
      default:
        throw new DMLRuntimeException("Invalid MR Instruction Type: " + mrtype);
    }
  }
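
Code Example #2 below calls a one-argument overload, MRInstructionParser.parseSingleInstruction(String), rather than the two-argument dispatcher above. A minimal sketch of how such an overload could resolve the type from the serialized string and then delegate (the InstructionUtils.getMRType lookup helper is an assumption for illustration):

  // Hedged sketch (not from the source): resolve the MR instruction type from
  // the opcode embedded in the string, then reuse the dispatch shown above.
  public static MRInstruction parseSingleInstruction(String str)
      throws DMLUnsupportedOperationException, DMLRuntimeException {
    if (str == null || str.isEmpty()) return null;
    MRINSTRUCTION_TYPE mrtype = InstructionUtils.getMRType(str); // assumed helper
    if (mrtype == null) throw new DMLRuntimeException("Unable to parse instruction: " + str);
    return parseSingleInstruction(mrtype, str);
  }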
Code Example #2
  /**
   * Starts a Rand MapReduce job which will produce one or more random objects.
   *
   * @param inst MR job instruction
   * @param dataGenInstructions data generation instruction for each random object
   * @param instructionsInMapper instructions to execute in the mapper
   * @param aggInstructionsInReducer aggregate instructions to execute in the combiner and reducer
   * @param otherInstructionsInReducer instructions to execute in the reducer after aggregation
   * @param numReducers number of reducers
   * @param replication file replication factor
   * @param resultIndexes result indexes for each random object
   * @param dimsUnknownFilePrefix file path prefix for dimensions unknown at compile time
   * @param outputs output file for each random object
   * @param outputInfos output information for each random object
   * @return job return object with the matrix characteristics for each random object
   * @throws Exception if an error occurred in the MapReduce phase
   */
  public static JobReturn runJob(
      MRJobInstruction inst,
      String[] dataGenInstructions,
      String instructionsInMapper,
      String aggInstructionsInReducer,
      String otherInstructionsInReducer,
      int numReducers,
      int replication,
      byte[] resultIndexes,
      String dimsUnknownFilePrefix,
      String[] outputs,
      OutputInfo[] outputInfos)
      throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
      dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

      MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
      MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
      DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

      rlens[i] = genInst.getRows();
      clens[i] = genInst.getCols();
      brlens[i] = genInst.getRowsInBlock();
      bclens[i] = genInst.getColsInBlock();

      maxbrlen = Math.max(maxbrlen, brlens[i]);
      maxbclen = Math.max(maxbclen, bclens[i]);

      if (mrtype == MRINSTRUCTION_TYPE.Rand) {
        RandInstruction randInst = (RandInstruction) mrins;
        inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
        maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);

        // for object reuse, preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        // seed generation
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
        long[] nnz =
            LibMatrixDatagen.computeNNZperBlock(
                rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
        int nnzIx = 0;
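        // each emitted line describes one block to generate:
        // blockRowIndex,blockColIndex,blockRows,blockCols,nnz,seed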
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
          for (long c = 0; c < clens[i]; c += bclens[i]) {
            long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

            sb.append((r / brlens[i]) + 1);
            sb.append(',');
            sb.append((c / bclens[i]) + 1);
            sb.append(',');
            sb.append(curBlockRowSize);
            sb.append(',');
            sb.append(curBlockColSize);
            sb.append(',');
            sb.append(nnz[nnzIx++]);
            sb.append(',');
            sb.append(bigrand.nextLong());
            pw.println(sb.toString());
            sb.setLength(0);
            numblocks++;
          }
        }
        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
        SeqInstruction seqInst = (SeqInstruction) mrins;
        inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
        maxsparsity = 1.0; // always dense

        double from = seqInst.fromValue;
        double to = seqInst.toValue;
        double incr = seqInst.incrValue;

        // handle the default increment: change 1 to -1 for the special case from > to
        incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

        // Correctness checks on (from, to, incr)
        boolean neg = (from > to);
        if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

        if (neg != (incr < 0))
          throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

        // Compute the number of rows in the sequence
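        // e.g., seq(1, 10, 2) yields 1 + floor((10 - 1) / 2) = 5 rows: 1, 3, 5, 7, 9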
        long numrows = 1 + (long) Math.floor((to - from) / incr);
        if (rlens[i] > 0) {
          if (numrows != rlens[i])
            throw new DMLRuntimeException(
                "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                    + rlens[i]
                    + " != "
                    + numrows);
        } else {
          rlens[i] = numrows;
        }

        if (clens[i] > 0 && clens[i] != 1)
          throw new DMLRuntimeException(
              "Unexpected error while processing sequence instruction. Number of columns ("
                  + clens[i]
                  + ") must be equal to 1.");
        else clens[i] = 1;

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);
        StringBuilder sb = new StringBuilder();

        double temp = from;
        double block_from, block_to;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

          // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to]
          // (inclusive of both end points of the interval)
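          // each emitted line has the form: bid_i,bid_j,block_from,block_to,incr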
          long bid_i = ((r / brlens[i]) + 1);
          long bid_j = 1;
          block_from = temp;
          block_to = temp + (curBlockRowSize - 1) * incr;
          temp = block_to + incr; // next block starts from here

          sb.append(bid_i);
          sb.append(',');
          sb.append(bid_j);
          sb.append(',');
          /*
          // Need not include block size while generating seq()
          sb.append(curBlockRowSize);
          sb.append(',');
          sb.append(1);
          sb.append(',');*/
          sb.append(block_from);
          sb.append(',');
          sb.append(block_to);
          sb.append(',');
          sb.append(incr);

          pw.println(sb.toString());
          sb.setLength(0);
          numblocks++;
        }

        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else {
        throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
      }
    }
    dataGenInsStr = dataGenInsStr.substring(1); // remove the leading instruction delimiter
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
      // set up the block size
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the input files and their format information
      MRJobConfiguration.setUpMultipleInputs(
          job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

      // set up the dimensions of input matrices
      MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
      MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

      // set up the rand instructions
      MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

      // set up unary instructions that will perform in the mapper
      MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

      // set up the aggregate instructions that will happen in the combiner and reducer
      MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

      // set up the instructions that will happen in the reducer, after the aggregation instructions
      MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

      // set up the replication factor for the results
      job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getDMLConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // set up custom map/reduce configurations
      MRJobConfiguration.setupCustomMRConfigurations(job, config);

      // determine degree of parallelism (nmappers: 1<=n<=capacity)
      // TODO use maxsparsity whenever we have a way of generating sparse rand data
      int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
      long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
      // correct the max number of mappers on YARN clusters
      if (InfrastructureAnalyzer.isYarnEnabled())
        capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
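      // heuristic (assuming ~8 bytes per generated cell): 8*maxbrlen*maxbclen*numblocks
      // approximates the total data size; dividing by the DFS block size gives the
      // mapper count, clamped to [1, capacity]. E.g., 64 blocks of 1000x1000 cells
      // with a 128MB DFS block size gives (int) (512,000,000 / 134,217,728) = 3 mappers.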
      int nmappers =
          Math.max(
              Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity),
              1);
      job.setNumMapTasks(nmappers);

      // set up what matrices are needed to pass from the mapper to reducer
      HashSet<Byte> mapoutputIndexes =
          MRJobConfiguration.setUpOutputIndexesForMapper(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              otherInstructionsInReducer,
              resultIndexes);

      MatrixChar_N_ReducerGroups ret =
          MRJobConfiguration.computeMatrixCharacteristics(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              null,
              otherInstructionsInReducer,
              resultIndexes,
              mapoutputIndexes,
              false);
      stats = ret.stats;

      // set up the number of reducers
      MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

      // print the complete MRJob instruction
      if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats);

      // Update resultDimsUnknown based on computed "stats"
      byte[] resultDimsUnknown = new byte[resultIndexes.length];
      for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
          resultDimsUnknown[i] = (byte) 1;
        } else {
          resultDimsUnknown[i] = (byte) 0;
        }
      }

      boolean mayContainCtable =
          instructionsInMapper.contains("ctabletransform")
              || instructionsInMapper.contains("groupedagg");

      // set up the multiple output files, and their format information
      MRJobConfiguration.setUpMultipleOutputs(
          job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

      // configure mapper and the mapper output key value pairs
      job.setMapperClass(DataGenMapper.class);
      if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
      } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
      }

      // set up combiner
      if (numReducers != 0
          && aggInstructionsInReducer != null
          && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class);

      // configure reducer
      job.setReducerClass(GMRReducer.class);
      // job.setReducerClass(PassThroughReducer.class);

      // By default, the job executes in "cluster" mode.
      // Determine if we can optimize and run it in "local" mode.
      MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
      for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
      }

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      runjob = JobClient.runJob(job);

      /* Process different counters */

      Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
      for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
      }

      String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
      stats = MapReduceTool.processDimsFiles(dir, stats);
      MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
      for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
  }