Example #1
 @Override
 public void configure(JobConf conf) {
   this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
   int reducerID = conf.getInt("mapred.task.partition", -1);
   int max = conf.getInt(PARAM_APS_MAXKEY, 0);
   int nstripes = conf.getInt(PARAM_APS_STRIPES, 1);
   int spread = conf.getInt(PARAM_APS_REDUCER_PER_STRIPE, 1);
   if (reducerID < 0 || max == 0) {
     LOG.error("Could not find stripe ID, reverting to whole rest file loading");
     LOG.debug("reducer = " + reducerID + "\t max = " + max + "\t nstripes = " + nstripes);
      // open the pruned part file in the DistributedCache
     haspruned = FileUtils.readRestFile(conf, pruned);
   } else {
     int stripe = GenericKey.StripePartitioner.findStripe(reducerID, spread);
     int from = GenericKey.StripePartitioner.minKeyInStripe(stripe, nstripes, max);
     int to = from + GenericKey.StripePartitioner.numKeysInStripe(stripe, nstripes, max);
     // read from 'from' included, to 'to' excluded
     LOG.info(
         "Reducer "
             + reducerID
             + " loading stripe "
             + stripe
             + " of "
             + nstripes
             + " ("
             + from
             + ","
             + (to - 1)
             + ")");
     haspruned = FileUtils.readRestFile(conf, pruned, from, to);
   }
   if (!haspruned) LOG.warn("No pruned file provided in DistributedCache");
   else LOG.info("Read " + pruned.size() + " entries from pruned file");
 }
Example #2
 /** Save the values out of the configuration that we need to write the data. */
 @Override
 public void configure(JobConf job) {
   numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 20);
   minKeySize = job.getInt("test.randomwrite.min_key", 100);
   keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
   minValueSize = job.getInt("test.randomwrite.min_value", 0);
   valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
 }
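The configure() above only reads these settings; a driver has to put them into the JobConf first. A minimal driver-side sketch with the same literal property names (the values are illustrative, not defaults from the original job):

 // Driver-side sketch: populate what configure() reads; values are examples only.
 static JobConf randomWriteConf() {
   JobConf job = new JobConf();
   job.setLong("test.randomwrite.bytes_per_map", 64L * 1024 * 1024); // 64 MB per map
   job.setInt("test.randomwrite.min_key", 100);
   job.setInt("test.randomwrite.max_key", 1000);
   job.setInt("test.randomwrite.min_value", 0);
   job.setInt("test.randomwrite.max_value", 20000);
   return job;
 }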
Example #3
  @VisibleForTesting
  Fetcher(
      JobConf job,
      TaskAttemptID reduceId,
      ShuffleSchedulerImpl<K, V> scheduler,
      MergeManager<K, V> merger,
      Reporter reporter,
      ShuffleClientMetrics metrics,
      ExceptionReporter exceptionReporter,
      SecretKey shuffleKey,
      int id) {
    this.jobConf = job;
    this.reporter = reporter;
    this.scheduler = scheduler;
    this.merger = merger;
    this.metrics = metrics;
    this.exceptionReporter = exceptionReporter;
    this.id = id;
    this.reduce = reduceId.getTaskID().getId();
    this.shuffleSecretKey = shuffleKey;
    ioErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.IO_ERROR.toString());
    wrongLengthErrs =
        reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_LENGTH.toString());
    badIdErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.BAD_ID.toString());
    wrongMapErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_MAP.toString());
    connectionErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.CONNECTION.toString());
    wrongReduceErrs =
        reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_REDUCE.toString());

    this.connectionTimeout =
        job.getInt(MRJobConfig.SHUFFLE_CONNECT_TIMEOUT, DEFAULT_STALLED_COPY_TIMEOUT);
    this.readTimeout = job.getInt(MRJobConfig.SHUFFLE_READ_TIMEOUT, DEFAULT_READ_TIMEOUT);

    setName("fetcher#" + id);
    setDaemon(true);

    synchronized (Fetcher.class) {
      sslShuffle =
          job.getBoolean(MRConfig.SHUFFLE_SSL_ENABLED_KEY, MRConfig.SHUFFLE_SSL_ENABLED_DEFAULT);
      if (sslShuffle && sslFactory == null) {
        sslFactory = new SSLFactory(SSLFactory.Mode.CLIENT, job);
        try {
          sslFactory.init();
        } catch (Exception ex) {
          sslFactory.destroy();
          throw new RuntimeException(ex);
        }
      }
    }
  }
Example #4
 /**
  * Generate the requested number of file splits, with the filename set to the filename of the
  * output file.
  */
 public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    /** Set the number of input splits. */
   JobClient client = new JobClient(job);
   ClusterStatus cluster = client.getClusterStatus();
    /** If the property is not set, the default value is returned. */
   int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
   long numBytesToWritePerMap =
       job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
   if (numBytesToWritePerMap == 0) {
     System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
   }
   long totalBytesToWrite =
       job.getLong(
           "test.randomwrite.total_bytes",
           numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
   int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
   if (numMaps == 0 && totalBytesToWrite > 0) {
     numMaps = 1;
   }
   System.out.println("numMaps-------" + numMaps);
   InputSplit[] result = new InputSplit[numMaps];
   Path outDir = FileOutputFormat.getOutputPath(job);
   for (int i = 0; i < result.length; ++i) {
     result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
   }
   return result;
 }
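For a concrete feel of the sizing logic above, a worked sketch with assumed cluster numbers (none of these values come from the original job):

 // Worked example of the map-count arithmetic in getSplits(), with assumed values.
 static int exampleNumMaps() {
   int taskTrackers = 3;                                 // cluster.getTaskTrackers()
   int numMapsPerHost = 10;                              // test.randomwriter.maps_per_host default
   long numBytesToWritePerMap = 1L * 1024 * 1024 * 1024; // 1 GB default
   long totalBytesToWrite = numMapsPerHost * numBytesToWritePerMap * taskTrackers; // 30 GB
   return (int) (totalBytesToWrite / numBytesToWritePerMap); // 30 dummy splits, one per map task
 }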
Example #5
  /**
   * This is the main routine for launching a distributed random write job. It runs 10 maps/node and
   * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything.
   *
   * @throws IOException
   */
  public int run(String[] args) throws Exception {
    if (args.length == 0) {
      System.out.println("Usage: writer <out-dir>");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path outDir = new Path(args[0]);
    JobConf job = new JobConf(getConf());

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient client = new JobClient(job);
    ClusterStatus cluster = client.getClusterStatus();
     /** If the property is not set, the default value is returned. */
    int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
    long numBytesToWritePerMap =
        job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
      System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
      return -2;
    }
    long totalBytesToWrite =
        job.getLong(
            "test.randomwrite.total_bytes",
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
      numMaps = 1;
      job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
    }

    job.setNumMapTasks(numMaps);
     /** Advisory only: the number of map tasks is a hint to the framework. */
    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println(
        "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return 0;
  }
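Because run() pulls its sizing from the JobConf, the defaults can be overridden at launch time. A hypothetical launcher, assuming the run() above lives in the RandomWriter Tool referenced by setJarByClass and that org.apache.hadoop.conf.Configuration and org.apache.hadoop.util.ToolRunner are imported:

  // Hypothetical main(): ToolRunner parses generic options, so a command line such as
  //   hadoop jar writer.jar RandomWriter -Dtest.randomwrite.bytes_per_map=268435456 <out-dir>
  // overrides the defaults that run() reads via job.getLong()/getInt().
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new RandomWriter(), args);
    System.exit(exitCode);
  }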
    /**
     * Passes data and popsize so that they are available before a call to the mapper
     *
     * @param job this mapreduce job
     */
    @Override
    public void configure(JobConf job) {

      popsize = job.getInt("popsize", popsize);
      data = job.get("dataset", data);

      new Population(data, popsize);
    }
    @Override
    public void configure(JobConf job) {

      numberOfNodes = job.getInt("numberOfNodes", 0);
      if (numberOfNodes == 0) {
        System.exit(0);
      }
    }
Example #8
 public void configure(JobConf job) {
   this.jobConf = job;
   urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
   interval = jobConf.getInt("db.fetch.interval.default", 2592000);
   filters = new URLFilters(jobConf);
   scfilters = new ScoringFilters(jobConf);
   scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
   curTime = job.getLong("injector.current.time", System.currentTimeMillis());
 }
Example #9
 /**
  * Calculate how many maps to run. The number of maps is the total byte count of the copy divided
  * by distcp.bytes.per.map (default BYTES_PER_MAP, or -m on the command line), capped at
  * distcp.max.map.tasks (default MAX_MAPS_PER_NODE * nodes in the cluster) and never less than 1.
  *
  * @param totalBytes Count of total bytes for job
  * @param job The job to configure
  */
 private static void setMapCount(long totalBytes, JobConf job) throws IOException {
   int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
   numMaps =
       Math.min(
           numMaps,
           job.getInt(
               MAX_MAPS_LABEL,
               MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers()));
   job.setNumMapTasks(Math.max(numMaps, 1));
 }
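To make the bounding above concrete, a worked sketch with assumed numbers (BYTES_PER_MAP, MAX_MAPS_PER_NODE and the cluster size are stand-ins, not the real defaults):

 // Worked example of setMapCount()'s clamping, with assumed values.
 static int exampleMapCount() {
   long totalBytes = 10L * 1024 * 1024 * 1024;           // 10 GB to copy
   long bytesPerMap = 256L * 1024 * 1024;                // distcp.bytes.per.map (assumed 256 MB)
   int maxMapsPerNode = 20, nodes = 4;                   // distcp.max.map.tasks cap (assumed)
   int numMaps = (int) (totalBytes / bytesPerMap);       // 40
   numMaps = Math.min(numMaps, maxMapsPerNode * nodes);  // cap is 80, so 40 stands
   return Math.max(numMaps, 1);                          // never fewer than one map
 }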
  public void configure(JobConf job) {

    _jobConf = job;

    crawlerCount =
        job.getInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, CrawlEnvironment.CRAWLERS.length);

    partitionNumber = job.getInt("mapred.task.partition", -1);

    try {
      FileSystem fs = FileSystem.get(job);
      Path workPath = FileOutputFormat.getOutputPath(job);
      debugURLStream =
          fs.create(new Path(workPath, "debugURLS-" + NUMBER_FORMAT.format(partitionNumber)));
      urlDebugURLWriter = new OutputStreamWriter(debugURLStream, Charset.forName("UTF-8"));
      _emittedURLSFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
  }
Example #11
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
      long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
      String srcfilelist = job.get(SRC_LIST_LABEL, "");
      if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + cnfiles
                + ") total_size("
                + cbsize
                + ") listuri("
                + srcfilelist
                + ")");
      }
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(job);
      FileStatus srcst = fs.getFileStatus(src);

      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
      LongWritable key = new LongWritable();
      FilePair value = new FilePair();
      final long targetsize = cbsize / numSplits;
      long pos = 0L;
      long last = 0L;
      long acc = 0L;
      long cbrem = srcst.getLen();
      SequenceFile.Reader sl = null;
      try {
        sl = new SequenceFile.Reader(fs, src, job);
        for (; sl.next(key, value); last = sl.getPosition()) {
          // if adding this split would put this split past the target size,
          // cut the last split and put this next file in the next split.
          if (acc + key.get() > targetsize && acc != 0) {
            long splitsize = last - pos;
            splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
            cbrem -= splitsize;
            pos = last;
            acc = 0L;
          }
          acc += key.get();
        }
      } finally {
        checkAndClose(sl);
      }
      if (cbrem != 0) {
        splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
      }

      return splits.toArray(new FileSplit[splits.size()]);
    }
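The split loop above is a greedy accumulation: entries are appended to the current split until adding the next one would push it past targetsize, at which point the split is cut at the last record boundary and the remainder becomes the final split. A standalone sketch of the same idea applied directly to plain file sizes (no SequenceFile involved; purely illustrative, assumes java.util.List/ArrayList):

  // Greedy split-cutting sketch: cut whenever adding the next size would exceed the target.
  static List<Long> cutSplits(long[] fileSizes, long targetSize) {
    List<Long> splitSizes = new ArrayList<Long>();
    long acc = 0L;
    for (long size : fileSizes) {
      if (acc + size > targetSize && acc != 0) { // same condition as in the loop above
        splitSizes.add(acc);
        acc = 0L;
      }
      acc += size;
    }
    if (acc != 0) {
      splitSizes.add(acc); // remainder, like the final cbrem split
    }
    return splitSizes;
  }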
  public JobBuilder setAffinityNoBalancing(Path affinityPath, Set<String> exclusionSet)
      throws IOException {
    // set node affinity ...
    String affinityMask =
        NodeAffinityMaskBuilder.buildNodeAffinityMask(
            FileSystem.get(_jobConf),
            affinityPath,
            null,
            exclusionSet,
            _jobConf.getInt("mapred.tasktracker.reduce.tasks.maximum", -1),
            true);

    NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask);

    return this;
  }
Example #13
 /**
  * Mapper configuration. Extracts source and destination file system, as well as top-level paths
  * on source and destination directories. Gets the named file systems, to be used later in map.
  */
 public void configure(JobConf job) {
   destPath = new Path(job.get(DST_DIR_LABEL, "/"));
   try {
     destFileSys = destPath.getFileSystem(job);
   } catch (IOException ex) {
     throw new RuntimeException("Unable to get the named file system.", ex);
   }
   sizeBuf = job.getInt("copy.buf.size", 128 * 1024);
   buffer = new byte[sizeBuf];
   ignoreReadFailures = job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false);
   preserve_status = job.getBoolean(Options.PRESERVE_STATUS.propertyname, false);
   if (preserve_status) {
     preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL));
   }
   update = job.getBoolean(Options.UPDATE.propertyname, false);
   overwrite = !update && job.getBoolean(Options.OVERWRITE.propertyname, false);
   this.job = job;
 }
Example #14
  @Override
  public void configure(JobConf job) {
    super.configure(job);

    this.totalBytes = 0;
    this.chunkId =
        job.getInt(
            "mapred.task.partition",
            -1); // http://www.mail-archive.com/[email protected]/msg05749.html

    if (this.chunkId < 0) {
      throw new RuntimeException("Incorrect chunk id ");
    }
    this.nodeId = this.chunkId / getNumChunks();

    this.client = new AdminClient(getCluster(), new AdminClientConfig());
    logger.info("Reducer for Node id - " + this.nodeId + " and chunk id - " + this.chunkId);
  }
Example #15
      public void configure(JobConf job) {
        // 'key' == sortInput for sort-input; key == sortOutput for sort-output
        key = deduceInputFile(job);

        if (key == sortOutput) {
          partitioner = new HashPartitioner<WritableComparable, Writable>();

          // Figure the 'current' partition and no. of reduces of the 'sort'
          try {
            URI inputURI = new URI(job.get("map.input.file"));
            String inputFile = inputURI.getPath();
            partition =
                Integer.valueOf(inputFile.substring(inputFile.lastIndexOf("part") + 5)).intValue();
            noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1);
          } catch (Exception e) {
            System.err.println("Caught: " + e);
            System.exit(-1);
          }
        }
      }
Example #16
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
      final int targetcount = srcCount / numSplits;
      String srclist = job.get(OP_LIST_LABEL, "");
      if (srcCount < 0 || "".equals(srclist)) {
        throw new RuntimeException(
            "Invalid metadata: #files(" + srcCount + ") listuri(" + srclist + ")");
      }
      Path srcs = new Path(srclist);
      FileSystem fs = srcs.getFileSystem(job);

      List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

      Text key = new Text();
      PolicyInfo value = new PolicyInfo();
      SequenceFile.Reader in = null;
      long prev = 0L;
      int count = 0; // count src
      try {
        for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value); ) {
          long curr = in.getPosition();
          long delta = curr - prev;
          if (++count > targetcount) {
            count = 0;
            splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
            prev = curr;
          }
        }
      } finally {
        in.close();
      }
      long remaining = fs.getFileStatus(srcs).getLen() - prev;
      if (remaining != 0) {
        splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
      }
      LOG.info(
          "jobname= " + jobName + " numSplits=" + numSplits + ", splits.size()=" + splits.size());
      return splits.toArray(new FileSplit[splits.size()]);
    }
Example #17
    public void configure(JobConf conf) {
      try {
        Path vInput;
        FileSystem fs;
        URI[] fvector;
        nsize = conf.getInt("DIMENTION", 0);
        sumVec = new double[nsize];
        resVec = new double[nsize];
        diaVec = new double[nsize];
        Arrays.fill(sumVec, 0);
        Arrays.fill(resVec, 0);
        Arrays.fill(diaVec, 0);

        fvector = DistributedCache.getCacheFiles(conf);
        vInput = new Path(fvector[0].getPath());
        fs = FileSystem.get(URI.create("hdfs://node17.cs.rochester.edu:9000"), conf);

        FSDataInputStream fdis = fs.open(vInput);

        String line;
        while ((line = fdis.readLine()) != null) {
          StringTokenizer tokenizer = new StringTokenizer(line);
          int rowIdx = Integer.parseInt(tokenizer.nextToken());
          int colIdx = Integer.parseInt(tokenizer.nextToken());
          double matVar = Double.parseDouble(tokenizer.nextToken());
          if (rowIdx == colIdx) {
            diaVec[rowIdx] = matVar;
          } else if (colIdx == nsize) {
            resVec[rowIdx] = matVar;
          } else {
            sumVec[rowIdx] += matVar;
          }
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
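The vector file read in configure() comes from the DistributedCache and the dimension from the "DIMENTION" property, so the submitting code has to provide both. A rough driver-side sketch (the path and the size are made up; assumes org.apache.hadoop.filecache.DistributedCache is imported):

    // Driver-side sketch for the configure() above: publish the vector file and the dimension.
    static void setupMatrixJob(JobConf conf) throws java.net.URISyntaxException {
      conf.setInt("DIMENTION", 4); // same (misspelled) key the mapper reads
      DistributedCache.addCacheFile(
          new java.net.URI("hdfs://node17.cs.rochester.edu:9000/user/demo/vector.txt"), conf);
    }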
  public void configure(JobConf job) {

    _attemptID = TaskAttemptID.forName(job.get("mapred.task.id"));
    _maxAttemptsPerTask = job.getInt("mapred.max.tracker.failures", 4);
    _splitDetails = job.get(ARCSplitReader.SPLIT_DETAILS, "Split Details Unknown");
  }
Example #19
 public void configure(JobConf job) {
   nSamples = job.getInt("nSamples", 100);
   ONE.set(1);
 }
 public void configure(JobConf job) {
   mNodeCnt = job.getInt("NodeCount", 0);
 }
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>");
      // ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    // Create a JobConf using the processed <code>conf</code>
    final JobConf jobConf = new JobConf(getConf(), getClass());

    // Specify various job-specific parameters
    jobConf.setJobName(MapreduceStringFinder.class.getSimpleName());

    // setting the input format
    jobConf.setInputFormat(Individuals.class);

    // setting the output key and value class
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BooleanWritable.class);

    // setting the mapper class
    jobConf.setMapperClass(CIMapper.class);
    jobConf.setNumMapTasks(3); // setting number of maptasks

    // setting the reducer class
    jobConf.setReducerClass(CIReducer.class);

    // setup input/output directories
    final String dataset = args[0];

    FileInputFormat.setInputPaths(jobConf, new Path(dataset));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    final int pop = Integer.parseInt(args[2]);

    // based on the configuration, make this job threadable
    if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) {
      jobConf.setMapRunnerClass(MultithreadedMapRunner.class);
      jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
    }
    jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
    // for computation-intensive data, do not allow the job to fail if the task tracker does not
    // respond with a heartbeat message before the timeout value
    final int timeout = 9000000;
    jobConf.setInt("mapred.task.timeout", timeout);

    // set the parameters to be available before a call to the mapper
    jobConf.setInt("popsize", pop);
    jobConf.setStrings("dataset", dataset);

    // int map = jobConf.getNumMapTasks();
    // System.out.println("Number of Maps"+ map);

    // start the  map/reduce job
    System.out.println("Starting Job");

    // get the start time for this job
    final long startTime = System.currentTimeMillis();

    // Submit the job, then poll for progress until the job is complete
    JobClient.runJob(jobConf);

    // get the end time for this job
    final long endTime = System.currentTimeMillis();

    // get the duration of this job
    final double duration = (endTime - startTime) / 1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    // getElapsedTime(startTime - endTime);

    return 0;
  }
Example #22
    public void configure(JobConf conf) {

      delim = conf.get("delim", ",");
      keyFieldIndexTwo = conf.getInt("keyFieldIndexTwo", 0);
    }
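Neither property is mandatory on the submitting side: when a key is absent, the second argument of get()/getInt() is returned. A tiny illustration of that fallback behaviour (generic JobConf usage, not code from the original job):

    // Default-value semantics of the getters used above (illustrative only).
    public static void main(String[] args) {
      JobConf conf = new JobConf();
      System.out.println(conf.get("delim", ","));             // "," because the key was never set
      conf.setInt("keyFieldIndexTwo", 3);
      System.out.println(conf.getInt("keyFieldIndexTwo", 0)); // 3: an explicit value beats the default
    }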
Example #23
  /** Execute a query plan using Hadoop. */
  @SuppressWarnings({"deprecation", "unchecked"})
  @Override
  public int execute(DriverContext driverContext) {

    IOPrepareCache ioPrepareCache = IOPrepareCache.get();
    ioPrepareCache.clear();

    boolean success = true;

    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    Path emptyScratchDir;

    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();

    try {
      if (ctx == null) {
        ctx = new Context(job);
        ctxCreated = true;
      }

      emptyScratchDir = ctx.getMRTmpPath();
      FileSystem fs = emptyScratchDir.getFileSystem(job);
      fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
      e.printStackTrace();
      console.printError(
          "Error launching map-reduce job",
          "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return 5;
    }

    HiveFileFormatUtils.prepareJobOutput(job);
    // See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);

    job.setMapperClass(ExecMapper.class);

    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);

    try {
      String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
      job.setPartitionerClass(JavaUtils.loadClass(partitioner));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage(), e);
    }

    if (mWork.getNumMapTasks() != null) {
      job.setNumMapTasks(mWork.getNumMapTasks().intValue());
    }

    if (mWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(
          job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue());
    }

    if (mWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(
          job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue());
    }

    if (mWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(
          job,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE,
          mWork.getMinSplitSizePerNode().longValue());
    }

    if (mWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(
          job,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK,
          mWork.getMinSplitSizePerRack().longValue());
    }

    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
    job.setReducerClass(ExecReducer.class);

    // set input format information if necessary
    setInputAttributes(job);

    // Turn on speculative execution for reducers
    boolean useSpeculativeExecReducers =
        HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
    HiveConf.setBoolVar(
        job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS, useSpeculativeExecReducers);

    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);

    if (mWork.isUseBucketizedHiveInputFormat()) {
      inpFormat = BucketizedHiveInputFormat.class.getName();
    }

    LOG.info("Using " + inpFormat);

    try {
      job.setInputFormat(JavaUtils.loadClass(inpFormat));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage(), e);
    }

    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
    // it
    String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
    String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
    if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
      String allJars =
          StringUtils.isNotBlank(auxJars)
              ? (StringUtils.isNotBlank(addedJars) ? addedJars + "," + auxJars : auxJars)
              : addedJars;
      LOG.info("adding libjars: " + allJars);
      initializeFiles("tmpjars", allJars);
    }

    // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
    String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
    if (StringUtils.isNotBlank(addedFiles)) {
      initializeFiles("tmpfiles", addedFiles);
    }
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));

    if (noName) {
      // This is for a special case to ensure unit tests pass
      HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
    }
    String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
    // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
    if (StringUtils.isNotBlank(addedArchives)) {
      initializeFiles("tmparchives", addedArchives);
    }

    try {
      MapredLocalWork localwork = mWork.getMapRedLocalWork();
      if (localwork != null && localwork.hasStagedAlias()) {
        if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
          Path localPath = localwork.getTmpPath();
          Path hdfsPath = mWork.getTmpHDFSPath();

          FileSystem hdfs = hdfsPath.getFileSystem(job);
          FileSystem localFS = localPath.getFileSystem(job);
          FileStatus[] hashtableFiles = localFS.listStatus(localPath);
          int fileNumber = hashtableFiles.length;
          String[] fileNames = new String[fileNumber];

          for (int i = 0; i < fileNumber; i++) {
            fileNames[i] = hashtableFiles[i].getPath().getName();
          }

          // package and compress all the hashtable files to an archive file
          String stageId = this.getId();
          String archiveFileName = Utilities.generateTarFileName(stageId);
          localwork.setStageID(stageId);

          CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
          Path archivePath = Utilities.generateTarPath(localPath, stageId);
          LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);

          // upload archive file to hdfs
          Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
          short replication = (short) job.getInt("mapred.submit.replication", 10);
          hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
          hdfs.setReplication(hdfsFilePath, replication);
          LOG.info("Upload 1 archive file  from" + archivePath + " to: " + hdfsFilePath);

          // add the archive file to distributed cache
          DistributedCache.createSymlink(job);
          DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
          LOG.info(
              "Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
        }
      }
      work.configureJobConf(job);
      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
      Utilities.setInputPaths(job, inputPaths);

      Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());

      if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
        try {
          handleSampling(ctx, mWork, job);
          job.setPartitionerClass(HiveTotalOrderPartitioner.class);
        } catch (IllegalStateException e) {
          console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        } catch (Exception e) {
          LOG.error("Sampling error", e);
          console.printError(
              e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        }
      }

      // remove the pwd from conf file so that job tracker doesn't show this
      // logs
      String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
      }
      JobClient jc = new JobClient(job);
      // make this client wait if job tracker is not behaving well.
      Throttle.checkJobTracker(job, LOG);

      if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
        // initialize stats publishing table
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(job);
        if (factory != null) {
          statsPublisher = factory.getStatsPublisher();
          List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
          if (rWork != null) {
            statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
          }
          StatsCollectionContext sc = new StatsCollectionContext(job);
          sc.setStatsTmpDirs(statsTmpDir);
          if (!statsPublisher.init(sc)) { // creating stats table if not exists
            if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
              throw new HiveException(
                  ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
          }
        }
      }

      Utilities.createTmpDirs(job, mWork);
      Utilities.createTmpDirs(job, rWork);

      SessionState ss = SessionState.get();
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
          && ss != null) {
        TezSessionState session = ss.getTezSession();
        TezSessionPoolManager.getInstance().close(session, true);
      }

      // Finally SUBMIT the JOB!
      rj = jc.submitJob(job);
      // replace it back
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
      }

      returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
      success = (returnVal == 0);
    } catch (Exception e) {
      e.printStackTrace();
      String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
      if (rj != null) {
        mesg = "Ended Job = " + rj.getJobID() + mesg;
      } else {
        mesg = "Job Submission failed" + mesg;
      }

      // Has to use full name to make sure it does not conflict with
      // org.apache.commons.lang.StringUtils
      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));

      success = false;
      returnVal = 1;
    } finally {
      Utilities.clearWork(job);
      try {
        if (ctxCreated) {
          ctx.clear();
        }

        if (rj != null) {
          if (returnVal != 0) {
            rj.killJob();
          }
          jobID = rj.getID().toString();
        }
      } catch (Exception e) {
        LOG.warn("Failed while cleaning up ", e);
      } finally {
        HadoopJobExecHelper.runningJobs.remove(rj);
      }
    }

    // get the list of Dynamic partition paths
    try {
      if (rj != null) {
        if (mWork.getAliasToWork() != null) {
          for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
            op.jobClose(job, success);
          }
        }
        if (rWork != null) {
          rWork.getReducer().jobClose(job, success);
        }
      }
    } catch (Exception e) {
      // jobClose needs to execute successfully otherwise fail task
      if (success) {
        success = false;
        returnVal = 3;
        String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
      }
    }

    return (returnVal);
  }
 public void configure(JobConf job) {
   int n = job.getInt("NodeCnt", 0);
   node.setType(PageRankNode.TYPE_COMPLETE);
   node.setPageRank((float) -StrictMath.log(n));
 }
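The -StrictMath.log(n) initialization suggests the rank is kept in log space: a uniform starting mass of 1/n becomes log(1/n) = -log(n). A quick numeric check (the node count is an assumed value):

  // Log-space initialization check, with an assumed node count.
  public static void main(String[] args) {
    int n = 100;
    float logRank = (float) -StrictMath.log(n);   // ≈ -4.6052
    double probability = StrictMath.exp(logRank); // ≈ 0.01, i.e. 1.0 / n
    System.out.println(logRank + " -> " + probability);
  }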
Example #25
 static int getMaxConcurrentSteps(JobConf jobConf) {
   return jobConf.getInt(MAX_CONCURRENT_STEPS, 0);
 }
Example #26
  public void configure(JobConf job) {
    memoryMXBean = ManagementFactory.getMemoryMXBean();
    l4j.info("maximum memory = " + memoryMXBean.getHeapMemoryUsage().getMax());

    try {
      l4j.info(
          "conf classpath = " + Arrays.asList(((URLClassLoader) job.getClassLoader()).getURLs()));
      l4j.info(
          "thread classpath = "
              + Arrays.asList(
                  ((URLClassLoader) Thread.currentThread().getContextClassLoader()).getURLs()));
    } catch (Exception e) {
      l4j.info("cannot get classpath: " + e.getMessage());
    }
    try {
      jc = job;
      int estCountBucketSize = jc.getInt("hive.exec.estdistinct.bucketsize.log", 15);
      int estCountBufferSize = jc.getInt("hive.exec.estdistinct.buffsize.log", 8);
      GenericUDAFCardinalityEstimation.initParams(estCountBucketSize, estCountBufferSize);

      boolean isChangSizeZero2Null =
          jc.getBoolean(
              HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_ZERO_SIZE_2_NULL.varname,
              HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_ZERO_SIZE_2_NULL.defaultBoolVal);
      boolean isChangeNull2Null =
          jc.getBoolean(
              HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_NULL_2_NULL.varname,
              HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_NULL_2_NULL.defaultBoolVal);
      GenericUDTFExplode.isChangSizeZero2Null = isChangSizeZero2Null;
      GenericUDTFExplode.isChangNull2Null = isChangeNull2Null;
      GenericUDTFPosExplode.isChangSizeZero2Null = isChangSizeZero2Null;
      GenericUDTFPosExplode.isChangNull2Null = isChangeNull2Null;

      mapredWork mrwork = Utilities.getMapRedWork(job);
      mo = new MapOperator();
      mo.setConf(mrwork);
      mo.setChildren(job);
      l4j.info(mo.dump(0));
      mo.initialize(jc, null);

      localWork = mrwork.getMapLocalWork();
      if (localWork == null) {
        return;
      }
      fetchOperators = new HashMap<String, FetchOperator>();
      for (Map.Entry<String, fetchWork> entry : localWork.getAliasToFetchWork().entrySet()) {
        fetchOperators.put(entry.getKey(), new FetchOperator(entry.getValue(), job));
        l4j.info("fetchoperator for " + entry.getKey() + " created");
      }
      for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
        Operator<? extends Serializable> forwardOp = localWork.getAliasToWork().get(entry.getKey());
        forwardOp.initialize(
            jc, new ObjectInspector[] {entry.getValue().getOutputObjectInspector()});
        l4j.info("fetchoperator for " + entry.getKey() + " initialized");
      }
    } catch (Throwable e) {
      abort = true;
      if (e instanceof OutOfMemoryError) {
        throw (OutOfMemoryError) e;
      } else {
        throw new RuntimeException("Map operator initialization failed", e);
      }
    }
  }
Example #27
  public MergeManagerImpl(
      TaskAttemptID reduceId,
      JobConf jobConf,
      FileSystem localFS,
      LocalDirAllocator localDirAllocator,
      Reporter reporter,
      CompressionCodec codec,
      Class<? extends Reducer> combinerClass,
      CombineOutputCollector<K, V> combineCollector,
      Counters.Counter spilledRecordsCounter,
      Counters.Counter reduceCombineInputCounter,
      Counters.Counter mergedMapOutputsCounter,
      ExceptionReporter exceptionReporter,
      Progress mergePhase,
      MapOutputFile mapOutputFile) {
    this.reduceId = reduceId;
    this.jobConf = jobConf;
    this.localDirAllocator = localDirAllocator;
    this.exceptionReporter = exceptionReporter;

    this.reporter = reporter;
    this.codec = codec;
    this.combinerClass = combinerClass;
    this.combineCollector = combineCollector;
    this.reduceCombineInputCounter = reduceCombineInputCounter;
    this.spilledRecordsCounter = spilledRecordsCounter;
    this.mergedMapOutputsCounter = mergedMapOutputsCounter;
    this.mapOutputFile = mapOutputFile;
    this.mapOutputFile.setConf(jobConf);

    this.localFS = localFS;
    this.rfs = ((LocalFileSystem) localFS).getRaw();

    final float maxInMemCopyUse = jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f);
    if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
      throw new IllegalArgumentException(
          "Invalid value for " + MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT + ": " + maxInMemCopyUse);
    }

    // Allow unit tests to fix Runtime memory
    this.memoryLimit =
        (long)
            (jobConf.getLong(
                    MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES,
                    Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE))
                * maxInMemCopyUse);

    this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100);

    final float singleShuffleMemoryLimitPercent =
        jobConf.getFloat(
            MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT);
    if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
      throw new IllegalArgumentException(
          "Invalid value for "
              + MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT
              + ": "
              + singleShuffleMemoryLimitPercent);
    }

    usedMemory = 0L;
    commitMemory = 0L;
    this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent);
    this.memToMemMergeOutputsThreshold =
        jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor);
    this.mergeThreshold =
        (long) (this.memoryLimit * jobConf.getFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f));
    LOG.info(
        "MergerManager: memoryLimit="
            + memoryLimit
            + ", "
            + "maxSingleShuffleLimit="
            + maxSingleShuffleLimit
            + ", "
            + "mergeThreshold="
            + mergeThreshold
            + ", "
            + "ioSortFactor="
            + ioSortFactor
            + ", "
            + "memToMemMergeOutputsThreshold="
            + memToMemMergeOutputsThreshold);

    if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
      throw new RuntimeException(
          "Invlaid configuration: "
              + "maxSingleShuffleLimit should be less than mergeThreshold"
              + "maxSingleShuffleLimit: "
              + this.maxSingleShuffleLimit
              + "mergeThreshold: "
              + this.mergeThreshold);
    }

    boolean allowMemToMemMerge = jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
    if (allowMemToMemMerge) {
      this.memToMemMerger =
          new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
      this.memToMemMerger.start();
    } else {
      this.memToMemMerger = null;
    }

    this.inMemoryMerger = createInMemoryMerger();
    this.inMemoryMerger.start();

    this.onDiskMerger = new OnDiskMerger(this);
    this.onDiskMerger.start();

    this.mergePhase = mergePhase;
  }
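All of the shuffle knobs read here are ordinary JobConf entries, so they can be tuned per job; note that the two percentage values are range-checked by the constructor. A tuning sketch using the same MRJobConfig constants (the numbers are examples, not recommendations):

   // Example shuffle tuning for the constructor above; values are illustrative only.
   static void tuneShuffle(JobConf jobConf) {
     jobConf.setFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.70f); // heap fraction for map outputs
     jobConf.setFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, 0.25f); // per-fetch in-memory limit
     jobConf.setFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.66f);        // start merging at 66% usage
     jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 50);                    // streams merged at once on disk
     jobConf.setBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);    // keep mem-to-mem merge off
   }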
Example #28
 @Override
 public void configure(JobConf conf) {
   nstripes = conf.getInt(PARAM_APS_STRIPES, 1);
 }
 public void configure(JobConf job) {
   // get buckets per crawler ...
   bucketsPerCrawler = job.getInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, 8);
 }
Example #30
    /**
     * Copy a file to a destination.
     *
     * @param srcstat src path and metadata
     * @param dstpath dst path
     * @param reporter
     */
    private void copy(
        FileStatus srcstat,
        Path relativedst,
        OutputCollector<WritableComparable<?>, Text> outc,
        Reporter reporter)
        throws IOException {
      Path absdst = new Path(destPath, relativedst);
      int totfiles = job.getInt(SRC_COUNT_LABEL, -1);
      assert totfiles >= 0 : "Invalid file count " + totfiles;

      // if a directory, ensure created even if empty
      if (srcstat.isDir()) {
        if (destFileSys.exists(absdst)) {
          if (!destFileSys.getFileStatus(absdst).isDir()) {
            throw new IOException("Failed to mkdirs: " + absdst + " is a file.");
          }
        } else if (!destFileSys.mkdirs(absdst)) {
          throw new IOException("Failed to mkdirs " + absdst);
        }
        // TODO: when modification times can be set, directories should be
        // emitted to reducers so they might be preserved. Also, mkdirs does
        // not currently return an error when the directory already exists;
        // if this changes, all directory work might as well be done in reduce
        return;
      }

      if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) {
        outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
        ++skipcount;
        reporter.incrCounter(Counter.SKIP, 1);
        updateStatus(reporter);
        return;
      }

      Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst);
      long cbcopied = 0L;
      FSDataInputStream in = null;
      FSDataOutputStream out = null;
      try {
        // open src file
        try {
          in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
        } catch (IOException e) {
          LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return");
          in = null;
          return;
        }
        reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
        // open tmp file
        out = create(tmpfile, reporter, srcstat);
        // copy file
        for (int cbread; (cbread = in.read(buffer)) >= 0; ) {
          out.write(buffer, 0, cbread);
          cbcopied += cbread;
          reporter.setStatus(
              String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
                  + absdst
                  + " [ "
                  + StringUtils.humanReadableInt(cbcopied)
                  + " / "
                  + StringUtils.humanReadableInt(srcstat.getLen())
                  + " ]");
        }
      } finally {
        checkAndClose(in);
        checkAndClose(out);
      }

      if (cbcopied != srcstat.getLen()) {
        if (srcstat.getLen() == 0 && cbcopied > 0) {
          LOG.info("most likely see a WAL file corruption: " + srcstat.getPath());
        } else {
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(cbcopied)
                  + " to tmpfile (="
                  + tmpfile
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
      } else {
        if (totfiles == 1) {
          // Copying a single file; use dst path provided by user as destination
          // rather than destination directory, if a file
          Path dstparent = absdst.getParent();
          if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) {
            absdst = dstparent;
          }
        }
        if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) {
          throw new IOException(absdst + " is a directory");
        }
        if (!destFileSys.mkdirs(absdst.getParent())) {
          throw new IOException("Failed to craete parent dir: " + absdst.getParent());
        }
        rename(tmpfile, absdst);

        FileStatus dststat = destFileSys.getFileStatus(absdst);
        if (dststat.getLen() != srcstat.getLen()) {
          destFileSys.delete(absdst, false);
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(dststat.getLen())
                  + " to dst (="
                  + absdst
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
        updatePermissions(srcstat, dststat);
      }

      // report at least once for each file
      ++copycount;
      reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
      reporter.incrCounter(Counter.COPY, 1);
      updateStatus(reporter);
    }