public void configure(JobConf job) {
   bytesToWrite = job.getLong(RandomTextWriter.BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
   keymin = job.getInt(RandomTextWriter.MIN_KEY, 5);
   keymax = job.getInt(RandomTextWriter.MAX_KEY, 10);
   valmin = job.getInt(RandomTextWriter.MIN_VALUE, 5);
   valmax = job.getInt(RandomTextWriter.MAX_VALUE, 10);
 }
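As a hedged, driver-side sketch of where these settings come from (values are illustrative; only the RandomTextWriter constant names are taken from the code above):

 public static void setRandomTextLimits(JobConf job) {
   // Illustrative values; the defaults in configure() apply when these are absent.
   job.setLong(RandomTextWriter.BYTES_PER_MAP, 512L * 1024 * 1024); // 512 MB per map
   job.setInt(RandomTextWriter.MIN_KEY, 5);
   job.setInt(RandomTextWriter.MAX_KEY, 10);
   job.setInt(RandomTextWriter.MIN_VALUE, 5);
   job.setInt(RandomTextWriter.MAX_VALUE, 10);
 }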
  public void configure(JobConf conf) {
    numberOfCenters = Integer.valueOf(conf.get("numberOfCenters"));
    centersDirectory = conf.get("centersReadDirectory");

    try {
      Configuration c = new Configuration();
      FileSystem fs = FileSystem.get(c);

      for (int index = 0; index < numberOfCenters; ++index) {
        SequenceFile.Reader reader =
            new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c);

        LongWritable key = new LongWritable();
        Point center = new Point();

        reader.next(key, center);

        centers.add(center);

        reader.close();
      }
    } catch (IOException e) {
      // Without the centers this mapper cannot produce meaningful output,
      // so fail fast instead of swallowing the error.
      throw new RuntimeException("Failed to read centers from " + centersDirectory, e);
    }
  }
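For reference, a minimal sketch of the writer side this configure() assumes: one SequenceFile per center under <centersReadDirectory>/centers/<index>, keyed by LongWritable with a Point value (Point is assumed to be the job's own Writable type; the helper name is made up):

  void writeCenters(List<Point> initialCenters, String centersDirectory) throws IOException {
    Configuration c = new Configuration();
    FileSystem fs = FileSystem.get(c);
    for (int index = 0; index < initialCenters.size(); ++index) {
      Path path = new Path(centersDirectory + "/centers/" + index);
      SequenceFile.Writer writer =
          SequenceFile.createWriter(fs, c, path, LongWritable.class, Point.class);
      try {
        writer.append(new LongWritable(index), initialCenters.get(index));
      } finally {
        writer.close();
      }
    }
  }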
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    // vv MaxTemperatureDriverV6
    conf.setProfileEnabled(true);
    conf.setProfileParams(
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
    conf.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(conf);
    return 0;
  }
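The three profiling setters used above write plain configuration properties, so the same effect can be had by setting the classic mapred.task.profile* keys directly; a sketch, not part of the original driver:

    JobConf conf = new JobConf();
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params",
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6,force=n,thread=y,verbose=n,file=%s");
    conf.set("mapred.task.profile.maps", "0-2"); // matches setProfileTaskRange(true, "0-2")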
  private void testMapFileOutputCommitterInternal(int version) throws Exception {
    JobConf conf = new JobConf();
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
    conf.setInt(
        org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
            .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
        version);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter();

    // setup
    committer.setupJob(jContext);
    committer.setupTask(tContext);

    // write output
    MapFileOutputFormat theOutputFormat = new MapFileOutputFormat();
    RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(null, conf, partFile, null);
    writeMapFileOutput(theRecordWriter, tContext);

    // do commit
    if (committer.needsTaskCommit(tContext)) {
      committer.commitTask(tContext);
    }
    committer.commitJob(jContext);

    // validate output
    validateMapFileOutputContent(FileSystem.get(conf), outDir);
    FileUtil.fullyDelete(new File(outDir.toString()));
  }
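Presumably the parameterized helper above is driven by one small test method per committer algorithm version; a sketch (the method names are assumptions):

  @Test
  public void testMapFileOutputCommitterV1() throws Exception {
    testMapFileOutputCommitterInternal(1);
  }

  @Test
  public void testMapFileOutputCommitterV2() throws Exception {
    testMapFileOutputCommitterInternal(2);
  }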
Example #5
 /** Set the max number of attempts before we declare a TIP as "failed" */
 private void setMaxTaskAttempts() {
   if (isMapTask()) {
     this.maxTaskAttempts = conf.getMaxMapAttempts();
   } else {
     this.maxTaskAttempts = conf.getMaxReduceAttempts();
   }
 }
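The two values read here come straight from the job configuration, so a driver can raise the framework default of 4 attempts before submission; for example (values illustrative):

  JobConf conf = new JobConf();
  conf.setMaxMapAttempts(8);    // mapred.map.max.attempts
  conf.setMaxReduceAttempts(8); // mapred.reduce.max.attempts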
Example #6
  /**
   * Start simulated task trackers based on topology.
   *
   * @param clusterStory the cluster topology.
   * @param jobConf configuration object.
   * @param now time stamp when the simulator is started, {@link SimulatorTaskTracker}s are started
   *     uniformly randomly spread in [now,now+startDuration).
   * @return time stamp by which the entire cluster is booted up and all task trackers are sending
   *     heartbeats at their steady rate.
   */
  long startTaskTrackers(ClusterStory cluster, JobConf jobConf, long now) {
    /** port assigned to TTs, incremented by 1 for each TT */
    int port = 10000;
    int numTaskTrackers = 0;

    Random random =
        new Random(RandomSeedGenerator.getSeed("forStartTaskTrackers()", masterRandomSeed));

    final int startDuration =
        jobConf.getInt("mumak.cluster.startup.duration", DEFAULT_CLUSTER_STARTUP_DURATION);

    for (MachineNode node : cluster.getMachines()) {
      jobConf.set("mumak.tasktracker.host.name", node.getName());
      jobConf.set(
          "mumak.tasktracker.tracker.name",
          "tracker_" + node.getName() + ":localhost/127.0.0.1:" + port);
      long subRandomSeed =
          RandomSeedGenerator.getSeed("forTaskTracker" + numTaskTrackers, masterRandomSeed);
      jobConf.setLong("mumak.tasktracker.random.seed", subRandomSeed);
      numTaskTrackers++;
      port++;
      SimulatorTaskTracker tt = new SimulatorTaskTracker(jt, jobConf);
      long firstHeartbeat = now + random.nextInt(startDuration);
      queue.addAll(tt.init(firstHeartbeat));
    }

    // Within startDuration plus one heartbeat interval of the full cluster,
    // each TT has started up and been told on its 2nd heartbeat to beat at
    // the rate corresponding to the steady state of the cluster.
    long clusterSteady = now + startDuration + jt.getNextHeartbeatInterval();
    return clusterSteady;
  }
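The spread of first heartbeats is governed by the startup-duration property read above, so a caller can widen the boot window before starting the simulated cluster; a sketch:

    // Illustrative: spread task tracker start-up over 60 s of simulated time.
    jobConf.setInt("mumak.cluster.startup.duration", 60 * 1000);
    long clusterBootedBy = startTaskTrackers(cluster, jobConf, now);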
  private void testAbortInternal(int version) throws IOException, InterruptedException {
    JobConf conf = new JobConf();
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
    conf.setInt(
        org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
            .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
        version);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter();

    // do setup
    committer.setupJob(jContext);
    committer.setupTask(tContext);

    // write output
    TextOutputFormat theOutputFormat = new TextOutputFormat();
    RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(null, conf, partFile, null);
    writeOutput(theRecordWriter, tContext);

    // do abort
    committer.abortTask(tContext);
    File out = new File(outDir.toUri().getPath());
    Path workPath = committer.getWorkPath(tContext, outDir);
    File wp = new File(workPath.toUri().getPath());
    File expectedFile = new File(wp, partFile);
    assertFalse("task temp dir still exists", expectedFile.exists());

    committer.abortJob(jContext, JobStatus.State.FAILED);
    expectedFile = new File(out, FileOutputCommitter.TEMP_DIR_NAME);
    assertFalse("job temp dir still exists", expectedFile.exists());
    assertEquals("Output directory not empty", 0, out.listFiles().length);
    FileUtil.fullyDelete(out);
  }
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      System.exit(2);
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath, true);
    Schema inputSchema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, inputSchema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
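OUTPUT_SCHEMA is referenced but not defined in this excerpt; for a <String, Long> count job it would plausibly be a Pair schema like the map-output schema above. A hedged sketch, not the project's actual definition:

  static final Schema OUTPUT_SCHEMA =
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG));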
Example #9
  static List<String> getClassPaths(
      JobConf conf, File workDir, TaskDistributedCacheManager taskDistributedCacheManager)
      throws IOException {
    // Accumulates class paths for child.
    List<String> classPaths = new ArrayList<String>();

    boolean userClassesTakesPrecedence = conf.userClassesTakesPrecedence();

    if (!userClassesTakesPrecedence) {
      // start with same classpath as parent process
      appendSystemClasspaths(classPaths);
    }

    // include the user specified classpath
    appendJobJarClasspaths(conf.getJar(), classPaths);

    // Distributed cache paths
    if (taskDistributedCacheManager != null)
      classPaths.addAll(taskDistributedCacheManager.getClassPaths());

    // Include the working dir too
    classPaths.add(workDir.toString());

    if (userClassesTakesPrecedence) {
      // parent process's classpath is added last
      appendSystemClasspaths(classPaths);
    }

    return classPaths;
  }
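A caller would typically join the returned entries with the platform path separator to build the child JVM's -classpath argument; a minimal sketch:

    StringBuilder classPath = new StringBuilder();
    for (String entry : getClassPaths(conf, workDir, taskDistributedCacheManager)) {
      if (classPath.length() > 0) {
        classPath.append(File.pathSeparatorChar);
      }
      classPath.append(entry);
    }
    String childClassPath = classPath.toString();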
Example #10
  // Mostly for setting up the symlinks. Note that when we set up the distributed
  // cache we did not create the symlinks; that is done on a per-task basis by the
  // currently executing task.
  public static void setupWorkDir(JobConf conf) throws IOException {
    File workDir = new File(".").getAbsoluteFile();
    FileUtil.fullyDelete(workDir);
    if (DistributedCache.getSymlink(conf)) {
      URI[] archives = DistributedCache.getCacheArchives(conf);
      URI[] files = DistributedCache.getCacheFiles(conf);
      Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
      Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
      if (archives != null) {
        for (int i = 0; i < archives.length; i++) {
          String link = archives[i].getFragment();
          if (link != null) {
            link = workDir.toString() + Path.SEPARATOR + link;
            File flink = new File(link);
            if (!flink.exists()) {
              FileUtil.symLink(localArchives[i].toString(), link);
            }
          }
        }
      }
      if (files != null) {
        for (int i = 0; i < files.length; i++) {
          String link = files[i].getFragment();
          if (link != null) {
            link = workDir.toString() + Path.SEPARATOR + link;
            File flink = new File(link);
            if (!flink.exists()) {
              FileUtil.symLink(localFiles[i].toString(), link);
            }
          }
        }
      }
    }
    File jobCacheDir = null;
    if (conf.getJar() != null) {
      jobCacheDir = new File(new Path(conf.getJar()).getParent().toString());
    }

    // create symlinks for all the files in the job cache dir in the current
    // working dir for streaming
    try {
      DistributedCache.createAllSymlink(conf, jobCacheDir, workDir);
    } catch (IOException ie) {
      // Do not exit even if symlinks have not been created.
      LOG.warn(StringUtils.stringifyException(ie));
    }
    // add java.io.tmpdir given by mapred.child.tmp
    String tmp = conf.get("mapred.child.tmp", "./tmp");
    Path tmpDir = new Path(tmp);

    // if temp directory path is not absolute
    // prepend it with workDir.
    if (!tmpDir.isAbsolute()) {
      tmpDir = new Path(workDir.toString(), tmp);
      FileSystem localFs = FileSystem.getLocal(conf);
      if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) {
        throw new IOException("Mkdirs failed to create " + tmpDir.toString());
      }
    }
  }
  public void configure(JobConf conf) {
    /*
     * Reads the configuration values and distributed cache files prepared by the driver.
     */

    // Read number of nodes in input layer and output layer from configuration
    inputNumdims = conf.get("numdims");
    inputNumhid = conf.get("numhid");

    // Read the weights from distributed cache
    Path[] pathwaysFiles = new Path[0];
    try {
      pathwaysFiles = DistributedCache.getLocalCacheFiles(conf);
      for (Path path : pathwaysFiles) {
        /*
         * This loop reads every distributed cache file; in practice the driver
         * ensures there is exactly one.
         */
        BufferedReader fis = new BufferedReader(new FileReader(path.toString()));
        weightline = fis.readLine();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
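The driver-side counterpart this configure() relies on might look like the sketch below: the two string settings plus exactly one file placed in the distributed cache (property values and the file path are illustrative):

    JobConf conf = new JobConf();
    conf.set("numdims", "784");
    conf.set("numhid", "500");
    // Exactly one weights file in the cache, as the mapper's configure() expects.
    DistributedCache.addCacheFile(new Path("/rbm/weights.txt").toUri(), conf);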
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
      return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
  }
Example #13
  /**
   * Driver to copy srcPath to destPath depending on required protocol.
   *
   * @param args arguments
   */
  static void copy(final Configuration conf, final Arguments args) throws IOException {
    LOG.info("srcPaths=" + args.srcs);
    LOG.info("destPath=" + args.dst);
    checkSrcPath(conf, args.srcs);

    JobConf job = createJobConf(conf);
    if (args.preservedAttributes != null) {
      job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
    }
    if (args.mapredSslConf != null) {
      job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
    }

    // Initialize the mapper
    try {
      setup(conf, job, args);
      JobClient.runJob(job);
      finalize(conf, job, args.dst, args.preservedAttributes);
    } finally {
      // delete tmp
      fullyDelete(job.get(TMP_DIR_LABEL), job);
      // delete jobDirectory
      fullyDelete(job.get(JOB_DIR_LABEL), job);
    }
  }
  private void testMapOnlyNoOutputInternal(int version) throws Exception {
    JobConf conf = new JobConf();
    // This is not set on purpose. FileOutputFormat.setOutputPath(conf, outDir);
    conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
    conf.setInt(
        org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
            .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
        version);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter();

    // setup
    committer.setupJob(jContext);
    committer.setupTask(tContext);

    if (committer.needsTaskCommit(tContext)) {
      // do commit
      committer.commitTask(tContext);
    }
    committer.commitJob(jContext);

    // validate output
    FileUtil.fullyDelete(new File(outDir.toString()));
  }
Example #15
    public Job(JobID jobid, String jobSubmitDir) throws IOException {
      this.systemJobDir = new Path(jobSubmitDir);
      this.systemJobFile = new Path(systemJobDir, "job.xml");
      this.id = jobid;

      this.localFs = FileSystem.getLocal(conf);

      this.localJobDir = localFs.makeQualified(conf.getLocalPath(jobDir));
      this.localJobFile = new Path(this.localJobDir, id + ".xml");

      // Manage the distributed cache.  If there are files to be copied,
      // this will trigger the local job file to be re-written.
      this.trackerDistributedCacheManager =
          new TrackerDistributedCacheManager(conf, taskController);
      this.taskDistributedCacheManager =
          trackerDistributedCacheManager.newTaskDistributedCacheManager(jobid, conf);
      taskDistributedCacheManager.setupCache(conf, "archive", "archive");

      if (DistributedCache.getSymlink(conf)) {
        // This is not supported largely because,
        // for a Child subprocess, the cwd in LocalJobRunner
        // is not a fresh slate, but rather the user's working directory.
        // This is further complicated because the logic in
        // setupWorkDir only creates symlinks if there's a jarfile
        // in the configuration.
        LOG.warn("LocalJobRunner does not support " + "symlinking into current working dir.");
      }
      // Setup the symlinks for the distributed cache.
      TaskRunner.setupWorkDir(conf, new File(localJobDir.toUri()).getAbsoluteFile());

      // Write out configuration file.  Instead of copying it from
      // systemJobFile, we re-write it, since setup(), above, may have
      // updated it.
      OutputStream out = localFs.create(localJobFile);
      try {
        conf.writeXml(out);
      } finally {
        out.close();
      }
      this.job = new JobConf(localJobFile);

      // Job (the current object) is a Thread, so we wrap its class loader.
      if (!taskDistributedCacheManager.getClassPaths().isEmpty()) {
        setContextClassLoader(taskDistributedCacheManager.makeClassLoader(getContextClassLoader()));
      }

      profile =
          new JobProfile(
              job.getUser(),
              id,
              systemJobFile.toString(),
              "http://localhost:8080/",
              job.getJobName());
      status = new JobStatus(id, 0.0f, 0.0f, JobStatus.RUNNING);

      jobs.put(id, this);

      this.start();
    }
Example #16
  private MiniMRCluster startCluster(JobConf conf, int numTrackers) throws IOException {
    conf.setLong("mapred.job.tracker.retiredjobs.cache.size", 1);
    conf.setLong("mapred.jobtracker.retirejob.interval", 0);
    conf.setLong("mapred.jobtracker.retirejob.check", 0);
    conf.setLong("mapred.jobtracker.completeuserjobs.maximum", 0);

    return new MiniMRCluster(0, 0, numTrackers, "file:///", 1, null, null, null, conf, 0);
  }
Example #17
 public void configure(JobConf job) {
   this.jobConf = job;
   urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
   interval = jobConf.getInt("db.fetch.interval.default", 2592000);
   filters = new URLFilters(jobConf);
   scfilters = new ScoringFilters(jobConf);
   scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
   curTime = job.getLong("injector.current.time", System.currentTimeMillis());
 }
Example #18
  @Test(timeout = 20000)
  public void testWarnCommandOpts() throws Exception {
    Logger logger = Logger.getLogger(YARNRunner.class);

    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    Layout layout = new SimpleLayout();
    Appender appender = new WriterAppender(layout, bout);
    logger.addAppender(appender);

    JobConf jobConf = new JobConf();

    jobConf.set(
        MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
        "-Djava.net.preferIPv4Stack=true -Djava.library.path=foo");
    jobConf.set(MRJobConfig.MR_AM_COMMAND_OPTS, "-Xmx1024m -Djava.library.path=bar");

    YARNRunner yarnRunner = new YARNRunner(jobConf);

    File jobxml = new File(testWorkDir, MRJobConfig.JOB_CONF_FILE);
    OutputStream out = new FileOutputStream(jobxml);
    conf.writeXml(out);
    out.close();

    File jobsplit = new File(testWorkDir, MRJobConfig.JOB_SPLIT);
    out = new FileOutputStream(jobsplit);
    out.close();

    File jobsplitmetainfo = new File(testWorkDir, MRJobConfig.JOB_SPLIT_METAINFO);
    out = new FileOutputStream(jobsplitmetainfo);
    out.close();

    File appTokens = new File(testWorkDir, MRJobConfig.APPLICATION_TOKENS_FILE);
    out = new FileOutputStream(appTokens);
    out.close();

    @SuppressWarnings("unused")
    ApplicationSubmissionContext submissionContext =
        yarnRunner.createApplicationSubmissionContext(
            jobConf, testWorkDir.toString(), new Credentials());

    String logMsg = bout.toString();
    assertTrue(
        logMsg.contains(
            "WARN - Usage of -Djava.library.path in "
                + "yarn.app.mapreduce.am.admin-command-opts can cause programs to no "
                + "longer function if hadoop native libraries are used. These values "
                + "should be set as part of the LD_LIBRARY_PATH in the app master JVM "
                + "env using yarn.app.mapreduce.am.admin.user.env config settings."));
    assertTrue(
        logMsg.contains(
            "WARN - Usage of -Djava.library.path in "
                + "yarn.app.mapreduce.am.command-opts can cause programs to no longer "
                + "function if hadoop native libraries are used. These values should "
                + "be set as part of the LD_LIBRARY_PATH in the app master JVM env "
                + "using yarn.app.mapreduce.am.env config settings."));
  }
Example #19
 @Override
 protected void setUp() throws Exception {
   JobConf conf = new JobConf();
   conf.set(JTConfig.JT_IPC_ADDRESS, "localhost:0");
   conf.set(JTConfig.JT_HTTP_ADDRESS, "0.0.0.0:0");
   conf.setLong(JTConfig.JT_TRACKER_EXPIRY_INTERVAL, 1000);
   conf.set(JTConfig.JT_MAX_TRACKER_BLACKLISTS, "1");
   jobTracker = new FakeJobTracker(conf, (clock = new FakeClock()), trackers);
   jobTracker.startExpireTrackersThread();
 }
Example #20
 /**
  * Calculate how many maps to run. The number of maps is the cumulative size of the copy
  * divided by distcp.bytes.per.map (default BYTES_PER_MAP, or -m on the command line),
  * capped at distcp.max.map.tasks (default MAX_MAPS_PER_NODE * nodes in the cluster) and
  * never less than 1.
  *
  * @param totalBytes Count of total bytes for job
  * @param job The job to configure
  */
 private static void setMapCount(long totalBytes, JobConf job) throws IOException {
   int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
   numMaps =
       Math.min(
           numMaps,
           job.getInt(
               MAX_MAPS_LABEL,
               MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers()));
   job.setNumMapTasks(Math.max(numMaps, 1));
 }
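As a worked illustration of the sizing above (all numbers made up): copying 10 GB with 256 MB per map on a 20-tracker cluster, assuming MAX_MAPS_PER_NODE is 20:

   long totalBytes = 10L * 1024 * 1024 * 1024;
   int bySize = (int) (totalBytes / (256L * 1024 * 1024)); // 40 maps by data size
   int byCluster = 20 * 20;                                // 400 maps by cluster cap
   int numMaps = Math.max(Math.min(bySize, byCluster), 1); // -> 40 map tasks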
Example #21
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
 public TaskTrackerMetricsInst(TaskTracker t) {
   super(t);
   JobConf conf = tt.getJobConf();
   String sessionId = conf.getSessionId();
   // Initiate Java VM Metrics
   JvmMetrics.init("TaskTracker", sessionId);
   // Create a record for Task Tracker metrics
   MetricsContext context = MetricsUtil.getContext("mapred");
   metricsRecord = MetricsUtil.createRecord(context, "tasktracker"); // guaranteed never null
   metricsRecord.setTag("sessionId", sessionId);
   context.registerUpdater(this);
 }
 /** Configure a job given argv. */
 public static boolean parseArgs(String[] argv, JobConf job) throws IOException {
   if (argv.length < 1) {
     return 0 == printUsage();
   }
   for (int i = 0; i < argv.length; ++i) {
     if (argv.length == i + 1) {
       System.out.println("ERROR: Required parameter missing from " + argv[i]);
       return 0 == printUsage();
     }
     try {
       if ("-m".equals(argv[i])) {
         job.setNumMapTasks(Integer.parseInt(argv[++i]));
       } else if ("-r".equals(argv[i])) {
         job.setNumReduceTasks(Integer.parseInt(argv[++i]));
       } else if ("-inFormat".equals(argv[i])) {
         job.setInputFormat(Class.forName(argv[++i]).asSubclass(InputFormat.class));
       } else if ("-outFormat".equals(argv[i])) {
         job.setOutputFormat(Class.forName(argv[++i]).asSubclass(OutputFormat.class));
       } else if ("-outKey".equals(argv[i])) {
         job.setOutputKeyClass(Class.forName(argv[++i]).asSubclass(WritableComparable.class));
       } else if ("-outValue".equals(argv[i])) {
         job.setOutputValueClass(Class.forName(argv[++i]).asSubclass(Writable.class));
       } else if ("-keepmap".equals(argv[i])) {
         job.set(
             org.apache.hadoop.mapreduce.GenericMRLoadGenerator.MAP_PRESERVE_PERCENT, argv[++i]);
       } else if ("-keepred".equals(argv[i])) {
         job.set(
             org.apache.hadoop.mapreduce.GenericMRLoadGenerator.REDUCE_PRESERVE_PERCENT,
             argv[++i]);
       } else if ("-outdir".equals(argv[i])) {
         FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
       } else if ("-indir".equals(argv[i])) {
         FileInputFormat.addInputPaths(job, argv[++i]);
       } else if ("-inFormatIndirect".equals(argv[i])) {
         job.setClass(
             org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FORMAT,
             Class.forName(argv[++i]).asSubclass(InputFormat.class),
             InputFormat.class);
         job.setInputFormat(IndirectInputFormat.class);
       } else {
         System.out.println("Unexpected argument: " + argv[i]);
         return 0 == printUsage();
       }
     } catch (NumberFormatException except) {
       System.out.println("ERROR: Integer expected instead of " + argv[i]);
       return 0 == printUsage();
     } catch (Exception e) {
       throw (IOException) new IOException().initCause(e);
     }
   }
   return true;
 }
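An illustrative invocation of parseArgs() (paths and counts are placeholders):

  JobConf job = new JobConf();
  boolean ok = parseArgs(new String[] {
      "-m", "10",
      "-r", "2",
      "-indir", "/data/load/in",
      "-outdir", "/data/load/out",
      "-keepmap", "50"
  }, job);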
 private long getTaskMemoryLimit(TaskAttemptID tid) {
   JobConf conf;
   synchronized (this.taskTracker) {
     conf = this.taskTracker.tasks.get(tid).getJobConf();
   }
   long taskMemoryLimit =
       tid.isMap()
           ? conf.getInt(
               JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT)
           : conf.getInt(
               JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT);
   return taskMemoryLimit * 1024 * 1024L;
 }
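The two per-task limits read back here are normally set on the job before submission; a sketch using the same JobConf property constants (values illustrative):

  JobConf conf = new JobConf();
  conf.setInt(JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, 2048);    // 2 GB per map task
  conf.setInt(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, 4096); // 4 GB per reduce task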
Example #25
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
      long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
      String srcfilelist = job.get(SRC_LIST_LABEL, "");
      if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + cnfiles
                + ") total_size("
                + cbsize
                + ") listuri("
                + srcfilelist
                + ")");
      }
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(job);
      FileStatus srcst = fs.getFileStatus(src);

      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
      LongWritable key = new LongWritable();
      FilePair value = new FilePair();
      final long targetsize = cbsize / numSplits;
      long pos = 0L;
      long last = 0L;
      long acc = 0L;
      long cbrem = srcst.getLen();
      SequenceFile.Reader sl = null;
      try {
        sl = new SequenceFile.Reader(fs, src, job);
        for (; sl.next(key, value); last = sl.getPosition()) {
          // if adding this split would put this split past the target size,
          // cut the last split and put this next file in the next split.
          if (acc + key.get() > targetsize && acc != 0) {
            long splitsize = last - pos;
            splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
            cbrem -= splitsize;
            pos = last;
            acc = 0L;
          }
          acc += key.get();
        }
      } finally {
        checkAndClose(sl);
      }
      if (cbrem != 0) {
        splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
      }

      return splits.toArray(new FileSplit[splits.size()]);
    }
Example #26
 static JobConf configureJobConf(
     JobConf conf,
     String namenode,
     int jobTrackerPort,
     int jobTrackerInfoPort,
     UserGroupInformation ugi) {
   JobConf result = new JobConf(conf);
   FileSystem.setDefaultUri(result, namenode);
   result.set("mapred.job.tracker", "localhost:" + jobTrackerPort);
   result.set("mapred.job.tracker.http.address", "127.0.0.1:" + jobTrackerInfoPort);
   // for debugging have all task output sent to the test output
   JobClient.setTaskOutputFilter(result, JobClient.TaskStatusFilter.ALL);
   return result;
 }
  public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
      conf.setNumReduceTasks(Integer.parseInt(args[3]));
    }

    try {
      // We store the cassandra storage-conf.xml on the HDFS cluster
      DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  public void testFailAbort() throws IOException {
    JobConf job = new JobConf();
    job.set(FileSystem.FS_DEFAULT_NAME_KEY, "faildel:///");
    job.setClass("fs.faildel.impl", FakeFileSystem.class, FileSystem.class);
    setConfForFileOutputCommitter(job);
    JobContext jContext = new JobContextImpl(job, taskID.getJobID());
    TaskAttemptContext tContext = new TaskAttemptContextImpl(job, taskID);
    FileOutputCommitter committer = new FileOutputCommitter();
    FileOutputFormat.setWorkOutputPath(job, committer.getTempTaskOutputPath(tContext));

    // do setup
    committer.setupJob(jContext);
    committer.setupTask(tContext);
    String file = "test.txt";

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    // write output
    FileSystem localFs = new FakeFileSystem();
    TextOutputFormat theOutputFormat = new TextOutputFormat();
    RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(localFs, job, file, reporter);
    writeOutput(theRecordWriter, reporter);

    // do abort
    Throwable th = null;
    try {
      committer.abortTask(tContext);
    } catch (IOException ie) {
      th = ie;
    }
    assertNotNull(th);
    assertTrue(th instanceof IOException);
    assertTrue(th.getMessage().contains("fake delete failed"));
    File jobTmpDir = new File(new Path(outDir, FileOutputCommitter.TEMP_DIR_NAME).toString());
    File taskTmpDir = new File(jobTmpDir, "_" + taskID);
    File expectedFile = new File(taskTmpDir, file);
    assertTrue(expectedFile + " does not exist", expectedFile.exists());

    th = null;
    try {
      committer.abortJob(jContext, JobStatus.State.FAILED);
    } catch (IOException ie) {
      th = ie;
    }
    assertNotNull(th);
    assertTrue(th instanceof IOException);
    assertTrue(th.getMessage().contains("fake delete failed"));
    assertTrue("job temp dir does not exist", jobTmpDir.exists());
  }
Example #29
 static JobConf configureJobConf(
     JobConf conf,
     String namenode,
     int jobTrackerPort,
     int jobTrackerInfoPort,
     UserGroupInformation ugi) {
   JobConf result = new JobConf(conf);
   FileSystem.setDefaultUri(result, namenode);
   result.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME);
   result.set(JTConfig.JT_IPC_ADDRESS, "localhost:" + jobTrackerPort);
   result.set(JTConfig.JT_HTTP_ADDRESS, "127.0.0.1:" + jobTrackerInfoPort);
   // for debugging have all task output sent to the test output
   JobClient.setTaskOutputFilter(result, JobClient.TaskStatusFilter.ALL);
   return result;
 }
Example #30
  private static void finalize(
      Configuration conf, JobConf jobconf, final Path destPath, String preservedAttributes)
      throws IOException {
    if (preservedAttributes == null) {
      return;
    }
    EnumSet<FileAttribute> preserved = FileAttribute.parse(preservedAttributes);
    if (!preserved.contains(FileAttribute.USER)
        && !preserved.contains(FileAttribute.GROUP)
        && !preserved.contains(FileAttribute.PERMISSION)) {
      return;
    }

    FileSystem dstfs = destPath.getFileSystem(conf);
    Path dstdirlist = new Path(jobconf.get(DST_DIR_LIST_LABEL));
    SequenceFile.Reader in = null;
    try {
      in = new SequenceFile.Reader(dstdirlist.getFileSystem(jobconf), dstdirlist, jobconf);
      Text dsttext = new Text();
      FilePair pair = new FilePair();
      for (; in.next(dsttext, pair); ) {
        Path absdst = new Path(destPath, pair.output);
        updatePermissions(pair.input, dstfs.getFileStatus(absdst), preserved, dstfs);
      }
    } finally {
      checkAndClose(in);
    }
  }