Example #1
 /**
  * Generate the requested number of file splits, with the filename set to the filename of the
  * output file.
  */
 public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
   // Set the number of input splits.
   JobClient client = new JobClient(job);
   ClusterStatus cluster = client.getClusterStatus();
   // If the property is not set, the default value is returned.
   int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
   long numBytesToWritePerMap =
       job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
   if (numBytesToWritePerMap == 0) {
     // Fail fast: a zero value would cause a divide-by-zero below.
     throw new IOException("Cannot have test.randomwrite.bytes_per_map set to 0");
   }
   long totalBytesToWrite =
       job.getLong(
           "test.randomwrite.total_bytes",
           numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
   int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
   if (numMaps == 0 && totalBytesToWrite > 0) {
     numMaps = 1;
   }
   System.out.println("numMaps-------" + numMaps);
   InputSplit[] result = new InputSplit[numMaps];
   Path outDir = FileOutputFormat.getOutputPath(job);
   for (int i = 0; i < result.length; ++i) {
     result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
   }
   return result;
 }
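For context, a minimal driver-side sketch (not part of the original example) of how the properties read by getSplits above could be set; the concrete values and the output directory are hypothetical, only the property names come from the code, and the usual org.apache.hadoop.fs and org.apache.hadoop.mapred imports are assumed.

  // Minimal sketch (assumed driver code); values and the output path are hypothetical.
  static JobConf buildRandomWriteConf() {
    JobConf job = new JobConf();
    job.setInt("test.randomwriter.maps_per_host", 10);                    // maps per task tracker
    job.setLong("test.randomwrite.bytes_per_map", 128L * 1024 * 1024);    // 128 MB per map
    FileOutputFormat.setOutputPath(job, new Path("/tmp/randomwrite-out")); // hypothetical path
    return job; // getSplits(job, ...) then derives the number of maps from these settings
  }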
Example #2
 /**
  * When injected into a class, this method modifies the job's output path by appending a
  * timestamp, but only if the output is written to HDFS.
  *
  * @param job Job whose output path needs to be changed
  */
 public static void modifyOutputPath(JobConf job) {
   Path path = org.apache.hadoop.mapred.FileOutputFormat.getOutputPath(job);
   if (path == null) {
      throw new IllegalArgumentException("Job output path is null, expected a non-null path");
   }
   StringBuilder out = new StringBuilder(path.toString());
   out.append(SEPARATOR_UNDERSCORE).append(System.currentTimeMillis());
   org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, new Path(out.toString()));
 }
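A hedged usage sketch for modifyOutputPath, assuming SEPARATOR_UNDERSCORE expands to an underscore; the HDFS directory below is hypothetical.

  // Minimal sketch (assumption: SEPARATOR_UNDERSCORE is "_"); the path is hypothetical.
  JobConf job = new JobConf();
  org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, new Path("hdfs:///user/demo/out"));
  modifyOutputPath(job);
  // The output path now ends with an underscore and the current time in milliseconds,
  // e.g. hdfs:///user/demo/out_1700000000000
  System.out.println(org.apache.hadoop.mapred.FileOutputFormat.getOutputPath(job));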
Example #3
     public RecordWriter<WritableComparable, Writable> getRecordWriter(
        final FileSystem fs, JobConf job, String name, final Progressable progress)
        throws IOException {

      final Path segmentDumpFile = new Path(FileOutputFormat.getOutputPath(job), name);

      // Get the old copy out of the way
      if (fs.exists(segmentDumpFile)) fs.delete(segmentDumpFile, true);

      final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile));
      return new RecordWriter<WritableComparable, Writable>() {
        public synchronized void write(WritableComparable key, Writable value) throws IOException {
          printStream.println(value);
        }

        public synchronized void close(Reporter reporter) throws IOException {
          printStream.close();
        }
      };
    }
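A hedged sketch of calling the getRecordWriter method above directly; TextDumpOutputFormat is a hypothetical name for the (unnamed) enclosing OutputFormat class, and the output directory is made up.

       // Minimal sketch; "TextDumpOutputFormat" stands in for the enclosing class.
       JobConf job = new JobConf();
       FileOutputFormat.setOutputPath(job, new Path("/tmp/segment-dump")); // hypothetical path
       FileSystem fs = FileSystem.get(job);
       RecordWriter<WritableComparable, Writable> writer =
           new TextDumpOutputFormat().getRecordWriter(fs, job, "part-00000", Reporter.NULL);
       writer.write(new Text("url"), new Text("one dumped record per line"));
       writer.close(Reporter.NULL);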
Example #4
  public void configure(JobConf job) {

    _jobConf = job;

    crawlerCount =
        job.getInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, CrawlEnvironment.CRAWLERS.length);

    partitionNumber = job.getInt("mapred.task.partition", -1);

    try {
      FileSystem fs = FileSystem.get(job);
      Path workPath = FileOutputFormat.getOutputPath(job);
      debugURLStream =
          fs.create(new Path(workPath, "debugURLS-" + NUMBER_FORMAT.format(partitionNumber)));
      urlDebugURLWriter = new OutputStreamWriter(debugURLStream, Charset.forName("UTF-8"));
      _emittedURLSFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
  }
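As a hedged illustration, the debug stream opened in configure() above could later be used from the same task roughly like this; the recordDebugURL helper is hypothetical.

  // Minimal sketch; this hypothetical helper appends one line per URL to the
  // debug file opened in configure().
  private void recordDebugURL(String url) throws IOException {
    urlDebugURLWriter.write(url);
    urlDebugURLWriter.write('\n');
  }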
Example #5
    static void checkRecords(
        Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput)
        throws IOException {
      JobConf jobConf = new JobConf(defaults, RecordChecker.class);
      jobConf.setJobName("sortvalidate-record-checker");

      jobConf.setInputFormat(SequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(BytesWritable.class);
      jobConf.setOutputValueClass(IntWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setReducerClass(Reduce.class);

      JobClient client = new JobClient(jobConf);
      ClusterStatus cluster = client.getClusterStatus();
      if (noMaps == -1) {
        noMaps = cluster.getTaskTrackers() * jobConf.getInt("test.sortvalidate.maps_per_host", 10);
      }
      if (noReduces == -1) {
        noReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sortvalidate.reduces_per_host");
        if (sortReduces != null) {
          noReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
      }
      jobConf.setNumMapTasks(noMaps);
      jobConf.setNumReduceTasks(noReduces);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
      FileSystem fs = FileSystem.get(defaults);
      if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // jobConf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordChecker: Running on "
              + cluster.getTaskTrackers()
              + " nodes to validate sort from "
              + inputPaths[0]
              + ", "
              + inputPaths[1]
              + " into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with "
              + noReduces
              + " reduces.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date endTime = new Date();
      System.out.println("Job ended: " + endTime);
      System.out.println(
          "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    }
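A hedged sketch of invoking checkRecords above; the paths are hypothetical, and passing -1 for both task counts lets the method size them from the cluster status, as shown in the code.

    // Minimal sketch (hypothetical paths); -1/-1 means "derive task counts from the cluster".
    Configuration defaults = new Configuration();
    checkRecords(defaults, -1, -1, new Path("/tmp/sort-in"), new Path("/tmp/sort-out"));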
Example #6
    static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
        throws IOException {
      FileSystem inputfs = sortInput.getFileSystem(defaults);
      FileSystem outputfs = sortOutput.getFileSystem(defaults);
      FileSystem defaultfs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
      int noSortInputpaths = inputfs.listStatus(sortInput).length;

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);

      jobConf.setNumMapTasks(noSortReduceTasks);
      jobConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (defaultfs.exists(outputPath)) {
        defaultfs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // jobConf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordStatsChecker: Validate sort "
              + "from "
              + inputPaths[0]
              + " ("
              + noSortInputpaths
              + " files), "
              + inputPaths[1]
              + " ("
              + noSortReduceTasks
              + " files) into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date endTime = new Date();
      System.out.println("Job ended: " + endTime);
      System.out.println(
          "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      SequenceFile.Reader stats =
          new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      if (!stats.next(k1, v1)) {
        throw new IOException("Failed to read record #1 from reduce's output");
      }
      if (!stats.next(k2, v2)) {
        throw new IOException("Failed to read record #2 from reduce's output");
      }

      if ((v1.getBytes() != v2.getBytes())
          || (v1.getRecords() != v2.getRecords())
          || v1.getChecksum() != v2.getChecksum()) {
        throw new IOException(
            "("
                + v1.getBytes()
                + ", "
                + v1.getRecords()
                + ", "
                + v1.getChecksum()
                + ") v/s ("
                + v2.getBytes()
                + ", "
                + v2.getRecords()
                + ", "
                + v2.getChecksum()
                + ")");
      }
    }
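Similarly, a hedged sketch of invoking this record-stats variant; the paths are hypothetical and must point at the sort job's actual input and output directories.

    // Minimal sketch (hypothetical paths); throws IOException when the input and
    // output statistics (bytes, records, checksum) do not match.
    Configuration defaults = new Configuration();
    checkRecords(defaults, new Path("/tmp/sort-in"), new Path("/tmp/sort-out"));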