Пример #1
0
 /**
  * Configures the output location and output compression of {@code job}.
  *
  * <p>The text-output separator is set to the empty string and the job's output path is set to
  * {@code location}. Compression is then chosen as follows:
  *
  * <ul>
  *   <li>if the job property {@code output.compression.enabled} equals {@code "true"}, compression
  *       is enabled with the codec class named by {@code output.compression.codec};
  *   <li>otherwise the suffix of {@code location} decides: {@code .bz2}/{@code .bz} selects
  *       {@link BZip2Codec}, {@code .gz} selects {@link GzipCodec}, anything else disables
  *       compression.
  * </ul>
  *
  * @param location output path; its suffix is used to pick a codec when compression is not
  *     explicitly enabled
  * @param job job whose configuration is updated
  * @throws IOException declared by the overridden method; may be raised while configuring the job
  * @throws RuntimeException if compression is explicitly enabled but the codec property is unset,
  *     names a class that cannot be found, or names a class that is not a {@link
  *     CompressionCodec}
  */
 @Override
 public void setStoreLocation(String location, Job job) throws IOException {
   log.debug("setStoreLocation({}, {})", location, job);
   job.getConfiguration().set("mapred.textoutputformat.separator", "");
   FileOutputFormat.setOutputPath(job, new Path(location));
   if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
     FileOutputFormat.setCompressOutput(job, true);
     String codec = job.getConfiguration().get("output.compression.codec");
     if (codec == null) {
       // Fail with a readable message instead of a bare NullPointerException from Class.forName.
       throw new RuntimeException(
           "output.compression.enabled is true but output.compression.codec is not set");
     }
     try {
       // asSubclass verifies the type up front, so no unchecked cast is required.
       FileOutputFormat.setOutputCompressorClass(
           job, Class.forName(codec).asSubclass(CompressionCodec.class));
     } catch (ClassNotFoundException e) {
       // Keep the original exception as the cause so the class-loading failure is diagnosable.
       throw new RuntimeException("Class not found: " + codec, e);
     } catch (ClassCastException e) {
       throw new RuntimeException("Not a compression codec: " + codec, e);
     }
   } else {
     if (location.endsWith(".bz2") || location.endsWith(".bz")) {
       FileOutputFormat.setCompressOutput(job, true);
       FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
     } else if (location.endsWith(".gz")) {
       FileOutputFormat.setCompressOutput(job, true);
       FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
     } else {
       FileOutputFormat.setCompressOutput(job, false);
     }
   }
 }
  /**
   * Sets up the max-temperature MapReduce job against a fixed input file and output directory,
   * runs it, and exits the JVM with status 0 when the job succeeds or 1 when it does not.
   *
   * @param args command-line arguments; not read by this method
   * @throws IllegalArgumentException declared for callers; not thrown explicitly here
   * @throws IOException if creating or running the job fails with an I/O error
   * @throws ClassNotFoundException propagated from running the job
   * @throws InterruptedException if the wait for job completion is interrupted
   */
  public static void main(String[] args)
      throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
    final Job maxTempJob = new Job();
    maxTempJob.setJarByClass(MaxTemperature.class);

    // Fixed locations: one gzipped NCDC station file in, one output directory out.
    final Path source =
        new Path(
            "/home/cloudera/hd/data/ncdc_tmp/ftp.ncdc.noaa.gov/pub/data/noaa/2000/719043-99999-2000.gz");
    final Path destination = new Path("/home/cloudera/hd/data/output");
    FileInputFormat.addInputPath(maxTempJob, source);
    FileOutputFormat.setOutputPath(maxTempJob, destination);

    // The reducer class is also used as the combiner.
    maxTempJob.setMapperClass(MaxTemperatureMapper.class);
    maxTempJob.setReducerClass(MaxTemperatureReducer.class);
    maxTempJob.setCombinerClass(MaxTemperatureReducer.class);

    maxTempJob.setOutputKeyClass(Text.class);
    maxTempJob.setOutputValueClass(IntWritable.class);

    // Write gzip-compressed output.
    FileOutputFormat.setCompressOutput(maxTempJob, true);
    FileOutputFormat.setOutputCompressorClass(maxTempJob, GzipCodec.class);

    final boolean succeeded = maxTempJob.waitForCompletion(true);
    final int exitCode;
    if (succeeded) {
      exitCode = 0;
    } else {
      exitCode = 1;
    }
    System.exit(exitCode);
  }
Пример #3
0
  /**
   * Creates the {@code Job} held by this object and applies every property that has been set.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>derive a {@code Configuration} from {@code configuration} and {@code properties} and
   *       pass it to {@code buildGenericOptions};
   *   <li>instantiate the job, as a proxy user on top of the login user when {@code user} has
   *       text, otherwise directly;
   *   <li>pick the class loader used to resolve class names, replacing it with one built from
   *       {@code jar} when a jar is set;
   *   <li>set mapper and reducer first (so their key/value types can be detected), then every
   *       other optional setting that is non-null / non-empty;
   *   <li>hand the finished job to {@code processJob}.
   * </ol>
   *
   * <p>Because explicit key/value classes are applied after the mapper/reducer type detection,
   * they are the last values written for those settings.
   *
   * @throws Exception if any step of the job construction or configuration fails
   */
  @SuppressWarnings("rawtypes")
  public void afterPropertiesSet() throws Exception {
    // Helper not visible here; presumably merges the base configuration with the extra properties.
    final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);

    buildGenericOptions(cfg);

    if (StringUtils.hasText(user)) {
      // Impersonate `user` on top of the current login user and create the job inside doAs.
      UserGroupInformation ugi =
          UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
      ugi.doAs(
          new PrivilegedExceptionAction<Void>() {

            @Override
            public Void run() throws Exception {
              job = new Job(cfg);
              return null;
            }
          });
    } else {
      job = new Job(cfg);
    }

    // Class loader for resolving the configured class names: the bean class loader when set,
    // otherwise Spring's default class loader.
    ClassLoader loader =
        (beanClassLoader != null
            ? beanClassLoader
            : org.springframework.util.ClassUtils.getDefaultClassLoader());

    if (jar != null) {
      // Assumes the job's configuration is a JobConf; the cast fails otherwise.
      JobConf conf = (JobConf) job.getConfiguration();
      conf.setJar(jar.getURI().toString());
      // NOTE(review): beanClassLoader (possibly null) is passed here rather than `loader`, which
      // already falls back to the default class loader. Confirm the helper tolerates null.
      loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
      conf.setClassLoader(loader);
    }

    // Mapper and reducer are set first so that their key/value types can be auto-detected,
    // which allows the explicit key/value type settings below to be omitted.
    if (mapper != null) {
      Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
      job.setMapperClass(mapperClass);
      configureMapperTypesIfPossible(job, mapperClass);
    }

    if (reducer != null) {
      Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
      job.setReducerClass(reducerClass);
      configureReducerTypesIfPossible(job, reducerClass);
    }

    // Remaining settings are optional; each is applied only when it has been provided.
    if (StringUtils.hasText(name)) {
      job.setJobName(name);
    }
    if (combiner != null) {
      job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
    }
    if (groupingComparator != null) {
      job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
    }
    if (inputFormat != null) {
      job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
    }
    if (mapKey != null) {
      job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
    }
    if (mapValue != null) {
      job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
    }
    if (numReduceTasks != null) {
      job.setNumReduceTasks(numReduceTasks);
    }
    if (key != null) {
      job.setOutputKeyClass(resolveClass(key, loader, Object.class));
    }
    if (value != null) {
      job.setOutputValueClass(resolveClass(value, loader, Object.class));
    }
    if (outputFormat != null) {
      job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
    }
    if (partitioner != null) {
      job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
    }
    if (sortComparator != null) {
      job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
    }
    if (StringUtils.hasText(workingDir)) {
      job.setWorkingDirectory(new Path(workingDir));
    }
    if (jarClass != null) {
      job.setJarByClass(jarClass);
    }

    // File-based input and output locations.
    if (!CollectionUtils.isEmpty(inputPaths)) {
      for (String path : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(path));
      }
    }

    if (StringUtils.hasText(outputPath)) {
      FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    // Output compression: the on/off flag and the codec are configured independently.
    if (compressOutput != null) {
      FileOutputFormat.setCompressOutput(job, compressOutput);
    }

    if (codecClass != null) {
      FileOutputFormat.setOutputCompressorClass(
          job, resolveClass(codecClass, loader, CompressionCodec.class));
    }

    // Final hook once the job is fully configured (implementation not visible here).
    processJob(job);
  }
Пример #4
0
  /**
   * Configures and runs the three chained jobs of the TF-IDF pipeline.
   *
   * <ol>
   *   <li>{@code TermWordCountPerDocument}: reads {@code enron/mann.avro}, writes {@code
   *       tfidf/step1};
   *   <li>{@code DocumentWordCount}: reads {@code tfidf/step1}, writes {@code tfidf/step2};
   *   <li>{@code DocumentCountAndTfIdf}: reads {@code tfidf/step2}, writes {@code tfidf/final}.
   * </ol>
   *
   * <p>Each output directory is deleted recursively before the jobs start, and every job writes
   * Snappy-compressed sequence files. The third job receives the total document count through
   * the {@code totalDocs} configuration entry, read from the {@code recordCount} metadata of the
   * Avro input file.
   *
   * <p>The jobs are executed one after another because each consumes the previous job's output;
   * execution stops at the first failure.
   *
   * @param args command-line arguments; not read by this method
   * @return 0 if all three jobs complete successfully, 1 if any of them fails
   * @throws Exception if configuring, cleaning up for, or running any job fails
   */
  @Override
  public int run(String[] args) throws Exception {
    Path input = new Path("enron/mann.avro");

    // Step 1: term counts per document.
    Job job1 = Job.getInstance(getConf(), "TermWordCountPerDocument");
    job1.setJarByClass(getClass());
    Configuration conf1 = job1.getConfiguration();
    FileInputFormat.setInputPaths(job1, input);
    Path out1 = new Path("tfidf/step1");
    out1.getFileSystem(conf1).delete(out1, true);
    FileOutputFormat.setOutputPath(job1, out1);
    FileOutputFormat.setOutputCompressorClass(job1, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job1, true);

    job1.setMapperClass(TermWordCountPerDocumentMapper.class);
    job1.setReducerClass(IntSumReducer.class);
    job1.setInputFormatClass(AvroKeyInputFormat.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(IntWritable.class);

    // Step 2: consumes the output of step 1.
    Job job2 = Job.getInstance(getConf(), "DocumentWordCount");
    job2.setJarByClass(getClass());
    Configuration conf2 = job2.getConfiguration();
    FileInputFormat.setInputPaths(job2, out1);
    Path out2 = new Path("tfidf/step2");
    out2.getFileSystem(conf2).delete(out2, true);
    FileOutputFormat.setOutputPath(job2, out2);
    FileOutputFormat.setOutputCompressorClass(job2, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job2, true);

    job2.setMapperClass(DocumentWordCountMapper.class);
    job2.setReducerClass(DocumentWordCountReducer.class);
    job2.setInputFormatClass(SequenceFileInputFormat.class);
    job2.setOutputFormatClass(SequenceFileOutputFormat.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);

    // Step 3: consumes the output of step 2.
    Job job3 = Job.getInstance(getConf(), "DocumentCountAndTfIdf");
    job3.setJarByClass(getClass());
    Configuration conf3 = job3.getConfiguration();
    FileInputFormat.setInputPaths(job3, out2);
    Path out3 = new Path("tfidf/final");
    out3.getFileSystem(conf3).delete(out3, true);
    FileOutputFormat.setOutputPath(job3, out3);
    FileOutputFormat.setOutputCompressorClass(job3, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job3, true);

    // Get the total document count from the Avro file metadata. The reader is closed in a
    // finally block so it is released even when the metadata lookup throws.
    DataFileReader<Object> reader =
        new DataFileReader<Object>(new FsInput(input, conf3), new GenericDatumReader<Object>());
    try {
      conf3.setLong("totalDocs", reader.getMetaLong("recordCount"));
    } finally {
      reader.close();
    }

    job3.setMapperClass(TermDocumentCountMapper.class);
    job3.setReducerClass(TfIdfReducer.class);
    job3.setInputFormatClass(SequenceFileInputFormat.class);
    job3.setOutputFormatClass(SequenceFileOutputFormat.class);
    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);

    // Run the pipeline in order; && short-circuits so a failed step stops the chain.
    boolean succeeded =
        job1.waitForCompletion(true)
            && job2.waitForCompletion(true)
            && job3.waitForCompletion(true);
    return succeeded ? 0 : 1;
  }