Example #1
 @SuppressWarnings("unchecked")
 @Override
 public void setStoreLocation(String location, Job job) throws IOException {
   log.debug("setStoreLocation({}, {})", location, job);
   job.getConfiguration().set("mapred.textoutputformat.separator", "");
   FileOutputFormat.setOutputPath(job, new Path(location));
   if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
     FileOutputFormat.setCompressOutput(job, true);
     String codec = job.getConfiguration().get("output.compression.codec");
     try {
       FileOutputFormat.setOutputCompressorClass(
           job, (Class<? extends CompressionCodec>) Class.forName(codec));
     } catch (ClassNotFoundException e) {
        throw new RuntimeException("Class not found: " + codec, e);
     }
   } else {
     if (location.endsWith(".bz2") || location.endsWith(".bz")) {
       FileOutputFormat.setCompressOutput(job, true);
       FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
     } else if (location.endsWith(".gz")) {
       FileOutputFormat.setCompressOutput(job, true);
       FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
     } else {
       FileOutputFormat.setCompressOutput(job, false);
     }
   }
 }
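
Example #1 drives codec selection from two custom configuration keys, falling back to picking a codec from the file extension when compression is not explicitly enabled. A minimal sketch of how a caller might set those keys before storing; only the two key names come from the example above, while the class name and job name here are illustrative:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class StoreCompressionConfig {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Keys read by setStoreLocation() above; the codec class must be on the classpath.
    conf.set("output.compression.enabled", "true");
    conf.set("output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    Job job = Job.getInstance(conf, "store-with-explicit-codec");
    // ... configure the store location and the rest of the job as usual.
  }
}
```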
Example #2
  public Job getJob(Configuration conf) throws IOException {
    Job job = new Job(conf, "pivoting");

    job.setJarByClass(PivotingReducer.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(PivotingReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(MapWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setPartitionerClass(RuleWritable.SourcePartitioner.class);

    FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected"));
    int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
    if (maxSplitSize != 0) FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);

    int numReducers = conf.getInt("thrax.reducers", 4);
    job.setNumReduceTasks(numReducers);

    FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted"));
    FileOutputFormat.setCompressOutput(job, true);

    return job;
  }
Example #3
  /** Runs this tool. */
  public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options =
        DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
      return -1;
    }

    // Temp directory.
    String tmpDir =
        "tmp-" + TrecDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TrecDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job =
        new Job(
            getConf(), TrecDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TrecDocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(options.collection));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
      job.waitForCompletion(true);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    String input = tmpDir + (tmpDir.endsWith("/") ? "" : "/") + "part-r-00000";
    TrecDocnoMapping.writeMappingData(
        new Path(input), new Path(options.docnoMapping), FileSystem.get(getConf()));

    fs.delete(new Path(tmpDir), true);

    return 0;
  }
Example #4
  /** Runs this tool. */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];

    LOG.info("Tool: " + Aquaint2DocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    Job job = new Job(getConf(), Aquaint2DocnoMappingBuilder.class.getSimpleName());
    job.setJarByClass(Aquaint2DocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(Aquaint2DocumentInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    String input = outputPath + (outputPath.endsWith("/") ? "" : "/") + "part-r-00000";
    Aquaint2DocnoMapping.writeDocnoData(
        new Path(input), new Path(outputFile), FileSystem.get(getConf()));

    return 0;
  }
Example #5
  public static void main(String[] args)
      throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job();
    job.setJarByClass(MaxTemperature.class);

    String inputPath =
        "/home/cloudera/hd/data/ncdc_tmp/ftp.ncdc.noaa.gov/pub/data/noaa/2000/719043-99999-2000.gz";
    String outputPath = "/home/cloudera/hd/data/output";

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setCombinerClass(MaxTemperatureReducer.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
Example #6
  @SuppressWarnings("rawtypes")
  public void afterPropertiesSet() throws Exception {
    final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);

    buildGenericOptions(cfg);

    if (StringUtils.hasText(user)) {
      UserGroupInformation ugi =
          UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
      ugi.doAs(
          new PrivilegedExceptionAction<Void>() {

            @Override
            public Void run() throws Exception {
              job = new Job(cfg);
              return null;
            }
          });
    } else {
      job = new Job(cfg);
    }

    ClassLoader loader =
        (beanClassLoader != null
            ? beanClassLoader
            : org.springframework.util.ClassUtils.getDefaultClassLoader());

    if (jar != null) {
      JobConf conf = (JobConf) job.getConfiguration();
      conf.setJar(jar.getURI().toString());
      loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
      conf.setClassLoader(loader);
    }

    // set the mapper/reducer first so the K/V types can be auto-detected
    // and do not have to be specified explicitly
    if (mapper != null) {
      Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
      job.setMapperClass(mapperClass);
      configureMapperTypesIfPossible(job, mapperClass);
    }

    if (reducer != null) {
      Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
      job.setReducerClass(reducerClass);
      configureReducerTypesIfPossible(job, reducerClass);
    }

    if (StringUtils.hasText(name)) {
      job.setJobName(name);
    }
    if (combiner != null) {
      job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
    }
    if (groupingComparator != null) {
      job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
    }
    if (inputFormat != null) {
      job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
    }
    if (mapKey != null) {
      job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
    }
    if (mapValue != null) {
      job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
    }
    if (numReduceTasks != null) {
      job.setNumReduceTasks(numReduceTasks);
    }
    if (key != null) {
      job.setOutputKeyClass(resolveClass(key, loader, Object.class));
    }
    if (value != null) {
      job.setOutputValueClass(resolveClass(value, loader, Object.class));
    }
    if (outputFormat != null) {
      job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
    }
    if (partitioner != null) {
      job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
    }
    if (sortComparator != null) {
      job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
    }
    if (StringUtils.hasText(workingDir)) {
      job.setWorkingDirectory(new Path(workingDir));
    }
    if (jarClass != null) {
      job.setJarByClass(jarClass);
    }

    if (!CollectionUtils.isEmpty(inputPaths)) {
      for (String path : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(path));
      }
    }

    if (StringUtils.hasText(outputPath)) {
      FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    if (compressOutput != null) {
      FileOutputFormat.setCompressOutput(job, compressOutput);
    }

    if (codecClass != null) {
      FileOutputFormat.setOutputCompressorClass(
          job, resolveClass(codecClass, loader, CompressionCodec.class));
    }

    processJob(job);
  }
Example #7
  /**
   * Implementation of the Tool.run() method, which builds and runs the Hadoop job.
   *
   * @param args command line parameters, less common Hadoop job parameters stripped out and
   *     interpreted by the Tool class.
   * @return 0 if the Hadoop job completes successfully, 1 if not.
   */
  @Override
  public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    String configFile = null;
    boolean overwrite = false;
    int numReducers = 60;

    // Read the command-line arguments. We're not using GenericOptionsParser
    // to avoid a dependency on commons-cli.
    for (int i = 0; i < args.length; i++) {
      try {
        if (args[i].equals(ARGNAME_INPATH)) {
          inputPath = args[++i];
        } else if (args[i].equals(ARGNAME_OUTPATH)) {
          outputPath = args[++i];
        } else if (args[i].equals(ARGNAME_CONF)) {
          configFile = args[++i];
        } else if (args[i].equals(ARGNAME_MAXFILES)) {
          SampleFilter.setMax(Long.parseLong(args[++i]));
        } else if (args[i].equals(ARGNAME_OVERWRITE)) {
          overwrite = true;
        } else if (args[i].equals(ARGNAME_NUMREDUCE)) {
          numReducers = Integer.parseInt(args[++i]);
        } else {
          LOG.warn("Unsupported argument: " + args[i]);
        }
      } catch (ArrayIndexOutOfBoundsException e) {
        usage();
        throw new IllegalArgumentException();
      }
    }

    if (inputPath == null || outputPath == null) {
      usage();
      throw new IllegalArgumentException();
    }

    // Read in any additional config parameters.
    if (configFile != null) {
      LOG.info("adding config parameters from '" + configFile + "'");
      this.getConf().addResource(configFile);
    }

    // Create the Hadoop job.
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJarByClass(BigramFinder.class);
    job.setNumReduceTasks(numReducers);

    // Scan the provided input path for ARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    SampleFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputPathFilter(job, SampleFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
      LOG.info("clearing the output path at '" + outputPath + "'");
      FileSystem fs = FileSystem.get(new URI(outputPath), conf);
      if (fs.exists(new Path(outputPath))) {
        fs.delete(new Path(outputPath), true);
      }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormatClass(ArcInputFormat.class); // SequenceFileInputFormat.class

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(BigramFinderMapper.class);
    job.setReducerClass(PairGeneratorSumReducer.class);
    job.setCombinerClass(LongSumReducer.class);

    // Set the name of the job.
    job.setJobName("Norvig Award - Evil Bigram Finder");

    if (job.waitForCompletion(true)) {
      return 0;
    } else {
      return 1;
    }
  }
Example #8
 public static void enableCompression(Job j, SequenceFile.CompressionType type) {
   Configuration conf = j.getConfiguration();
   conf.setBoolean("mapred.compress.map.output", true);
   FileOutputFormat.setCompressOutput(j, true);
   SequenceFileOutputFormat.setOutputCompressionType(j, type);
 }
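
A minimal usage sketch for this helper, assuming a standard Hadoop 2.x setup; the job name is illustrative. Note that `mapred.compress.map.output` in the original is the deprecated pre-2.x alias of `mapreduce.map.output.compress`, used below:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class EnableCompressionUsage {
  // Same helper as Example #8, repeated so the sketch compiles on its own.
  public static void enableCompression(Job j, SequenceFile.CompressionType type) {
    Configuration conf = j.getConfiguration();
    // Compress intermediate map output as well as the final output.
    conf.setBoolean("mapreduce.map.output.compress", true);
    FileOutputFormat.setCompressOutput(j, true);
    SequenceFileOutputFormat.setOutputCompressionType(j, type);
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "compressed-sequencefile-output");
    enableCompression(job, SequenceFile.CompressionType.BLOCK);
    // ... set input/output formats, mapper/reducer, and paths, then submit.
  }
}
```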
Example #9
  @Override
  public int run(String[] args) throws Exception {
    Job job1 = Job.getInstance(getConf(), "TermWordCountPerDocument");
    job1.setJarByClass(getClass());
    Configuration conf1 = job1.getConfiguration();
    FileInputFormat.setInputPaths(job1, new Path("enron/mann.avro"));
    Path out1 = new Path("tfidf/step1");
    out1.getFileSystem(conf1).delete(out1, true);
    FileOutputFormat.setOutputPath(job1, out1);
    FileOutputFormat.setOutputCompressorClass(job1, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job1, true);

    job1.setMapperClass(TermWordCountPerDocumentMapper.class);
    job1.setReducerClass(IntSumReducer.class);
    job1.setInputFormatClass(AvroKeyInputFormat.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(IntWritable.class);

    Job job2 = Job.getInstance(getConf(), "DocumentWordCount");
    job2.setJarByClass(getClass());
    Configuration conf2 = job2.getConfiguration();
    FileInputFormat.setInputPaths(job2, new Path("tfidf/step1"));
    Path out2 = new Path("tfidf/step2");
    out2.getFileSystem(conf2).delete(out2, true);
    FileOutputFormat.setOutputPath(job2, out2);
    FileOutputFormat.setOutputCompressorClass(job2, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job2, true);

    job2.setMapperClass(DocumentWordCountMapper.class);
    job2.setReducerClass(DocumentWordCountReducer.class);
    job2.setInputFormatClass(SequenceFileInputFormat.class);
    job2.setOutputFormatClass(SequenceFileOutputFormat.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);

    Job job3 = Job.getInstance(getConf(), "DocumentCountAndTfIdf");
    job3.setJarByClass(getClass());
    Configuration conf3 = job3.getConfiguration();
    FileInputFormat.setInputPaths(job3, new Path("tfidf/step2"));
    Path out3 = new Path("tfidf/final");
    out3.getFileSystem(conf3).delete(out3, true);
    FileOutputFormat.setOutputPath(job3, out3);
    FileOutputFormat.setOutputCompressorClass(job3, SnappyCodec.class);
    FileOutputFormat.setCompressOutput(job3, true);

    // Get the total document count from the Avro file metadata
    DataFileReader<Object> reader =
        new DataFileReader<Object>(
            new FsInput(new Path("enron/mann.avro"), conf3), new GenericDatumReader<Object>());
    conf3.setLong("totalDocs", reader.getMetaLong("recordCount"));
    reader.close();

    job3.setMapperClass(TermDocumentCountMapper.class);
    job3.setReducerClass(TfIdfReducer.class);
    job3.setInputFormatClass(SequenceFileInputFormat.class);
    job3.setOutputFormatClass(SequenceFileOutputFormat.class);
    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);

    // Chain the three jobs; each stage consumes the previous stage's output.
    if (!job1.waitForCompletion(true)) {
      return 1;
    }
    if (!job2.waitForCompletion(true)) {
      return 1;
    }
    return job3.waitForCompletion(true) ? 0 : 1;
  }
Example #10
  @SuppressWarnings("unchecked")
  public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass =
        MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass =
        MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs =
        MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget =
        MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    for (FileStatus file : files) {
      if (!file.getPath().getName().endsWith(".examples")) {
        continue;
      }

      conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

      sLogger.info("Tool name: ExtractGlobalStats");
      sLogger.info(" - Input path: " + inputPath);
      sLogger.info(" - Examples path: " + file.getPath());
      sLogger.info(" - Example split: " + split);
      sLogger.info(" - Corpus path: " + corpusPath);
      sLogger.info(" - Corpus class: " + corpusClass);
      sLogger.info(" - Extractor class: " + extractorClass);
      sLogger.info(" - Extractor args: " + extractorArgs);
      sLogger.info(" - Extractor target: " + extractorTarget);
      sLogger.info(" - Output path: " + outputPath);

      Job job = new Job(conf);
      job.setJobName("ExtractGlobalStats");
      job.setJarByClass(ExtractGlobalStats.class);

      MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
      FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));

      job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
      job.setOutputFormatClass(SequenceFileOutputFormat.class);

      FileOutputFormat.setCompressOutput(job, true);
      SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

      job.setMapOutputKeyClass(ContextPatternWritable.class);
      job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
      job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
      job.setMapOutputValueClass(ContextPatternStatsWritable.class);

      job.setOutputKeyClass(ContextPatternWritable.class);
      job.setOutputValueClass(ContextPatternStatsWritable.class);

      job.setMapperClass(MyMapper.class);
      job.setReducerClass(MyReducer.class);

      job.waitForCompletion(true);

      split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/../split");

    return 0;
  }