Esempio n. 1
0
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] remainArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (remainArgs.length != 2) {
      System.err.println("Usage: wordcount <input> <output>");
      System.exit(1);
    }

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(4);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileSystem.get(conf).delete(new Path(remainArgs[1]), true);

    FileInputFormat.setInputPaths(job, new Path(remainArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(remainArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    // vv MaxTemperatureDriverV6
    conf.setProfileEnabled(true);
    conf.setProfileParams(
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
    conf.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(conf);
    return 0;
  }
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      System.exit(2);
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath);
    Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, input_schema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
      return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
  }
 private Job configureJob(Path secretsPath, Path saltFilePath, Path inputPath, Path outputPath)
     throws Exception {
   Job job = Job.getInstance(getConf());
   FileInputFormat.setInputPaths(job, inputPath);
   FileOutputFormat.setOutputPath(job, outputPath);
   job.getConfiguration().set(ObfuscateMapper.SECRET_WORDS_FILE_KEY, secretsPath.toString());
   job.getConfiguration().set(ObfuscateMapper.SALT_FILE_KEY, saltFilePath.toString());
   job.setInputFormatClass(TextInputFormat.class);
   job.setMapperClass(ObfuscateMapper.class);
   job.setNumReduceTasks(0);
   job.setJarByClass(getClass());
   FileSystem.get(outputPath.toUri(), getConf()).delete(outputPath, true);
   return job;
 }
Esempio n. 7
0
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(HadoopNBFilter.class);
    job.setJobName("hadoopnbfilter");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean jobCompleted = job.waitForCompletion(true);
    return jobCompleted ? 0 : 1;
  }
Esempio n. 8
0
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(Add1.class);
    conf.setJobName("sumar1");

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Esempio n. 9
0
  public int run(String[] args) throws Exception {
    Path tempDir = new Path("/user/akhfa/temp");

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(AuthorCounter.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, tempDir);
    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;
  }
Esempio n. 10
0
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Esempio n. 11
0
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), XiangLi1_exercise3.class);
    conf.setJobName("xiangli1_exercise3");
    conf.setNumReduceTasks(0);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    // conf.setCombinerClass(Reduce.class);
    // conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
  }
Esempio n. 12
0
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    JobConf job = new JobConf(conf);
    job.setJarByClass(Jacobi.class);

    fs.delete(new Path("curX"), true);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path("preX"));
    FileOutputFormat.setOutputPath(job, new Path("curX"));

    JobClient.runJob(job);
    return 1;
  }
Esempio n. 13
0
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), Sqrt2.class);
    conf.setJobName("sqrt2");

    conf.setOutputKeyClass(DoubleWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    /*conf.setCombinerClass(Reduce.class);*/
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
  }
  @Override
  public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJarByClass(ElimiateRepeat.class);
    job.setJobName("ElimiateRepeat");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path("file0*"));
    FileOutputFormat.setOutputPath(job, new Path("elimiateRepeat"));

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
  }