Ejemplo n.º 1
0
 /**
  * Configures and runs the single "inject" job over the seed directory.
  *
  * <p>The seed directory is read from {@code args} under {@link Nutch#ARG_SEEDDIR}; it may be
  * supplied either as a {@link Path} or as any object whose {@code toString()} yields a path.
  *
  * @param args job arguments; must contain a non-null value for {@code Nutch.ARG_SEEDDIR}
  * @return the {@code results} map after the job status has been recorded into it
  * @throws IllegalArgumentException if no seed directory was supplied
  * @throws Exception if configuring or running the job fails
  */
 public Map<String, Object> run(Map<String, Object> args) throws Exception {
   // Stamp the configuration with the current wall-clock time before the job is created.
   getConf().setLong("injector.current.time", System.currentTimeMillis());

   // Fail fast with a descriptive message instead of a bare NullPointerException
   // from path.toString() when the seed directory argument is missing.
   Object path = args.get(Nutch.ARG_SEEDDIR);
   if (path == null) {
     throw new IllegalArgumentException(
         "Missing required argument: " + Nutch.ARG_SEEDDIR + " (seed directory)");
   }
   Path input = (path instanceof Path) ? (Path) path : new Path(path.toString());

   // Exactly one job is run by this tool.
   numJobs = 1;
   currentJobNum = 0;
   currentJob = new NutchJob(getConf(), "inject " + input);
   FileInputFormat.addInputPath(currentJob, input);
   currentJob.setMapperClass(UrlMapper.class);
   currentJob.setMapOutputKeyClass(String.class);
   currentJob.setMapOutputValueClass(WebPage.class);
   currentJob.setOutputFormatClass(GoraOutputFormat.class);

   // Output goes to the web store created from the job's own configuration.
   DataStore<String, WebPage> store =
       StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
   GoraOutputFormat.setOutput(currentJob, store, true);

   // A reducer class is set, but the job is configured with zero reduce tasks.
   currentJob.setReducerClass(Reducer.class);
   currentJob.setNumReduceTasks(0);

   // The completion flag is intentionally not checked here; the job itself is handed
   // to ToolUtil.recordJobStatus, which fills in the results map.
   currentJob.waitForCompletion(true);
   ToolUtil.recordJobStatus(null, currentJob, results);
   return results;
  }
Ejemplo n.º 2
0
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    getConf().setLong("injector.current.time", System.currentTimeMillis());
    Path input;
    Object path = args.get(Nutch.ARG_SEEDDIR);
    if (path instanceof Path) {
      input = (Path) path;
    } else {
      input = new Path(path.toString());
    }
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "inject " + input);
    FileInputFormat.addInputPath(currentJob, input);
    currentJob.setMapperClass(UrlMapper.class);
    currentJob.setMapOutputKeyClass(String.class);
    currentJob.setMapOutputValueClass(WebPage.class);
    currentJob.setOutputFormatClass(GoraOutputFormat.class);

    DataStore<String, WebPage> store =
        StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
    GoraOutputFormat.setOutput(currentJob, store, true);

    // NUTCH-1471 Make explicit which datastore class we use
    Class<? extends DataStore<Object, Persistent>> dataStoreClass =
        StorageUtils.getDataStoreClass(currentJob.getConfiguration());
    LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");

    currentJob.setReducerClass(Reducer.class);
    currentJob.setNumReduceTasks(0);

    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);

    // NUTCH-1370 Make explicit #URLs injected @runtime
    long urlsInjected =
        currentJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered =
        currentJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "InjectorJob: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    return results;
  }