Ejemplo n.º 1
0
  /**
   * Configures and runs the generate job, then reports its outcome.
   *
   * <p>Optional entries read from {@code args}: {@code Nutch.ARG_BATCH} (String),
   * {@code Nutch.ARG_TOPN} (Long), {@code Nutch.ARG_CURTIME} (Long; the current system time is
   * used when absent), {@code Nutch.ARG_FILTER} and {@code Nutch.ARG_NORMALIZE} (Boolean). Apart
   * from the current time, an absent entry leaves the matching configuration key untouched.
   *
   * @param args job arguments keyed by the {@code Nutch.ARG_*} constants
   * @return the {@code results} map, extended with the job status, the configured batch id under
   *     {@code BATCH_ID} and the "GENERATE_MARK" counter value under {@code GENERATE_COUNT}
   * @throws Exception if configuring or running the job fails
   */
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    // An explicitly supplied batch id overrides whatever is already configured.
    final String requestedBatch = (String) args.get(Nutch.ARG_BATCH);
    if (requestedBatch != null) {
      getConf().set(GeneratorJob.BATCH_ID, requestedBatch);
    }

    // Collect the remaining arguments before touching the configuration.
    final Long limit = (Long) args.get(Nutch.ARG_TOPN);
    Long referenceTime = (Long) args.get(Nutch.ARG_CURTIME);
    if (referenceTime == null) {
      referenceTime = System.currentTimeMillis();
    }
    final Boolean applyFilters = (Boolean) args.get(Nutch.ARG_FILTER);
    final Boolean applyNormalizers = (Boolean) args.get(Nutch.ARG_NORMALIZE);

    // map to inverted subset due for fetch, sort by score
    getConf().setLong(GENERATOR_CUR_TIME, referenceTime);
    if (limit != null) {
      getConf().setLong(GENERATOR_TOP_N, limit);
    }
    if (applyFilters != null) {
      getConf().setBoolean(GENERATOR_FILTER, applyFilters);
    }
    getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
    if (applyNormalizers != null) {
      getConf().setBoolean(GENERATOR_NORMALISE, applyNormalizers);
    }

    // Derive the URL partitioning mode from the count mode; unknown values fall back to host.
    final String countMode = getConf().get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
    final String partitionMode;
    if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_DOMAIN;
    } else {
      LOG.warn(
          "Unknown generator.max.count mode '" + countMode + "', using mode="
              + GENERATOR_COUNT_VALUE_HOST);
      getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    }
    getConf().set(URLPartitioner.PARTITION_MODE_KEY, partitionMode);

    // A single job: mapper selects entries, reducer is GeneratorReducer.
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
    final Collection<WebPage.Field> requiredFields = getFields(currentJob);
    StorageUtils.initMapperJob(
        currentJob,
        requiredFields,
        SelectorEntry.class,
        WebPage.class,
        GeneratorMapper.class,
        SelectorEntryPartitioner.class,
        true);
    StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
    currentJob.waitForCompletion(true);

    // Publish job status, the batch id and the generated-URL counter to the caller.
    ToolUtil.recordJobStatus(null, currentJob, results);
    results.put(BATCH_ID, getConf().get(BATCH_ID));
    final long generated =
        currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue();
    results.put(GENERATE_COUNT, generated);
    return results;
  }
Ejemplo n.º 2
0
 /**
  * Configures and runs the inject job that reads seed URLs from the given directory.
  *
  * <p>The seed location is taken from {@code args} under {@code Nutch.ARG_SEEDDIR}; it may be a
  * {@code Path} or any object whose {@code toString()} yields a path string.
  *
  * @param args job arguments; must contain a non-null {@code Nutch.ARG_SEEDDIR} entry
  * @return the {@code results} map after the job status has been recorded in it
  * @throws IllegalArgumentException if no seed directory was supplied
  * @throws Exception if configuring or running the job fails
  */
 public Map<String, Object> run(Map<String, Object> args) throws Exception {
   getConf().setLong("injector.current.time", System.currentTimeMillis());

   // Fail with a descriptive error instead of a bare NullPointerException on path.toString().
   Object path = args.get(Nutch.ARG_SEEDDIR);
   if (path == null) {
     throw new IllegalArgumentException(
         "Missing required argument '" + Nutch.ARG_SEEDDIR + "' (seed directory)");
   }
   Path input;
   if (path instanceof Path) {
     input = (Path) path;
   } else {
     input = new Path(path.toString());
   }

   numJobs = 1;
   currentJobNum = 0;
   currentJob = new NutchJob(getConf(), "inject " + input);
   FileInputFormat.addInputPath(currentJob, input);
   currentJob.setMapperClass(UrlMapper.class);
   currentJob.setMapOutputKeyClass(String.class);
   currentJob.setMapOutputValueClass(WebPage.class);
   currentJob.setOutputFormatClass(GoraOutputFormat.class);
   DataStore<String, WebPage> store =
       StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
   GoraOutputFormat.setOutput(currentJob, store, true);

   // The job is configured with zero reduce tasks.
   currentJob.setReducerClass(Reducer.class);
   currentJob.setNumReduceTasks(0);
   currentJob.waitForCompletion(true);
   ToolUtil.recordJobStatus(null, currentJob, results);
   return results;
 }
Ejemplo n.º 3
0
  /**
   * Configures and runs the inject job that reads seed URLs from the given directory, then logs
   * the datastore class in use and the injected/filtered URL counters.
   *
   * <p>The seed location is taken from {@code args} under {@code Nutch.ARG_SEEDDIR}; it may be a
   * {@code Path} or any object whose {@code toString()} yields a path string.
   *
   * @param args job arguments; must contain a non-null {@code Nutch.ARG_SEEDDIR} entry
   * @return the {@code results} map after the job status has been recorded in it
   * @throws IllegalArgumentException if no seed directory was supplied
   * @throws Exception if configuring or running the job fails
   */
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    getConf().setLong("injector.current.time", System.currentTimeMillis());

    // Fail with a descriptive error instead of a bare NullPointerException on path.toString().
    Object path = args.get(Nutch.ARG_SEEDDIR);
    if (path == null) {
      throw new IllegalArgumentException(
          "Missing required argument '" + Nutch.ARG_SEEDDIR + "' (seed directory)");
    }
    Path input;
    if (path instanceof Path) {
      input = (Path) path;
    } else {
      input = new Path(path.toString());
    }

    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "inject " + input);
    FileInputFormat.addInputPath(currentJob, input);
    currentJob.setMapperClass(UrlMapper.class);
    currentJob.setMapOutputKeyClass(String.class);
    currentJob.setMapOutputValueClass(WebPage.class);
    currentJob.setOutputFormatClass(GoraOutputFormat.class);

    DataStore<String, WebPage> store =
        StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
    GoraOutputFormat.setOutput(currentJob, store, true);

    // NUTCH-1471 Make explicit which datastore class we use
    Class<? extends DataStore<Object, Persistent>> dataStoreClass =
        StorageUtils.getDataStoreClass(currentJob.getConfiguration());
    LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");

    // The job is configured with zero reduce tasks.
    currentJob.setReducerClass(Reducer.class);
    currentJob.setNumReduceTasks(0);

    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);

    // NUTCH-1370 Make explicit #URLs injected @runtime
    long urlsInjected =
        currentJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered =
        currentJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "InjectorJob: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    return results;
  }