Example #1
0
  /**
   * Configures and runs a single generate job and returns the shared results map.
   *
   * <p>Optional settings are read from {@code args} under {@code Nutch.ARG_BATCH} (String),
   * {@code Nutch.ARG_TOPN} (Long), {@code Nutch.ARG_CURTIME} (Long), {@code Nutch.ARG_FILTER}
   * (Boolean) and {@code Nutch.ARG_NORMALIZE} (Boolean). Entries that are absent leave the
   * corresponding configuration property untouched, except for the current time, which falls
   * back to {@code System.currentTimeMillis()}.
   *
   * @param args job arguments; values are cast to the types listed above
   * @return the {@code results} map, with {@code BATCH_ID} and {@code GENERATE_COUNT} added
   * @throws Exception if job setup or execution fails; a value of an unexpected type in
   *     {@code args} surfaces as a {@code ClassCastException}
   */
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    final String requestedBatch = (String) args.get(Nutch.ARG_BATCH);
    if (requestedBatch != null) {
      getConf().set(GeneratorJob.BATCH_ID, requestedBatch);
    }

    // Read (and type-check via cast) every optional argument before writing further settings.
    final Long limit = (Long) args.get(Nutch.ARG_TOPN);
    final Long requestedTime = (Long) args.get(Nutch.ARG_CURTIME);
    final long effectiveTime =
        (requestedTime == null) ? System.currentTimeMillis() : requestedTime.longValue();
    final Boolean applyFilters = (Boolean) args.get(Nutch.ARG_FILTER);
    final Boolean applyNormalizers = (Boolean) args.get(Nutch.ARG_NORMALIZE);

    getConf().setLong(GENERATOR_CUR_TIME, effectiveTime);
    if (limit != null) {
      getConf().setLong(GENERATOR_TOP_N, limit);
    }
    if (applyFilters != null) {
      getConf().setBoolean(GENERATOR_FILTER, applyFilters);
    }
    // The generate timestamp is always the wall clock, independent of ARG_CURTIME.
    getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
    if (applyNormalizers != null) {
      getConf().setBoolean(GENERATOR_NORMALISE, applyNormalizers);
    }

    // Derive the partition mode from the configured count mode (host is the default).
    final String countMode = getConf().get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
    final String partitionMode;
    if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_DOMAIN;
    } else {
      // Unrecognised value: warn, rewrite the count mode to host, and partition by host.
      LOG.warn(
          "Unknown generator.max.count mode '"
              + countMode
              + "', using mode="
              + GENERATOR_COUNT_VALUE_HOST);
      getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    }
    getConf().set(URLPartitioner.PARTITION_MODE_KEY, partitionMode);

    // Exactly one job is run by this method.
    numJobs = 1;
    currentJobNum = 0;
    final String jobName = "generate: " + getConf().get(BATCH_ID);
    currentJob = new NutchJob(getConf(), jobName);

    final Collection<WebPage.Field> pageFields = getFields(currentJob);
    StorageUtils.initMapperJob(
        currentJob,
        pageFields,
        SelectorEntry.class,
        WebPage.class,
        GeneratorMapper.class,
        SelectorEntryPartitioner.class,
        true);
    StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);

    // The value returned by waitForCompletion is not inspected here; the job's status is
    // recorded into the results map by ToolUtil.recordJobStatus instead.
    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);

    results.put(BATCH_ID, getConf().get(BATCH_ID));
    final long generated =
        currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue();
    results.put(GENERATE_COUNT, generated);
    return results;
  }