Пример #1
0
 /**
  * Configures and runs the inject job over the seed directory supplied in {@code args}.
  *
  * <p>The seed directory is read from {@code args} under {@link Nutch#ARG_SEEDDIR}. It may be
  * given either as a {@code Path} or as any object whose {@code toString()} yields the path.
  * The job reads that directory, maps it with {@code UrlMapper}, and writes
  * {@code String -> WebPage} pairs through {@code GoraOutputFormat} into the store returned by
  * {@code StorageUtils.createWebStore}.
  *
  * @param args argument map; must contain a non-null entry for {@link Nutch#ARG_SEEDDIR}
  * @return the {@code results} map after the job status has been recorded into it
  * @throws IllegalArgumentException if the seed directory argument is missing
  * @throws Exception if job setup or execution fails
  */
 public Map<String, Object> run(Map<String, Object> args) throws Exception {
   // Validate up front: previously a missing seed directory surfaced as an anonymous
   // NullPointerException from path.toString().
   Object path = args.get(Nutch.ARG_SEEDDIR);
   if (path == null) {
     throw new IllegalArgumentException(
         "Missing required argument: " + Nutch.ARG_SEEDDIR + " (seed directory)");
   }
   Path input;
   if (path instanceof Path) {
     input = (Path) path;
   } else {
     input = new Path(path.toString());
   }

   // Timestamp published to the job configuration under "injector.current.time".
   getConf().setLong("injector.current.time", System.currentTimeMillis());

   // This tool runs exactly one job; reset the progress-tracking fields accordingly.
   numJobs = 1;
   currentJobNum = 0;
   currentJob = new NutchJob(getConf(), "inject " + input);
   FileInputFormat.addInputPath(currentJob, input);
   currentJob.setMapperClass(UrlMapper.class);
   currentJob.setMapOutputKeyClass(String.class);
   currentJob.setMapOutputValueClass(WebPage.class);
   currentJob.setOutputFormatClass(GoraOutputFormat.class);
   DataStore<String, WebPage> store =
       StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
   GoraOutputFormat.setOutput(currentJob, store, true);
   // A reducer class is registered, but the number of reduce tasks is set to zero.
   currentJob.setReducerClass(Reducer.class);
   currentJob.setNumReduceTasks(0);
   currentJob.waitForCompletion(true);
   ToolUtil.recordJobStatus(null, currentJob, results);
   return results;
 }
Пример #2
0
  /**
   * Configures and runs the generate job, then reports the batch id and the number of
   * generated URLs.
   *
   * <p>Recognised entries of {@code args} (all optional): {@link Nutch#ARG_BATCH},
   * {@link Nutch#ARG_TOPN}, {@link Nutch#ARG_CURTIME}, {@link Nutch#ARG_FILTER} and
   * {@link Nutch#ARG_NORMALIZE}. A missing current time defaults to the system clock; every
   * other missing entry leaves the corresponding configuration value as it is.
   *
   * @param args argument map holding the optional settings listed above
   * @return the {@code results} map, extended with the batch id and the generate count
   * @throws Exception if job setup or execution fails
   */
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    final String requestedBatch = (String) args.get(Nutch.ARG_BATCH);
    if (requestedBatch != null) {
      getConf().set(GeneratorJob.BATCH_ID, requestedBatch);
    }

    // Pull the remaining optional arguments out of the map.
    final Long limit = (Long) args.get(Nutch.ARG_TOPN);
    final Long requestedTime = (Long) args.get(Nutch.ARG_CURTIME);
    final long effectiveTime =
        (requestedTime != null) ? requestedTime : System.currentTimeMillis();
    final Boolean applyFilters = (Boolean) args.get(Nutch.ARG_FILTER);
    final Boolean applyNormalizers = (Boolean) args.get(Nutch.ARG_NORMALIZE);

    // Publish the settings to the job configuration; only explicit values are written.
    getConf().setLong(GENERATOR_CUR_TIME, effectiveTime);
    if (limit != null) {
      getConf().setLong(GENERATOR_TOP_N, limit);
    }
    if (applyFilters != null) {
      getConf().setBoolean(GENERATOR_FILTER, applyFilters);
    }
    getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
    if (applyNormalizers != null) {
      getConf().setBoolean(GENERATOR_NORMALISE, applyNormalizers);
    }

    // Derive the partition mode from the count mode; anything that is neither "host" nor
    // "domain" is reported and reset to the host mode.
    final String countMode = getConf().get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
    final String partitionMode;
    if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(countMode)) {
      partitionMode = URLPartitioner.PARTITION_MODE_DOMAIN;
    } else {
      LOG.warn(
          "Unknown generator.max.count mode '"
              + countMode
              + "', using mode="
              + GENERATOR_COUNT_VALUE_HOST);
      getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
      partitionMode = URLPartitioner.PARTITION_MODE_HOST;
    }
    getConf().set(URLPartitioner.PARTITION_MODE_KEY, partitionMode);

    // Single-job tool: reset the progress-tracking fields, then build and run the job.
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
    final Collection<WebPage.Field> requiredFields = getFields(currentJob);
    StorageUtils.initMapperJob(
        currentJob,
        requiredFields,
        SelectorEntry.class,
        WebPage.class,
        GeneratorMapper.class,
        SelectorEntryPartitioner.class,
        true);
    StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
    currentJob.waitForCompletion(true);

    // Collect the outcome: job status, batch id and the GENERATE_MARK counter value.
    ToolUtil.recordJobStatus(null, currentJob, results);
    results.put(BATCH_ID, getConf().get(BATCH_ID));
    final long markedForFetch =
        currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue();
    results.put(GENERATE_COUNT, markedForFetch);
    return results;
  }
Пример #3
0
 /**
  * Runs the inject job on {@code urlDir} and logs the start time, the seed directory, the
  * finish time and the elapsed time.
  *
  * @param urlDir directory passed to {@code run} as the {@link Nutch#ARG_SEEDDIR} argument
  * @throws Exception if the underlying job fails
  */
 public void inject(Path urlDir) throws Exception {
   final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
   final long startedAt = System.currentTimeMillis();
   LOG.info("InjectorJob: starting at " + timestampFormat.format(startedAt));
   LOG.info("InjectorJob: Injecting urlDir: " + urlDir);

   run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));

   final long finishedAt = System.currentTimeMillis();
   // Assemble the closing message piece by piece; the text matches the usual concatenation.
   final StringBuilder summary = new StringBuilder("Injector: finished at ");
   summary.append(timestampFormat.format(finishedAt));
   summary.append(", elapsed: ");
   summary.append(TimingUtil.elapsedTime(startedAt, finishedAt));
   LOG.info(summary.toString());
 }
Пример #4
0
  /**
   * Configures and runs the inject job over the seed directory supplied in {@code args}, then
   * logs how many URLs were injected and how many were rejected by filters.
   *
   * <p>The seed directory is read from {@code args} under {@link Nutch#ARG_SEEDDIR}. It may be
   * given either as a {@code Path} or as any object whose {@code toString()} yields the path.
   * The job maps the input with {@code UrlMapper} and writes {@code String -> WebPage} pairs
   * through {@code GoraOutputFormat} into the store returned by
   * {@code StorageUtils.createWebStore}.
   *
   * @param args argument map; must contain a non-null entry for {@link Nutch#ARG_SEEDDIR}
   * @return the {@code results} map after the job status has been recorded into it
   * @throws IllegalArgumentException if the seed directory argument is missing
   * @throws Exception if job setup or execution fails
   */
  public Map<String, Object> run(Map<String, Object> args) throws Exception {
    // Validate up front: previously a missing seed directory surfaced as an anonymous
    // NullPointerException from path.toString().
    Object path = args.get(Nutch.ARG_SEEDDIR);
    if (path == null) {
      throw new IllegalArgumentException(
          "InjectorJob: missing required argument: " + Nutch.ARG_SEEDDIR + " (seed directory)");
    }
    Path input;
    if (path instanceof Path) {
      input = (Path) path;
    } else {
      input = new Path(path.toString());
    }

    // Timestamp published to the job configuration under "injector.current.time".
    getConf().setLong("injector.current.time", System.currentTimeMillis());

    // This tool runs exactly one job; reset the progress-tracking fields accordingly.
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "inject " + input);
    FileInputFormat.addInputPath(currentJob, input);
    currentJob.setMapperClass(UrlMapper.class);
    currentJob.setMapOutputKeyClass(String.class);
    currentJob.setMapOutputValueClass(WebPage.class);
    currentJob.setOutputFormatClass(GoraOutputFormat.class);

    DataStore<String, WebPage> store =
        StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
    GoraOutputFormat.setOutput(currentJob, store, true);

    // NUTCH-1471 Make explicit which datastore class we use
    Class<? extends DataStore<Object, Persistent>> dataStoreClass =
        StorageUtils.getDataStoreClass(currentJob.getConfiguration());
    LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");

    // A reducer class is registered, but the number of reduce tasks is set to zero.
    currentJob.setReducerClass(Reducer.class);
    currentJob.setNumReduceTasks(0);

    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);

    // NUTCH-1370 Make explicit #URLs injected @runtime
    long urlsInjected =
        currentJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered =
        currentJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "InjectorJob: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    return results;
  }
Пример #5
0
  /**
   * Runs the generate job with the given settings and reports the resulting batch.
   *
   * <p>Progress is logged before and after the run: start time, the filter/normalize flags,
   * {@code topN} (only when it differs from {@code Long.MAX_VALUE}), finish time, elapsed time
   * and the number of URLs in the batch.
   *
   * @param topN value passed to {@code run} as {@link Nutch#ARG_TOPN}
   * @param curTime value passed to {@code run} as {@link Nutch#ARG_CURTIME}
   * @param filter value passed to {@code run} as {@link Nutch#ARG_FILTER}
   * @param norm value passed to {@code run} as {@link Nutch#ARG_NORMALIZE}
   * @return the batch id read from the configuration, or {@code null} when the job reported
   *     zero generated URLs
   * @throws Exception if the underlying job fails
   */
  public String generate(long topN, long curTime, boolean filter, boolean norm) throws Exception {
    final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    final long startedAt = System.currentTimeMillis();

    LOG.info("GeneratorJob: starting at " + timestampFormat.format(startedAt));
    LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
    LOG.info("GeneratorJob: starting");
    LOG.info("GeneratorJob: filtering: " + filter);
    LOG.info("GeneratorJob: normalizing: " + norm);
    // Long.MAX_VALUE is not logged as a topN value.
    if (topN != Long.MAX_VALUE) {
      LOG.info("GeneratorJob: topN: " + topN);
    }

    final Map<String, Object> jobArgs =
        ToolUtil.toArgMap(
            Nutch.ARG_TOPN, topN,
            Nutch.ARG_CURTIME, curTime,
            Nutch.ARG_FILTER, filter,
            Nutch.ARG_NORMALIZE, norm);
    final Map<String, Object> outcome = run(jobArgs);

    final String generatedBatch = getConf().get(BATCH_ID);
    final long finishedAt = System.currentTimeMillis();
    final long urlCount = (Long) outcome.get(GENERATE_COUNT);

    final StringBuilder timing = new StringBuilder("GeneratorJob: finished at ");
    timing.append(timestampFormat.format(finishedAt));
    timing.append(", time elapsed: ");
    timing.append(TimingUtil.elapsedTime(startedAt, finishedAt));
    LOG.info(timing.toString());
    LOG.info(
        "GeneratorJob: generated batch id: "
            + generatedBatch
            + " containing "
            + urlCount
            + " URLs");

    // An empty batch is signalled to the caller with null rather than its id.
    return (urlCount == 0) ? null : generatedBatch;
  }
Пример #6
0
  /**
   * Logs the start of the injection and runs the inject job on {@code urlDir}.
   *
   * @param urlDir directory passed to {@code run} as the {@link Nutch#ARG_SEEDDIR} argument
   * @throws Exception if the underlying job fails
   */
  public void inject(Path urlDir) throws Exception {
    LOG.info("InjectorJob: starting");
    LOG.info("InjectorJob: urlDir: " + urlDir);

    final Map<String, Object> seedArgs = ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir);
    run(seedArgs);
  }