/**
 * Runs the inject job for the seed directory supplied in {@code args}.
 *
 * <p>The seed directory is read from {@code args} under {@link Nutch#ARG_SEEDDIR}. It may be
 * a {@code Path} instance; any other non-null object is converted through its
 * {@code toString()} value. The job is configured with {@code UrlMapper} as mapper, zero
 * reduce tasks, and {@code GoraOutputFormat} writing to the web store created by
 * {@code StorageUtils.createWebStore}. The call blocks until the job completes and its status
 * is recorded into {@code results}.
 *
 * @param args job arguments; must contain a non-null value for {@link Nutch#ARG_SEEDDIR}
 * @return the {@code results} map after the job status has been recorded
 * @throws IllegalArgumentException if no seed directory is present in {@code args}
 * @throws Exception if job setup or execution fails
 */
public Map<String, Object> run(Map<String, Object> args) throws Exception {
  // Validate before touching the configuration: a missing seed dir previously surfaced
  // as an anonymous NullPointerException from path.toString().
  Object path = args.get(Nutch.ARG_SEEDDIR);
  if (path == null) {
    throw new IllegalArgumentException("Missing required argument: " + Nutch.ARG_SEEDDIR);
  }
  Path input = (path instanceof Path) ? (Path) path : new Path(path.toString());

  getConf().setLong("injector.current.time", System.currentTimeMillis());

  numJobs = 1;
  currentJobNum = 0;
  currentJob = new NutchJob(getConf(), "inject " + input);

  // Input side: read seed URLs from the given path and map them to WebPage rows.
  FileInputFormat.addInputPath(currentJob, input);
  currentJob.setMapperClass(UrlMapper.class);
  currentJob.setMapOutputKeyClass(String.class);
  currentJob.setMapOutputValueClass(WebPage.class);

  // Output side: write through Gora into the web store.
  currentJob.setOutputFormatClass(GoraOutputFormat.class);
  DataStore<String, WebPage> store =
      StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
  GoraOutputFormat.setOutput(currentJob, store, true);

  // A reducer class is registered, but the number of reduce tasks is set to zero.
  currentJob.setReducerClass(Reducer.class);
  currentJob.setNumReduceTasks(0);

  currentJob.waitForCompletion(true);
  ToolUtil.recordJobStatus(null, currentJob, results);
  return results;
}
public Map<String, Object> run(Map<String, Object> args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); if (batchId != null) { getConf().set(GeneratorJob.BATCH_ID, batchId); } // map to inverted subset due for fetch, sort by score Long topN = (Long) args.get(Nutch.ARG_TOPN); Long curTime = (Long) args.get(Nutch.ARG_CURTIME); if (curTime == null) { curTime = System.currentTimeMillis(); } Boolean filter = (Boolean) args.get(Nutch.ARG_FILTER); Boolean norm = (Boolean) args.get(Nutch.ARG_NORMALIZE); // map to inverted subset due for fetch, sort by score getConf().setLong(GENERATOR_CUR_TIME, curTime); if (topN != null) getConf().setLong(GENERATOR_TOP_N, topN); if (filter != null) getConf().setBoolean(GENERATOR_FILTER, filter); getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis()); if (norm != null) getConf().setBoolean(GENERATOR_NORMALISE, norm); String mode = getConf().get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST); if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(mode)) { getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(mode)) { getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_DOMAIN); } else { LOG.warn( "Unknown generator.max.count mode '" + mode + "', using mode=" + GENERATOR_COUNT_VALUE_HOST); getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST); getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); } numJobs = 1; currentJobNum = 0; currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID)); Collection<WebPage.Field> fields = getFields(currentJob); StorageUtils.initMapperJob( currentJob, fields, SelectorEntry.class, WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true); StorageUtils.initReducerJob(currentJob, GeneratorReducer.class); currentJob.waitForCompletion(true); ToolUtil.recordJobStatus(null, currentJob, 
results); results.put(BATCH_ID, getConf().get(BATCH_ID)); long generateCount = currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue(); results.put(GENERATE_COUNT, generateCount); return results; }
/**
 * Injects the seed URLs found under {@code urlDir}, logging start and finish timestamps and
 * the elapsed time around the call to {@code run}.
 *
 * @param urlDir seed directory, passed to {@code run} under {@link Nutch#ARG_SEEDDIR}
 * @throws Exception if {@code run} fails; the finish message is not logged in that case
 */
public void inject(Path urlDir) throws Exception {
  final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

  final long startedAt = System.currentTimeMillis();
  LOG.info("InjectorJob: starting at " + timestampFormat.format(startedAt));
  LOG.info("InjectorJob: Injecting urlDir: " + urlDir);

  run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));

  final long finishedAt = System.currentTimeMillis();
  // NOTE(review): this message uses the prefix "Injector:" while the ones above use
  // "InjectorJob:"; left untouched so the emitted log text does not change.
  StringBuilder summary = new StringBuilder("Injector: finished at ");
  summary.append(timestampFormat.format(finishedAt));
  summary.append(", elapsed: ");
  summary.append(TimingUtil.elapsedTime(startedAt, finishedAt));
  LOG.info(summary.toString());
}
public Map<String, Object> run(Map<String, Object> args) throws Exception { getConf().setLong("injector.current.time", System.currentTimeMillis()); Path input; Object path = args.get(Nutch.ARG_SEEDDIR); if (path instanceof Path) { input = (Path) path; } else { input = new Path(path.toString()); } numJobs = 1; currentJobNum = 0; currentJob = new NutchJob(getConf(), "inject " + input); FileInputFormat.addInputPath(currentJob, input); currentJob.setMapperClass(UrlMapper.class); currentJob.setMapOutputKeyClass(String.class); currentJob.setMapOutputValueClass(WebPage.class); currentJob.setOutputFormatClass(GoraOutputFormat.class); DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class); GoraOutputFormat.setOutput(currentJob, store, true); // NUTCH-1471 Make explicit which datastore class we use Class<? extends DataStore<Object, Persistent>> dataStoreClass = StorageUtils.getDataStoreClass(currentJob.getConfiguration()); LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class."); currentJob.setReducerClass(Reducer.class); currentJob.setNumReduceTasks(0); currentJob.waitForCompletion(true); ToolUtil.recordJobStatus(null, currentJob, results); // NUTCH-1370 Make explicit #URLs injected @runtime long urlsInjected = currentJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = currentJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "InjectorJob: total number of urls injected after normalization and filtering: " + urlsInjected); return results; }
/**
 * Marks URLs ready for fetching by running the generate job, logging the chosen options and
 * timing information around it.
 *
 * @param topN value forwarded to {@code run} under {@link Nutch#ARG_TOPN}; it is logged only
 *     when it differs from {@code Long.MAX_VALUE}
 * @param curTime value forwarded to {@code run} under {@link Nutch#ARG_CURTIME}
 * @param filter value forwarded to {@code run} under {@link Nutch#ARG_FILTER}
 * @param norm value forwarded to {@code run} under {@link Nutch#ARG_NORMALIZE}
 * @return the batch id read from the configuration after the job, or {@code null} when the
 *     reported generate count is zero
 * @throws Exception if {@code run} fails
 */
public String generate(long topN, long curTime, boolean filter, boolean norm) throws Exception {
  final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  final long startedAt = System.currentTimeMillis();

  LOG.info("GeneratorJob: starting at " + timestampFormat.format(startedAt));
  LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
  LOG.info("GeneratorJob: starting");
  LOG.info("GeneratorJob: filtering: " + filter);
  LOG.info("GeneratorJob: normalizing: " + norm);
  if (topN != Long.MAX_VALUE) {
    LOG.info("GeneratorJob: topN: " + topN);
  }

  Map<String, Object> jobArgs =
      ToolUtil.toArgMap(
          Nutch.ARG_TOPN, topN,
          Nutch.ARG_CURTIME, curTime,
          Nutch.ARG_FILTER, filter,
          Nutch.ARG_NORMALIZE, norm);
  Map<String, Object> outcome = run(jobArgs);

  final String generatedBatch = getConf().get(BATCH_ID);
  final long finishedAt = System.currentTimeMillis();
  final long urlCount = ((Long) outcome.get(GENERATE_COUNT)).longValue();

  StringBuilder timing = new StringBuilder("GeneratorJob: finished at ");
  timing.append(timestampFormat.format(finishedAt));
  timing.append(", time elapsed: ");
  timing.append(TimingUtil.elapsedTime(startedAt, finishedAt));
  LOG.info(timing.toString());

  LOG.info(
      "GeneratorJob: generated batch id: " + generatedBatch + " containing " + urlCount + " URLs");

  // An empty batch is signalled to the caller with null rather than a batch id.
  return (urlCount == 0) ? null : generatedBatch;
}
/**
 * Injects the seed URLs found under {@code urlDir} by delegating to {@code run}.
 *
 * @param urlDir seed directory, passed to {@code run} under {@link Nutch#ARG_SEEDDIR}
 * @throws Exception if {@code run} fails
 */
public void inject(Path urlDir) throws Exception {
  LOG.info("InjectorJob: starting");

  // String.valueOf renders a null directory as "null", matching plain concatenation.
  final String seedDirMessage = "InjectorJob: urlDir: ".concat(String.valueOf(urlDir));
  LOG.info(seedDirMessage);

  run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));
}