public Map<String, Object> run(Map<String, Object> args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); if (batchId != null) { getConf().set(GeneratorJob.BATCH_ID, batchId); } // map to inverted subset due for fetch, sort by score Long topN = (Long) args.get(Nutch.ARG_TOPN); Long curTime = (Long) args.get(Nutch.ARG_CURTIME); if (curTime == null) { curTime = System.currentTimeMillis(); } Boolean filter = (Boolean) args.get(Nutch.ARG_FILTER); Boolean norm = (Boolean) args.get(Nutch.ARG_NORMALIZE); // map to inverted subset due for fetch, sort by score getConf().setLong(GENERATOR_CUR_TIME, curTime); if (topN != null) getConf().setLong(GENERATOR_TOP_N, topN); if (filter != null) getConf().setBoolean(GENERATOR_FILTER, filter); getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis()); if (norm != null) getConf().setBoolean(GENERATOR_NORMALISE, norm); String mode = getConf().get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST); if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(mode)) { getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(mode)) { getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_DOMAIN); } else { LOG.warn( "Unknown generator.max.count mode '" + mode + "', using mode=" + GENERATOR_COUNT_VALUE_HOST); getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST); getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); } numJobs = 1; currentJobNum = 0; currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID)); Collection<WebPage.Field> fields = getFields(currentJob); StorageUtils.initMapperJob( currentJob, fields, SelectorEntry.class, WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true); StorageUtils.initReducerJob(currentJob, GeneratorReducer.class); currentJob.waitForCompletion(true); ToolUtil.recordJobStatus(null, currentJob, 
results); results.put(BATCH_ID, getConf().get(BATCH_ID)); long generateCount = currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue(); results.put(GENERATE_COUNT, generateCount); return results; }
/**
 * Runs the inject job over the seed location supplied in {@code args}.
 *
 * <p>The seed location is read from {@code Nutch.ARG_SEEDDIR}; it may be a {@code Path} or any
 * other object whose {@code toString()} yields the path. The job uses {@code UrlMapper} as its
 * mapper, {@code GoraOutputFormat} backed by the web store as its output, and is configured
 * with zero reduce tasks.
 *
 * @param args tool arguments; must contain a non-null value for {@code Nutch.ARG_SEEDDIR}
 * @return the {@code results} map after the job status has been recorded in it
 * @throws IllegalArgumentException if {@code args} holds no value for
 *     {@code Nutch.ARG_SEEDDIR}
 * @throws Exception if configuring or running the job fails
 */
public Map<String, Object> run(Map<String, Object> args) throws Exception {
  getConf().setLong("injector.current.time", System.currentTimeMillis());

  // Fail with a descriptive message instead of a bare NullPointerException when the
  // seed directory argument is missing.
  Object seedDir = args.get(Nutch.ARG_SEEDDIR);
  if (seedDir == null) {
    throw new IllegalArgumentException(
        "Missing required argument '" + Nutch.ARG_SEEDDIR + "' (seed directory)");
  }
  Path input = (seedDir instanceof Path) ? (Path) seedDir : new Path(seedDir.toString());

  numJobs = 1;
  currentJobNum = 0;
  currentJob = new NutchJob(getConf(), "inject " + input);

  // Input side: seed location read by UrlMapper.
  FileInputFormat.addInputPath(currentJob, input);
  currentJob.setMapperClass(UrlMapper.class);
  currentJob.setMapOutputKeyClass(String.class);
  currentJob.setMapOutputValueClass(WebPage.class);

  // Output side: records go to the web store through GoraOutputFormat.
  currentJob.setOutputFormatClass(GoraOutputFormat.class);
  DataStore<String, WebPage> store =
      StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
  GoraOutputFormat.setOutput(currentJob, store, true);

  // A reducer class is set, but zero reduce tasks are requested.
  currentJob.setReducerClass(Reducer.class);
  currentJob.setNumReduceTasks(0);

  currentJob.waitForCompletion(true);
  ToolUtil.recordJobStatus(null, currentJob, results);
  return results;
}
public Map<String, Object> run(Map<String, Object> args) throws Exception { getConf().setLong("injector.current.time", System.currentTimeMillis()); Path input; Object path = args.get(Nutch.ARG_SEEDDIR); if (path instanceof Path) { input = (Path) path; } else { input = new Path(path.toString()); } numJobs = 1; currentJobNum = 0; currentJob = new NutchJob(getConf(), "inject " + input); FileInputFormat.addInputPath(currentJob, input); currentJob.setMapperClass(UrlMapper.class); currentJob.setMapOutputKeyClass(String.class); currentJob.setMapOutputValueClass(WebPage.class); currentJob.setOutputFormatClass(GoraOutputFormat.class); DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class); GoraOutputFormat.setOutput(currentJob, store, true); // NUTCH-1471 Make explicit which datastore class we use Class<? extends DataStore<Object, Persistent>> dataStoreClass = StorageUtils.getDataStoreClass(currentJob.getConfiguration()); LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class."); currentJob.setReducerClass(Reducer.class); currentJob.setNumReduceTasks(0); currentJob.waitForCompletion(true); ToolUtil.recordJobStatus(null, currentJob, results); // NUTCH-1370 Make explicit #URLs injected @runtime long urlsInjected = currentJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = currentJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "InjectorJob: total number of urls injected after normalization and filtering: " + urlsInjected); return results; }