/**
 * Configures and runs the inject job for the seed directory named in {@code args}.
 *
 * <p>Steps performed, in order:
 * <ol>
 *   <li>stores the current wall-clock time in the configuration under
 *       {@code "injector.current.time"};</li>
 *   <li>resolves the seed directory from {@code args} under {@code Nutch.ARG_SEEDDIR}
 *       (either a {@code Path} instance or any object whose {@code toString()} is used
 *       to build one);</li>
 *   <li>creates a {@code NutchJob} named {@code "inject " + input} with {@code UrlMapper}
 *       as mapper, {@code String}/{@code WebPage} as map output types and
 *       {@code GoraOutputFormat} as output format;</li>
 *   <li>waits for the job to finish and hands it to {@code ToolUtil.recordJobStatus}
 *       together with {@code results}.</li>
 * </ol>
 *
 * <p>{@code numJobs}, {@code currentJobNum}, {@code currentJob} and {@code results} are
 * not declared in this method; presumably they are fields of the enclosing tool class.
 *
 * @param args job arguments; must hold a non-null value under {@code Nutch.ARG_SEEDDIR}
 * @return the {@code results} map, after the job status has been recorded
 * @throws NullPointerException if {@code args} has no seed directory entry
 * @throws Exception if configuring or running the job fails
 */
public Map<String, Object> run(Map<String, Object> args) throws Exception {
  getConf().setLong("injector.current.time", System.currentTimeMillis());

  Object path = args.get(Nutch.ARG_SEEDDIR);
  if (path == null) {
    // Same exception type the unguarded path.toString() call would raise,
    // but with a message that tells the caller which argument is missing.
    throw new NullPointerException(
        "Missing seed directory argument: " + Nutch.ARG_SEEDDIR);
  }
  Path input = (path instanceof Path) ? (Path) path : new Path(path.toString());

  numJobs = 1;
  currentJobNum = 0;
  currentJob = new NutchJob(getConf(), "inject " + input);
  FileInputFormat.addInputPath(currentJob, input);
  currentJob.setMapperClass(UrlMapper.class);
  currentJob.setMapOutputKeyClass(String.class);
  currentJob.setMapOutputValueClass(WebPage.class);
  currentJob.setOutputFormatClass(GoraOutputFormat.class);

  DataStore<String, WebPage> store = StorageUtils.createWebStore(
      currentJob.getConfiguration(), String.class, WebPage.class);
  GoraOutputFormat.setOutput(currentJob, store, true);

  // A reducer class is registered, but the job is configured with zero reduce tasks.
  currentJob.setReducerClass(Reducer.class);
  currentJob.setNumReduceTasks(0);

  // The boolean returned by waitForCompletion is not inspected here; the job itself
  // is passed on to ToolUtil.recordJobStatus.
  currentJob.waitForCompletion(true);
  ToolUtil.recordJobStatus(null, currentJob, results);
  return results;
}
public Map<String, Object> run(Map<String, Object> args) throws Exception { getConf().setLong("injector.current.time", System.currentTimeMillis()); Path input; Object path = args.get(Nutch.ARG_SEEDDIR); if (path instanceof Path) { input = (Path) path; } else { input = new Path(path.toString()); } numJobs = 1; currentJobNum = 0; currentJob = new NutchJob(getConf(), "inject " + input); FileInputFormat.addInputPath(currentJob, input); currentJob.setMapperClass(UrlMapper.class); currentJob.setMapOutputKeyClass(String.class); currentJob.setMapOutputValueClass(WebPage.class); currentJob.setOutputFormatClass(GoraOutputFormat.class); DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class); GoraOutputFormat.setOutput(currentJob, store, true); // NUTCH-1471 Make explicit which datastore class we use Class<? extends DataStore<Object, Persistent>> dataStoreClass = StorageUtils.getDataStoreClass(currentJob.getConfiguration()); LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class."); currentJob.setReducerClass(Reducer.class); currentJob.setNumReduceTasks(0); currentJob.waitForCompletion(true); ToolUtil.recordJobStatus(null, currentJob, results); // NUTCH-1370 Make explicit #URLs injected @runtime long urlsInjected = currentJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = currentJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "InjectorJob: total number of urls injected after normalization and filtering: " + urlsInjected); return results; }