public void inject(Path crawlDb, Path urlDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: starting at " + sdf.format(start));
    LOG.info("Injector: crawlDb: " + crawlDb);
    LOG.info("Injector: urlDir: " + urlDir);
  }

  Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // map text input file to a <url,CrawlDatum> file
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Converting injected urls to crawl db entries.");
  }
  JobConf sortJob = new NutchJob(getConf());
  sortJob.setJobName("inject " + urlDir);
  FileInputFormat.addInputPath(sortJob, urlDir);
  sortJob.setMapperClass(InjectMapper.class);

  FileOutputFormat.setOutputPath(sortJob, tempDir);
  sortJob.setOutputFormat(SequenceFileOutputFormat.class);
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(CrawlDatum.class);
  sortJob.setLong("injector.current.time", System.currentTimeMillis());
  RunningJob mapJob = JobClient.runJob(sortJob);

  long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
  LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info("Injector: total number of urls injected after normalization and filtering: "
      + urlsInjected);

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Merging injected urls into crawl db.");
  }
  JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(InjectReducer.class);
  JobClient.runJob(mergeJob);
  CrawlDb.install(mergeJob, crawlDb);

  // clean up
  FileSystem fs = FileSystem.get(getConf());
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
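/*
 * Usage sketch, not part of the original source: a minimal driver that seeds a
 * fresh CrawlDb through the inject() method above. It assumes the Injector
 * class (matching the log prefix) is in scope; its Configuration-based
 * constructor and the "crawl/crawldb" and "urls" paths are assumptions made
 * for illustration only.
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;

public class InjectSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();  // loads nutch-default.xml / nutch-site.xml
    Injector injector = new Injector(conf);            // assumed constructor
    injector.inject(new Path("crawl/crawldb"),         // CrawlDb to create or update
        new Path("urls"));                             // directory of plain-text seed URL files
  }
}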
public void inject(Path urlDir) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("InjectorJob: starting at " + sdf.format(start));
  LOG.info("InjectorJob: Injecting urlDir: " + urlDir);

  run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));

  long end = System.currentTimeMillis();
  LOG.info("InjectorJob: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
/**
 * Mark URLs ready for fetching.
 *
 * @param topN    maximum number of URLs to select (Long.MAX_VALUE for no limit)
 * @param curTime current time, used to decide which URLs are due for fetching
 * @param filter  whether to run URLs through the configured filters
 * @param norm    whether to normalize URLs before selection
 * @return the generated batch id, or null if no URLs were selected
 */
public String generate(long topN, long curTime, boolean filter, boolean norm) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("GeneratorJob: starting at " + sdf.format(start));
  LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
  LOG.info("GeneratorJob: filtering: " + filter);
  LOG.info("GeneratorJob: normalizing: " + norm);
  if (topN != Long.MAX_VALUE) {
    LOG.info("GeneratorJob: topN: " + topN);
  }

  Map<String, Object> results = run(ToolUtil.toArgMap(
      Nutch.ARG_TOPN, topN,
      Nutch.ARG_CURTIME, curTime,
      Nutch.ARG_FILTER, filter,
      Nutch.ARG_NORMALIZE, norm));

  String batchId = getConf().get(BATCH_ID);
  long finish = System.currentTimeMillis();
  long generateCount = (Long) results.get(GENERATE_COUNT);
  LOG.info("GeneratorJob: finished at " + sdf.format(finish) + ", time elapsed: "
      + TimingUtil.elapsedTime(start, finish));
  LOG.info("GeneratorJob: generated batch id: " + batchId + " containing " + generateCount
      + " URLs");
  if (generateCount == 0) {
    return null;
  }
  return batchId;
}
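/*
 * Usage sketch, not part of the original source: selecting a fetch batch with
 * the generate() method above. It assumes the GeneratorJob class (matching the
 * log prefix) is in scope; the no-arg constructor, the setConf() call, and the
 * topN value are assumptions made for illustration. A null return means
 * nothing is due for fetching; otherwise the batch id identifies the rows to
 * fetch next.
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class GenerateSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    GeneratorJob generator = new GeneratorJob();      // assumed class and constructor
    generator.setConf(conf);
    String batchId = generator.generate(
        50000L,                       // topN: cap on URLs selected this cycle
        System.currentTimeMillis(),   // curTime: select URLs due for fetch as of now
        true,                         // filter: apply configured URL filters
        true);                        // norm: normalize URLs before selection
    if (batchId == null) {
      System.out.println("GeneratorJob selected no URLs; nothing to fetch.");
    } else {
      System.out.println("Generated batch id: " + batchId);
    }
  }
}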
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments,
    boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
    throws IOException {
  // URL filtering is forced off here, regardless of the value passed in.
  filter = false;

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrIndexer: starting at " + sdf.format(start));

  final JobConf job = new NutchJob(getConf());
  job.setJobName("index-solr " + solrUrl);

  LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
  LOG.info("SolrIndexer: URL filtering: " + filter);
  LOG.info("SolrIndexer: URL normalizing: " + normalize);

  IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
  job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
  job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);

  if (solrParams != null) {
    job.set(SolrConstants.PARAMS, solrParams);
  }

  NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

  job.setReduceSpeculativeExecution(false);

  final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

  FileOutputFormat.setOutputPath(job, tmp);
  try {
    JobClient.runJob(job);
    // do the commits once and for all the reducers in one go
    SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
    if (!noCommit) {
      solr.commit();
    }
    long end = System.currentTimeMillis();
    LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  } catch (Exception e) {
    LOG.error(e.toString());
  } finally {
    FileSystem.get(job).delete(tmp, true);
  }
}
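/*
 * Usage sketch, not part of the original source: pushing one crawled segment to
 * Solr with the indexSolr() method above. It assumes the SolrIndexer class
 * (matching the log prefix) is in scope; its Configuration-based constructor,
 * the Solr URL, and the example paths are assumptions made for illustration.
 * Note that the method body forces filter to false, so the filter argument
 * passed here has no effect.
 */
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;

public class IndexSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    SolrIndexer indexer = new SolrIndexer(conf);        // assumed constructor
    indexer.indexSolr(
        "http://localhost:8983/solr",                   // Solr endpoint (assumption)
        new Path("crawl/crawldb"),                      // CrawlDb with fetch status and scores
        new Path("crawl/linkdb"),                       // LinkDb with inlink anchor text
        Arrays.asList(new Path("crawl/segments/20240101000000")), // segment(s) to index; example name
        false,                                          // noCommit: false, so commit after the job
        true,                                           // deleteGone: remove gone/redirected docs
        null,                                           // solrParams: no extra request parameters
        false,                                          // filter (ignored: forced to false above)
        true);                                          // normalize URLs before indexing
  }
}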