/**
 * Converts a Nutch document into a {@link SolrInputDocument}, buffers it, and
 * sends the buffer to Solr once it holds at least {@code commitSize} documents.
 *
 * <p>Every value of every field is added under the name returned by
 * {@code solrMapping.mapKey}. Values of the {@code content} and {@code title}
 * fields are first passed through {@link SolrUtils#stripNonCharCodepoints}.
 * When {@code solrMapping.mapCopyKey} yields a name that differs from the
 * source field name, the value is additionally added under that name.
 *
 * @param doc the document to index; its score is used as the document boost
 * @throws IOException if the mapping lookup fails or if sending the buffered
 *         documents to Solr raises a {@link SolrServerException} (wrapped as
 *         the cause)
 */
@Override
public void write(NutchDocument doc) throws IOException {
  final SolrInputDocument inputDoc = new SolrInputDocument();
  for (final Entry<String, List<String>> e : doc) {
    final String key = e.getKey();
    // The strip decision depends only on the field name, so make it once per field.
    final boolean strip = key.equals("content") || key.equals("title");
    for (final String val : e.getValue()) {
      final Object fieldValue = strip ? SolrUtils.stripNonCharCodepoints(val) : val;
      inputDoc.addField(solrMapping.mapKey(key), fieldValue);

      final String copyKey = solrMapping.mapCopyKey(key);
      // Compare by value, not by reference: an equal-but-distinct String must not
      // be treated as a separate copy target, or the value would be added twice.
      if (copyKey != null && !copyKey.equals(key)) {
        inputDoc.addField(copyKey, fieldValue);
      }
    }
  }
  inputDoc.setDocumentBoost(doc.getScore());
  inputDocs.add(inputDoc);

  if (inputDocs.size() >= commitSize) {
    try {
      LOG.info("Adding " + inputDocs.size() + " documents");
      solr.add(inputDocs);
    } catch (final SolrServerException e) {
      throw new IOException(e);
    }
    // Reached only when the add succeeded; on failure the buffer is left intact.
    inputDocs.clear();
  }
}
/**
 * Prepares this writer from the configuration of the given task attempt.
 *
 * <p>In order: obtains the Solr server client via
 * {@link SolrUtils#getCommonsHttpSolrServer}, reads the value stored under
 * {@link SolrConstants#COMMIT_SIZE} (falling back to 1000 when unset), and
 * obtains the field mapping from {@link SolrMappingReader#getInstance}.
 *
 * @param job the task attempt whose configuration supplies the settings
 * @throws IOException declared by the interface; may be raised by the helpers
 *         called here
 */
@Override
public void open(TaskAttemptContext job) throws IOException {
  final Configuration configuration = job.getConfiguration();

  this.solr = SolrUtils.getCommonsHttpSolrServer(configuration);
  this.commitSize = configuration.getInt(SolrConstants.COMMIT_SIZE, 1000);
  this.solrMapping = SolrMappingReader.getInstance(configuration);
}
public void indexSolr( String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException { filter = false; SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("SolrIndexer: starting at " + sdf.format(start)); final JobConf job = new NutchJob(getConf()); job.setJobName("index-solr " + solrUrl); LOG.info("SolrIndexer: deleting gone documents: " + deleteGone); LOG.info("SolrIndexer: URL filtering: " + filter); LOG.info("SolrIndexer: URL normalizing: " + normalize); IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job); job.set(SolrConstants.SERVER_URL, solrUrl); job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone); job.setBoolean(IndexerMapReduce.URL_FILTERING, filter); job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize); if (solrParams != null) { job.set(SolrConstants.PARAMS, solrParams); } NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class); job.setReduceSpeculativeExecution(false); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); // do the commits once and for all the reducers in one go SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job); if (!noCommit) { solr.commit(); } long end = System.currentTimeMillis(); LOG.info( "SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } catch (Exception e) { LOG.error(e.toString()); } finally { FileSystem.get(job).delete(tmp, true); } }