private static Map<INDEXFIELD, Tokenizer> initMap(Properties props) {
    HashMap<INDEXFIELD, Tokenizer> map =
            new HashMap<INDEXFIELD, Tokenizer>(INDEXFIELD.values().length);
    TokenizerFactory fact = TokenizerFactory.getInstance(props);

    for (INDEXFIELD fld : INDEXFIELD.values()) {
        map.put(fld, fact.getTokenizer(fld));
    }

    return map;
}
private static void tokenizeAndIndex(Properties properties,
        ConcurrentLinkedQueue<WikipediaDocument> queue) throws InterruptedException {
    /*
     * 1. Start the tokenizer (producer) and parser-checker threads against a
     *    shared thread pool wrapped in a CompletionService.
     * 2. Drain completed IndexableDocuments from the completion service and
     *    route each field's token map to the matching indexer.
     * 3. Flush the indexers and the shared document dictionary once every
     *    tokenizer task has been consumed.
     */
    ExecutorService threadPool = Executors.newFixedThreadPool(Integer.valueOf(
            properties.get(IndexerConstants.NUM_TOKENIZER_THREADS).toString()));
    CompletionService<IndexableDocument> pool =
            new ExecutorCompletionService<IndexableDocument>(threadPool);
    // Cast so we can track how many tokenizer tasks have been submitted so far.
    ThreadPoolExecutor tpe = (ThreadPoolExecutor) threadPool;

    tokenizerThread = new Thread(new TokenizerRunner(queue, pool, properties));
    tokenizerThread.start();
    new Thread(new ParserChecker(queue)).start();

    // Give the tokenizer a head start so the completion service has work queued.
    Thread.sleep(2000);

    long completed = 0, totalTasks = tpe.getTaskCount();
    long remaining = totalTasks - completed;
    IndexableDocument idoc = null;

    SharedDictionary docDict = new SharedDictionary(properties, INDEXFIELD.LINK);
    int currDocId;

    ThreadedIndexerRunner termRunner = new ThreadedIndexerRunner(properties);
    SingleIndexerRunner authIdxer = new SingleIndexerRunner(properties,
            INDEXFIELD.AUTHOR, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner catIdxer = new SingleIndexerRunner(properties,
    //         INDEXFIELD.CATEGORY, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner linkIdxer = new SingleIndexerRunner(properties,
    //         INDEXFIELD.LINK, INDEXFIELD.LINK, docDict, true);
    Map<String, Integer> tokenmap;

    try {
        while (remaining > 0) {
            idoc = pool.take().get();
            if (idoc != null) {
                currDocId = docDict.lookup(idoc.getDocumentIdentifier());
                TokenStream stream;

                try {
                    for (INDEXFIELD fld : INDEXFIELD.values()) {
                        stream = idoc.getStream(fld);
                        if (stream != null) {
                            tokenmap = stream.getTokenMap();
                            if (tokenmap != null) {
                                switch (fld) {
                                case TERM:
                                    termRunner.addToIndex(tokenmap, currDocId);
                                    break;
                                case AUTHOR:
                                    authIdxer.processTokenMap(currDocId, tokenmap);
                                    break;
                                case CATEGORY:
                                    // catIdxer.processTokenMap(currDocId, tokenmap);
                                    break;
                                case LINK:
                                    // linkIdxer.processTokenMap(currDocId, tokenmap);
                                    break;
                                }
                            }
                        }
                    }
                } catch (IndexerException e) {
                    // Skip this document but keep indexing the rest.
                    e.printStackTrace();
                }
            }

            completed++;
            // The tokenizer keeps submitting tasks while it is alive, so refresh the total.
            if (tokenizerThread.isAlive()) {
                totalTasks = tpe.getTaskCount();
            }
            remaining = totalTasks - completed;
        }
    } catch (ExecutionException e1) {
        e1.printStackTrace();
    }

    try {
        termRunner.cleanup();
        authIdxer.cleanup();
        // catIdxer.cleanup();
        // linkIdxer.cleanup();
        docDict.writeToDisk();
        docDict.cleanUp();
    } catch (IndexerException e) {
        e.printStackTrace();
    }

    // Wait until both indexer runners have drained their queues before shutting down.
    while (!(termRunner.isFinished() && authIdxer.isFinished())) {
        Thread.sleep(1000);
    }

    threadPool.shutdown();
}
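
/*
 * Hedged sketch, not part of the original source: one plausible shape for the
 * TokenizerRunner started above, assuming it is the producer side of the
 * completion service -- it drains parsed WikipediaDocuments from the shared
 * queue and submits one tokenization task per document. Only the constructor
 * arguments (queue, pool, properties) are taken from the call site in
 * tokenizeAndIndex(); the run() loop and the buildIndexableDocument() helper
 * are illustrative assumptions, since the real Tokenizer / IndexableDocument
 * API is not shown in this file. Requires java.util.concurrent.Callable.
 */
private static class TokenizerRunnerSketch implements Runnable {
    private final ConcurrentLinkedQueue<WikipediaDocument> queue;
    private final CompletionService<IndexableDocument> pool;
    private final Map<INDEXFIELD, Tokenizer> tokenizers;

    TokenizerRunnerSketch(ConcurrentLinkedQueue<WikipediaDocument> queue,
            CompletionService<IndexableDocument> pool, Properties props) {
        this.queue = queue;
        this.pool = pool;
        this.tokenizers = initMap(props); // one Tokenizer per field, built once per runner
    }

    public void run() {
        WikipediaDocument doc;
        // Keep submitting tokenization tasks; the consumer in tokenizeAndIndex()
        // refreshes getTaskCount() while this thread is alive.
        while ((doc = queue.poll()) != null) {
            final WikipediaDocument current = doc;
            pool.submit(new Callable<IndexableDocument>() {
                public IndexableDocument call() throws Exception {
                    return buildIndexableDocument(current, tokenizers);
                }
            });
        }
    }

    /** Hypothetical helper: tokenize each field of the document with its Tokenizer. */
    private static IndexableDocument buildIndexableDocument(WikipediaDocument doc,
            Map<INDEXFIELD, Tokenizer> tokenizers) {
        throw new UnsupportedOperationException(
                "Depends on the project's Tokenizer / IndexableDocument API");
    }
}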