/**
 * Test method for
 * {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#merge(edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream)}.
 */
@Test
public void testMerge() {
    // merge with null
    TokenStream stream1 = new TokenStream("this");
    stream1.append("is", "a", "test", "stream");
    stream1.merge(null);
    assertEquals(5, stream1.getAllTokens().size());

    TokenStream stream2 = new TokenStream((String) null);
    stream1.merge(stream2);
    assertEquals(5, stream1.getAllTokens().size());
    stream2.merge(stream1);
    assertEquals(5, stream2.getAllTokens().size());
    stream1 = null;
    stream2 = null;

    // proper merge
    stream1 = new TokenStream("this");
    stream1.append("is", "a");
    stream2 = new TokenStream("test");
    stream2.append("stream");
    stream1.merge(stream2);
    assertEquals(5, stream1.getAllTokens().size());
    assertEquals(5, stream1.getTokenMap().size());
    assertEquals(2, stream2.getAllTokens().size());
    assertEquals(2, stream2.getTokenMap().size());
    assertFalse(stream1.hasPrevious());
    for (int i = 0; i < 4; i++) {
        stream1.mergeWithNext();
    }
    stream1.reset();
    assertEquals("this is a test stream", stream1.next());
    stream1 = null;
    stream2 = null;

    // self merge
    stream1 = new TokenStream("this");
    stream1.append("is", "a", "test", "stream");
    stream2 = new TokenStream("this");
    stream2.append("is", "a", "test", "stream");
    stream1.merge(stream2);
    assertEquals(10, stream1.getAllTokens().size());
    assertEquals(5, stream1.getTokenMap().size());
    assertEquals(5, stream2.getAllTokens().size());
    assertEquals(5, stream2.getTokenMap().size());
    stream1 = null;
    stream2 = null;
}
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#getTokenMap()}.
 */
@Test
public void testGetTokenMap() {
    // null string based stream
    TokenStream stream = new TokenStream((String) null);
    assertEquals(null, stream.getTokenMap());
    stream = null;

    // empty string
    stream = new TokenStream("");
    assertEquals(null, stream.getTokenMap());
    stream = null;

    // unique tokens
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    Map<String, Integer> smap = getSortedMap(stream.getTokenMap());
    assertEquals("[a, is, string, test, this]", smap.keySet().toString());
    assertEquals("[1, 1, 1, 1, 1]", smap.values().toString());
    stream = null;
    smap = null;

    // same token repeated
    stream = new TokenStream("hello");
    stream.append("hello", "hello", "hello", "hello");
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[hello]", smap.keySet().toString());
    assertEquals("[5]", smap.values().toString());
    stream = null;
    smap = null;

    // combination
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, not, or, to]", smap.keySet().toString());
    assertEquals("[2, 1, 1, 2]", smap.values().toString());
    stream = null;
    smap = null;

    // with remove
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.remove();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, not, or, to]", smap.keySet().toString());
    assertEquals("[2, 1, 1, 1]", smap.values().toString());
    stream.seekEnd();
    stream.previous(); // be
    stream.previous(); // to
    stream.remove();
    stream.previous();
    stream.remove();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, or]", smap.keySet().toString());
    assertEquals("[2, 1]", smap.values().toString());
    stream = null;
    smap = null;

    // with merge with previous
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.next(); // at be
    stream.mergeWithPrevious();
    stream.seekEnd();
    stream.previous();
    stream.mergeWithPrevious();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[not, or, to be]", smap.keySet().toString());
    assertEquals("[1, 1, 2]", smap.values().toString());
    stream = null;

    // with merge with next
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.mergeWithNext();
    stream.seekEnd();
    stream.previous();
    stream.previous();
    stream.mergeWithNext();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[not, or, to be]", smap.keySet().toString());
    assertEquals("[1, 1, 2]", smap.values().toString());
    stream = null;
}
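// The assertions above rely on a getSortedMap(...) helper that is not shown in this
// excerpt. A minimal sketch of what such a helper could look like (an assumption,
// not necessarily the project's actual implementation): copy the token map into a
// TreeMap so keys and values iterate in a deterministic, key-sorted order for the
// string comparisons used in the assertions.
private static Map<String, Integer> getSortedMap(Map<String, Integer> map) {
    if (map == null) {
        return null;
    }
    // fully qualified to avoid assuming an extra import in this test class
    return new java.util.TreeMap<String, Integer>(map);
}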
private static void tokenizeAndIndex(
        Properties properties, ConcurrentLinkedQueue<WikipediaDocument> queue)
        throws InterruptedException {
    /*
     * Pseudo-code:
     * 1. Create a thread executor
     * 2. For each runner, initialize the tokenizer as needed
     * 3. Keep calling and putting as required
     */
    ExecutorService threadPool =
            Executors.newFixedThreadPool(
                    Integer.valueOf(
                            properties.get(IndexerConstants.NUM_TOKENIZER_THREADS).toString()));
    CompletionService<IndexableDocument> pool =
            new ExecutorCompletionService<IndexableDocument>(threadPool);
    ThreadPoolExecutor tpe = (ThreadPoolExecutor) threadPool;

    tokenizerThread = new Thread(new TokenizerRunner(queue, pool, properties));
    tokenizerThread.start();
    new Thread(new ParserChecker(queue)).start();

    // give the tokenizer a head start before polling for completed tasks
    Thread.sleep(2000);

    long completed = 0, totalTasks = tpe.getTaskCount();
    long remaining = totalTasks - completed;
    IndexableDocument idoc = null;
    SharedDictionary docDict = new SharedDictionary(properties, INDEXFIELD.LINK);
    int currDocId;
    ThreadedIndexerRunner termRunner = new ThreadedIndexerRunner(properties);
    SingleIndexerRunner authIdxer =
            new SingleIndexerRunner(properties, INDEXFIELD.AUTHOR, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner catIdxer =
    //         new SingleIndexerRunner(properties, INDEXFIELD.CATEGORY, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner linkIdxer =
    //         new SingleIndexerRunner(properties, INDEXFIELD.LINK, INDEXFIELD.LINK, docDict, true);
    Map<String, Integer> tokenmap;

    try {
        while (remaining > 0) {
            idoc = pool.take().get();

            if (idoc != null) {
                currDocId = docDict.lookup(idoc.getDocumentIdentifier());
                TokenStream stream;

                try {
                    // route each field's token map to the indexer responsible for it
                    for (INDEXFIELD fld : INDEXFIELD.values()) {
                        stream = idoc.getStream(fld);

                        if (stream != null) {
                            tokenmap = stream.getTokenMap();

                            if (tokenmap != null) {
                                switch (fld) {
                                    case TERM:
                                        termRunner.addToIndex(tokenmap, currDocId);
                                        break;
                                    case AUTHOR:
                                        authIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                    case CATEGORY:
                                        // catIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                    case LINK:
                                        // linkIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                }
                            }
                        }
                    }
                } catch (IndexerException e) {
                    // log the failure and continue with the next document
                    e.printStackTrace();
                }
            }

            completed++;
            // the task count keeps growing while the tokenizer thread is still submitting work
            if (tokenizerThread.isAlive()) {
                totalTasks = tpe.getTaskCount();
            }
            remaining = totalTasks - completed;
        }
    } catch (ExecutionException e1) {
        // a tokenizer task failed; report it and stop consuming
        e1.printStackTrace();
    }

    try {
        termRunner.cleanup();
        authIdxer.cleanup();
        // catIdxer.cleanup();
        // linkIdxer.cleanup();
        docDict.writeToDisk();
        docDict.cleanUp();
    } catch (IndexerException e) {
        // cleanup failed; report and fall through to shutdown
        e.printStackTrace();
    }

    // wait until both indexer runners report that they have flushed their work
    while (!(termRunner.isFinished() && authIdxer.isFinished())) {
        Thread.sleep(1000);
    }

    threadPool.shutdown();
}
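/*
 * A minimal sketch of the producer side the loop above assumes (this is an
 * illustration, not the project's actual TokenizerRunner): drain the parser
 * queue and submit one tokenization task per WikipediaDocument to the
 * CompletionService, whose results tokenizeAndIndex() consumes via
 * pool.take().get(). The tokenize(...) helper below is a hypothetical
 * placeholder for the real per-document work.
 */
private static class TokenizerRunnerSketch implements Runnable {
    private final ConcurrentLinkedQueue<WikipediaDocument> queue;
    private final CompletionService<IndexableDocument> pool;
    private final Properties props;

    TokenizerRunnerSketch(
            ConcurrentLinkedQueue<WikipediaDocument> queue,
            CompletionService<IndexableDocument> pool,
            Properties props) {
        this.queue = queue;
        this.pool = pool;
        this.props = props;
    }

    public void run() {
        WikipediaDocument doc;
        // keep draining until the parser stops producing (termination signalling omitted)
        while ((doc = queue.poll()) != null) {
            final WikipediaDocument current = doc;
            pool.submit(new java.util.concurrent.Callable<IndexableDocument>() {
                public IndexableDocument call() throws Exception {
                    return tokenize(current, props);
                }
            });
        }
    }

    // hypothetical placeholder: build an IndexableDocument by tokenizing each
    // INDEXFIELD of the parsed page into a TokenStream
    private static IndexableDocument tokenize(WikipediaDocument doc, Properties props) {
        throw new UnsupportedOperationException("sketch only");
    }
}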