@Override
public void apply(TokenStream stream) throws TokenizerException {
    if (stream != null) {
        String token;
        Stemmer s;
        while (stream.hasNext()) {
            token = stream.next();
            if (token != null) {
                // Stem only lower-cased, purely alphabetic tokens;
                // numbers and punctuation pass through unchanged.
                token = token.toLowerCase();
                if (isLettersOnly(token)) {
                    s = new Stemmer();
                    for (char c : token.toCharArray()) {
                        s.add(c);
                    }
                    s.stem();
                    // Replace the consumed token with its stem.
                    stream.set(s.toString());
                }
            }
        }
        // Rewind so the next filter sees the stream from the start.
        stream.reset();
    }
}
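/*
 * isLettersOnly(..) is called above but not defined in this section. A
 * minimal sketch of the check it presumably performs -- true only when the
 * token is non-empty and consists solely of letters -- so the stemmer never
 * sees digits or punctuation. The real helper may differ.
 */
private static boolean isLettersOnly(String token) {
    if (token.isEmpty()) {
        return false;
    }
    for (int i = 0; i < token.length(); i++) {
        // Character.isLetter also accepts non-ASCII letters; restrict to
        // 'a'-'z' here if the index is ASCII-only.
        if (!Character.isLetter(token.charAt(i))) {
            return false;
        }
    }
    return true;
}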
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#reset()}.
 */
@Test
public void testReset() {
    // empty / null
    TokenStream stream = new TokenStream((String) null);
    stream.reset();
    assertNull(stream.next());
    stream = null;

    stream = new TokenStream("");
    stream.reset();
    assertNull(stream.next());
    stream = null;

    // positive run
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.next();
    stream.reset();
    assertEquals("this", stream.next());
    stream = null;
}
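/*
 * The tests in this class pin down the TokenStream contract by example. For
 * reference, the surface they exercise, collected into one hypothetical
 * interface: the method names come from the tests themselves, while the
 * one-line semantics are inferred from the assertions and are not the
 * authoritative contract.
 */
interface TokenStreamContract {
    void append(String... tokens);      // null/empty entries are skipped
    String next();                      // advance; null when past the end
    String previous();                  // step back; null before the start
    boolean hasNext();
    boolean hasPrevious();
    void reset();                       // rewind the cursor to the first token
    void seekEnd();                     // move the cursor past the last token
    void set(String... newTokens);      // replace the current token (see testSet for positioning)
    void remove();                      // delete the current token
    void merge(TokenStream other);      // append other's tokens to this stream
    boolean mergeWithNext();            // join the current token with its successor
    boolean mergeWithPrevious();        // join the current token with its predecessor
    java.util.List<String> getAllTokens();        // null for a null/empty stream
    java.util.Map<String, Integer> getTokenMap(); // token -> occurrence count
    int query(String token);            // occurrence count; 0 if absent
}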
@Override
public void apply(TokenStream stream) throws TokenizerException {
    if (stream != null) {
        String token;
        while (stream.hasNext()) {
            token = stream.next();
            // Step back so set()/remove() operate on the token just read.
            stream.previous();
            if (token != null) {
                token = replaceDate(token);
                if (token.isEmpty()) {
                    stream.remove();
                } else {
                    stream.set(token);
                    stream.next();
                }
            } else {
                // Skip null tokens; without this the cursor never advances.
                stream.next();
            }
        }
        // Rewind inside the null check to avoid a NullPointerException
        // when the stream itself is null.
        stream.reset();
    }
}
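/*
 * replaceDate(..) is called above but not shown. The contract implied by the
 * caller: return a normalized form of the token, or an empty string when the
 * token should be dropped from the stream. A minimal sketch under the
 * assumption that bare four-digit years are padded to a canonical YYYYMMDD
 * form; the project's actual date-normalization rules are likely richer.
 */
private String replaceDate(String token) {
    if (token.matches("\\d{4}")) {
        // Hypothetical normalization: "1947" -> "19470101".
        return token + "0101";
    }
    return token; // anything unrecognized passes through unchanged
}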
/**
 * Test method for {@link
 * edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#merge(edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream)}.
 */
@Test
public void testMerge() {
    // merge with null
    TokenStream stream1 = new TokenStream("this");
    stream1.append("is", "a", "test", "stream");
    stream1.merge(null);
    assertEquals(5, stream1.getAllTokens().size());

    TokenStream stream2 = new TokenStream((String) null);
    stream1.merge(stream2);
    assertEquals(5, stream1.getAllTokens().size());
    stream2.merge(stream1);
    assertEquals(5, stream2.getAllTokens().size());
    stream1 = null;
    stream2 = null;

    // proper merge
    stream1 = new TokenStream("this");
    stream1.append("is", "a");
    stream2 = new TokenStream("test");
    stream2.append("stream");
    stream1.merge(stream2);
    assertEquals(5, stream1.getAllTokens().size());
    assertEquals(5, stream1.getTokenMap().size());
    assertEquals(2, stream2.getAllTokens().size());
    assertEquals(2, stream2.getTokenMap().size());
    assertFalse(stream1.hasPrevious());
    for (int i = 0; i < 4; i++) {
        stream1.mergeWithNext();
    }
    stream1.reset();
    assertEquals("this is a test stream", stream1.next());
    stream1 = null;
    stream2 = null;

    // self merge
    stream1 = new TokenStream("this");
    stream1.append("is", "a", "test", "stream");
    stream2 = new TokenStream("this");
    stream2.append("is", "a", "test", "stream");
    stream1.merge(stream2);
    assertEquals(10, stream1.getAllTokens().size());
    assertEquals(5, stream1.getTokenMap().size());
    assertEquals(5, stream2.getAllTokens().size());
    assertEquals(5, stream2.getTokenMap().size());
    stream1 = null;
    stream2 = null;
}
/**
 * Test method for {@link
 * edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#set(java.lang.String[])}.
 */
@Test
public void testSet() {
    // set on null and empty streams
    TokenStream stream = new TokenStream((String) null);
    stream.set("invalid");
    assertNull(stream.getAllTokens());
    stream = null;

    stream = new TokenStream("");
    stream.set("invalid");
    assertNull(stream.getAllTokens());
    stream = null;

    // valid position, null or empty tokens
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.set((String) null);
    assertEquals("this", stream.next());
    stream.previous();
    stream.set("");
    assertEquals("this", stream.next());
    stream = null;

    // valid new token, invalid position
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    stream.set("valid");
    assertEquals("stream", stream.previous());
    stream = null;

    // correct set, single token
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.set("that");
    assertEquals(5, stream.getAllTokens().size());
    assertEquals("that", stream.next());
    stream = null;

    // correct set, multiple tokens at the end
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    stream.previous();
    stream.set("of", "the", "set", "method");
    assertEquals(8, stream.getAllTokens().size());
    assertEquals("method", stream.next());
    stream = null;

    // correct set, multiple tokens at the start
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.set("you", "think", "styx");
    assertEquals(7, stream.getAllTokens().size());
    assertEquals("styx", stream.next());
    stream = null;

    // correct set, multiple tokens in the middle
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    stream.previous();
    stream.previous();
    stream.set("really", "interesting");
    assertEquals(6, stream.getAllTokens().size());
    assertEquals("interesting", stream.next());
    assertEquals("stream", stream.next());
    assertFalse(stream.hasNext());
    stream = null;
}
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#mergeWithNext()}.
 */
@Test
public void testMergeWithNext() {
    // everything is null, empty
    TokenStream stream = new TokenStream((String) null);
    assertFalse(stream.mergeWithNext());
    stream = null;

    stream = new TokenStream("");
    assertFalse(stream.mergeWithNext());
    stream = null;

    // next is null
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    assertFalse(stream.mergeWithNext());

    // proper merge
    stream.reset();
    assertTrue(stream.mergeWithNext());
    assertEquals("this is", stream.next());
    assertEquals(4, stream.getAllTokens().size());
    stream = null;

    // full merge - reverse
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    stream.previous();
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("test stream", stream.next());
    stream.previous();
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("a test stream", stream.next());
    stream.previous();
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("is a test stream", stream.next());
    stream.previous();
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("this is a test stream", stream.next());
    stream.previous();
    assertFalse(stream.mergeWithNext());
    stream = null;

    // full merge - forward
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    assertTrue(stream.mergeWithNext());
    assertEquals("this is", stream.next());
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("this is a", stream.next());
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("this is a test", stream.next());
    stream.previous();
    assertTrue(stream.mergeWithNext());
    assertEquals("this is a test stream", stream.next());
    assertFalse(stream.mergeWithNext());
    stream = null;
}
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#getTokenMap()}.
 */
@Test
public void testGetTokenMap() {
    // null string based stream
    TokenStream stream = new TokenStream((String) null);
    assertNull(stream.getTokenMap());
    stream = null;

    // empty string
    stream = new TokenStream("");
    assertNull(stream.getTokenMap());
    stream = null;

    // unique tokens
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    Map<String, Integer> smap = getSortedMap(stream.getTokenMap());
    assertEquals("[a, is, string, test, this]", smap.keySet().toString());
    assertEquals("[1, 1, 1, 1, 1]", smap.values().toString());
    stream = null;
    smap = null;

    // same token repeated
    stream = new TokenStream("hello");
    stream.append("hello", "hello", "hello", "hello");
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[hello]", smap.keySet().toString());
    assertEquals("[5]", smap.values().toString());
    stream = null;
    smap = null;

    // combination
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, not, or, to]", smap.keySet().toString());
    assertEquals("[2, 1, 1, 2]", smap.values().toString());
    stream = null;
    smap = null;

    // with remove
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.remove();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, not, or, to]", smap.keySet().toString());
    assertEquals("[2, 1, 1, 1]", smap.values().toString());
    stream.seekEnd();
    stream.previous(); // be
    stream.previous(); // to
    stream.remove();
    stream.previous();
    stream.remove();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[be, or]", smap.keySet().toString());
    assertEquals("[2, 1]", smap.values().toString());
    stream = null;
    smap = null;

    // with merge with previous
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.next(); // at be
    stream.mergeWithPrevious();
    stream.seekEnd();
    stream.previous();
    stream.mergeWithPrevious();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[not, or, to be]", smap.keySet().toString());
    assertEquals("[1, 1, 2]", smap.values().toString());
    stream = null;

    // with merge with next
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    stream.mergeWithNext();
    stream.seekEnd();
    stream.previous();
    stream.previous();
    stream.mergeWithNext();
    smap = getSortedMap(stream.getTokenMap());
    assertEquals("[not, or, to be]", smap.keySet().toString());
    assertEquals("[1, 1, 2]", smap.values().toString());
    stream = null;
}
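/*
 * getSortedMap(..) is a test helper not shown in this section. The
 * assertions above compare the string forms of keySet() and values(), which
 * requires a deterministic iteration order; copying into a TreeMap
 * (alphabetical key order) is the simplest way to get one. A sketch,
 * assuming that is all the helper does:
 */
private static java.util.Map<String, Integer> getSortedMap(java.util.Map<String, Integer> map) {
    return (map == null) ? null : new java.util.TreeMap<String, Integer>(map);
}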
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#remove()}.
 */
@Test
public void testRemove() {
    // remove on null
    TokenStream stream = new TokenStream((String) null);
    stream.remove();
    assertNull(stream.getAllTokens());
    stream = null;

    // remove on empty
    stream = new TokenStream("");
    stream.remove();
    assertNull(stream.getAllTokens());
    stream = null;

    // remove till empty
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    int currcnt = 5;
    while (stream.hasNext()) {
        assertEquals(currcnt--, stream.getAllTokens().size());
        stream.remove();
    }
    stream = null;

    // remove from invalid position
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    stream.remove();
    assertEquals(5, stream.getAllTokens().size());
    stream = null;
}
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#previous()}.
 */
@Test
public void testPrevious() {
    // null
    TokenStream stream = new TokenStream((String) null);
    assertNull(stream.previous());
    stream = null;

    // empty str
    stream = new TokenStream("");
    assertNull(stream.previous());
    stream = null;

    // reverse iteration
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    assertEquals("stream", stream.previous());
    assertEquals("test", stream.previous());
    assertEquals("a", stream.previous());
    assertEquals("is", stream.previous());
    assertEquals("this", stream.previous());
    assertNull(stream.previous());
    stream = null;

    // fwd and reverse
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.seekEnd();
    assertEquals("stream", stream.previous());
    stream.next();
    assertEquals("stream", stream.previous());
    assertEquals("test", stream.previous());
    assertEquals("a", stream.previous());
    stream.reset();
    assertEquals("this", stream.next());
    stream = null;

    // with remove
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.remove();
    stream.next();
    assertEquals("is", stream.previous());
    stream.next();
    stream.remove();
    assertEquals("is", stream.previous());
    stream = null;

    // with merge with previous
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.next();
    stream.mergeWithPrevious();
    assertNull(stream.previous());
    stream.next();
    assertEquals("this is", stream.previous());
    stream = null;

    // with merge with next
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.mergeWithNext();
    assertNull(stream.previous());
    stream.next();
    assertEquals("this is", stream.previous());
    stream = null;
}
/**
 * Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#hasPrevious()}.
 */
@Test
public void testHasPrevious() {
    // null
    TokenStream stream = new TokenStream((String) null);
    assertFalse(stream.hasPrevious());
    stream = null;

    // empty
    stream = new TokenStream("");
    assertFalse(stream.hasPrevious());
    stream = null;

    // some text and iteration
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    assertFalse(stream.hasPrevious()); // start of stream
    stream.seekEnd();
    assertTrue(stream.hasPrevious());
    stream.previous(); // after this
    assertTrue(stream.hasPrevious());
    stream.previous(); // after is
    assertTrue(stream.hasPrevious());
    stream.previous(); // after a
    assertTrue(stream.hasPrevious());
    stream.previous(); // after test
    assertTrue(stream.hasPrevious());
    stream.previous(); // after stream
    assertFalse(stream.hasPrevious());
    stream = null;

    // with seek
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.reset();
    assertFalse(stream.hasPrevious());
    stream = null;

    // forward and reverse
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.next();
    assertTrue(stream.hasPrevious());
    stream.previous();
    assertFalse(stream.hasPrevious());
    stream = null;

    // with remove
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.remove();
    assertFalse(stream.hasPrevious());
    stream = null;

    // with merge with previous
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.next();
    stream.mergeWithPrevious();
    assertFalse(stream.hasPrevious());
    stream = null;

    // with merge with next
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "stream");
    stream.mergeWithNext();
    assertFalse(stream.hasPrevious());
    stream = null;
}
/**
 * Test method for {@link
 * edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#append(java.lang.String[])}.
 */
@Test
public void testAppend() {
    // appending null
    TokenStream stream = new TokenStream("test");
    stream.append((String[]) null);
    assertArrayEquals(new Object[] {"test"}, stream.getAllTokens().toArray());
    stream = null;

    // appending empty string
    stream = new TokenStream("test");
    stream.append("");
    assertArrayEquals(new Object[] {"test"}, stream.getAllTokens().toArray());
    stream = null;

    // one token
    stream = new TokenStream("test");
    stream.append("string");
    assertArrayEquals(new Object[] {"test", "string"}, stream.getAllTokens().toArray());
    stream = null;

    // multiple tokens
    stream = new TokenStream("test");
    stream.append("string", "with", "multiple", "tokens");
    assertArrayEquals(
            new Object[] {"test", "string", "with", "multiple", "tokens"},
            stream.getAllTokens().toArray());
    stream = null;

    // intermediate nulls and empties are skipped
    stream = new TokenStream("test");
    stream.append("string", "with", null, "and", "", "tokens");
    assertArrayEquals(
            new Object[] {"test", "string", "with", "and", "tokens"},
            stream.getAllTokens().toArray());
    stream = null;
}
/**
 * Test method for {@link
 * edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#query(java.lang.String)}.
 */
@Test
public void testQuery() {
    // null string based stream
    TokenStream stream = new TokenStream((String) null);
    assertEquals(0, stream.query("test"));
    stream = null;

    // empty string
    stream = new TokenStream("");
    assertEquals(0, stream.query("test"));
    stream = null;

    // unique tokens
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    assertEquals(1, stream.query("test"));
    assertEquals(0, stream.query("hello"));
    stream = null;

    // same token repeated
    stream = new TokenStream("hello");
    stream.append("hello", "hello", "hello", "hello");
    assertEquals(0, stream.query("test"));
    assertEquals(5, stream.query("hello"));
    stream = null;

    // combination
    stream = new TokenStream("to");
    stream.append("be", "or", "not", "to", "be");
    assertEquals(2, stream.query("be"));
    assertEquals(1, stream.query("not"));
    assertEquals(0, stream.query("test"));
    stream = null;

    // with remove
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    stream.remove(); // "this" removed
    assertEquals(0, stream.query("this"));
    stream = null;

    // with merge with previous
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    stream.next();
    stream.mergeWithPrevious();
    assertEquals(0, stream.query("this"));
    assertEquals(1, stream.query("this is"));
    stream = null;

    // with merge with next
    stream = new TokenStream("this");
    stream.append("is", "a", "test", "string");
    stream.mergeWithNext();
    assertEquals(0, stream.query("this"));
    assertEquals(1, stream.query("this is"));
    stream = null;
}
private static void tokenizeAndIndex(
        Properties properties, ConcurrentLinkedQueue<WikipediaDocument> queue)
        throws InterruptedException {
    /*
     * Outline:
     * 1. Create a thread executor
     * 2. For each runner, initialize the tokenizer as needed
     * 3. Keep taking tokenized documents and handing them to the indexers
     */
    ExecutorService threadPool =
            Executors.newFixedThreadPool(
                    Integer.parseInt(properties.get(IndexerConstants.NUM_TOKENIZER_THREADS).toString()));
    CompletionService<IndexableDocument> pool =
            new ExecutorCompletionService<IndexableDocument>(threadPool);
    ThreadPoolExecutor tpe = (ThreadPoolExecutor) threadPool;

    tokenizerThread = new Thread(new TokenizerRunner(queue, pool, properties));
    tokenizerThread.start();
    new Thread(new ParserChecker(queue)).start();

    // Give the tokenizer a head start so the task count is meaningful.
    Thread.sleep(2000);

    long completed = 0, totalTasks = tpe.getTaskCount();
    long remaining = totalTasks - completed;
    IndexableDocument idoc = null;
    SharedDictionary docDict = new SharedDictionary(properties, INDEXFIELD.LINK);
    int currDocId;
    ThreadedIndexerRunner termRunner = new ThreadedIndexerRunner(properties);
    SingleIndexerRunner authIdxer =
            new SingleIndexerRunner(properties, INDEXFIELD.AUTHOR, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner catIdxer =
    //         new SingleIndexerRunner(properties, INDEXFIELD.CATEGORY, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner linkIdxer =
    //         new SingleIndexerRunner(properties, INDEXFIELD.LINK, INDEXFIELD.LINK, docDict, true);
    Map<String, Integer> tokenmap;

    try {
        while (remaining > 0) {
            idoc = pool.take().get();
            if (idoc != null) {
                currDocId = docDict.lookup(idoc.getDocumentIdentifier());
                TokenStream stream;
                try {
                    // Route each field's token map to the matching indexer.
                    for (INDEXFIELD fld : INDEXFIELD.values()) {
                        stream = idoc.getStream(fld);
                        if (stream != null) {
                            tokenmap = stream.getTokenMap();
                            if (tokenmap != null) {
                                switch (fld) {
                                    case TERM:
                                        termRunner.addToIndex(tokenmap, currDocId);
                                        break;
                                    case AUTHOR:
                                        authIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                    case CATEGORY:
                                        // catIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                    case LINK:
                                        // linkIdxer.processTokenMap(currDocId, tokenmap);
                                        break;
                                }
                            }
                        }
                    }
                } catch (IndexerException e) {
                    // Log and move on to the next document.
                    e.printStackTrace();
                }
            }
            completed++;
            // The tokenizer may still be submitting tasks; refresh the total.
            if (tokenizerThread.isAlive()) {
                totalTasks = tpe.getTaskCount();
            }
            remaining = totalTasks - completed;
        }
    } catch (ExecutionException e1) {
        e1.printStackTrace();
    }

    try {
        termRunner.cleanup();
        authIdxer.cleanup();
        // catIdxer.cleanup();
        // linkIdxer.cleanup();
        docDict.writeToDisk();
        docDict.cleanUp();
    } catch (IndexerException e) {
        e.printStackTrace();
    }

    // Busy-wait until both indexer runners have finished flushing.
    while (!(termRunner.isFinished() && authIdxer.isFinished())) {
        Thread.sleep(1000);
    }

    threadPool.shutdown();
}
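/*
 * TokenizerRunner (and ParserChecker) are referenced above but not shown.
 * From the caller's behavior -- tpe.getTaskCount() keeps growing while
 * tokenizerThread is alive, and pool.take().get() yields IndexableDocument
 * instances -- the runner evidently drains the parser queue and submits one
 * tokenization task per document. A minimal sketch under those assumptions;
 * DocumentTokenizer is a hypothetical Callable standing in for the real
 * filter chain, and the termination condition is simplified to "queue
 * drained".
 */
class TokenizerRunner implements Runnable {
    private final ConcurrentLinkedQueue<WikipediaDocument> queue;
    private final CompletionService<IndexableDocument> pool;
    private final Properties properties;

    TokenizerRunner(ConcurrentLinkedQueue<WikipediaDocument> queue,
            CompletionService<IndexableDocument> pool, Properties properties) {
        this.queue = queue;
        this.pool = pool;
        this.properties = properties;
    }

    @Override
    public void run() {
        WikipediaDocument doc;
        while ((doc = queue.poll()) != null) {
            // One task per parsed document; results are consumed by
            // tokenizeAndIndex via pool.take().
            pool.submit(new DocumentTokenizer(doc, properties)); // hypothetical Callable
        }
    }
}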