/**
   * Test method for {@link
   * edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#merge(edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream)}.
   */
  @Test
  public void testMerge() {
    // Merging a null reference must be a no-op.
    TokenStream base = new TokenStream("this");
    base.append("is", "a", "test", "stream");
    base.merge(null);
    assertEquals(5, base.getAllTokens().size());

    // Merging an empty (null-seeded) stream must also leave the target unchanged.
    TokenStream other = new TokenStream((String) null);
    base.merge(other);
    assertEquals(5, base.getAllTokens().size());

    // Merging a populated stream INTO an empty one adopts all five tokens.
    other.merge(base);
    assertEquals(5, other.getAllTokens().size());
    base = null;
    other = null;

    // A regular merge appends the argument's tokens to the target...
    base = new TokenStream("this");
    base.append("is", "a");
    other = new TokenStream("test");
    other.append("stream");

    base.merge(other);
    assertEquals(5, base.getAllTokens().size());
    assertEquals(5, base.getTokenMap().size());
    // ...while leaving the merged-from stream untouched.
    assertEquals(2, other.getAllTokens().size());
    assertEquals(2, other.getTokenMap().size());
    // After the merge the iterator should sit at the start of the stream.
    assertFalse(base.hasPrevious());

    // Collapse all five tokens into a single "this is a test stream" token.
    int fused = 0;
    while (fused < 4) {
      base.mergeWithNext();
      fused++;
    }

    base.reset();
    assertEquals("this is a test stream", base.next());
    base = null;
    other = null;

    // Merging two streams with identical content: the token list keeps the
    // duplicates (10 entries) but the token map collapses them (5 keys).
    base = new TokenStream("this");
    base.append("is", "a", "test", "stream");
    other = new TokenStream("this");
    other.append("is", "a", "test", "stream");
    base.merge(other);
    assertEquals(10, base.getAllTokens().size());
    assertEquals(5, base.getTokenMap().size());
    assertEquals(5, other.getAllTokens().size());
    assertEquals(5, other.getTokenMap().size());
    base = null;
    other = null;
  }
  /** Test method for {@link edu.buffalo.cse.ir.wikiindexer.tokenizer.TokenStream#getTokenMap()}. */
  @Test
  public void testGetTokenMap() {

    // A stream seeded with a null string exposes no token map.
    TokenStream ts = new TokenStream((String) null);
    assertEquals(null, ts.getTokenMap());
    ts = null;

    // Likewise for a stream seeded with the empty string.
    ts = new TokenStream("");
    assertEquals(null, ts.getTokenMap());
    ts = null;

    // All-unique tokens: every key maps to a count of 1.
    ts = new TokenStream("this");
    ts.append("is", "a", "test", "string");
    Map<String, Integer> sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[a, is, string, test, this]", sorted.keySet().toString());
    assertEquals("[1, 1, 1, 1, 1]", sorted.values().toString());
    ts = null;
    sorted = null;

    // One token repeated five times collapses into a single entry with count 5.
    ts = new TokenStream("hello");
    ts.append("hello", "hello", "hello", "hello");
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[hello]", sorted.keySet().toString());
    assertEquals("[5]", sorted.values().toString());
    ts = null;
    sorted = null;

    // A mix of unique and repeated tokens.
    ts = new TokenStream("to");
    ts.append("be", "or", "not", "to", "be");
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[be, not, or, to]", sorted.keySet().toString());
    assertEquals("[2, 1, 1, 2]", sorted.values().toString());
    ts = null;
    sorted = null;

    // remove() decrements the current token's count ("to": 2 -> 1) and
    // drops a key entirely once its count reaches zero.
    ts = new TokenStream("to");
    ts.append("be", "or", "not", "to", "be");
    ts.remove();
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[be, not, or, to]", sorted.keySet().toString());
    assertEquals("[2, 1, 1, 1]", sorted.values().toString());
    ts.seekEnd();
    ts.previous(); // be
    ts.previous(); // to
    ts.remove();
    ts.previous();
    ts.remove();
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[be, or]", sorted.keySet().toString());
    assertEquals("[2, 1]", sorted.values().toString());
    ts = null;
    sorted = null;

    // mergeWithPrevious() fuses adjacent tokens ("to be") into one map entry.
    ts = new TokenStream("to");
    ts.append("be", "or", "not", "to", "be");
    ts.next(); // at be
    ts.mergeWithPrevious();
    ts.seekEnd();
    ts.previous();
    ts.mergeWithPrevious();
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[not, or, to be]", sorted.keySet().toString());
    assertEquals("[1, 1, 2]", sorted.values().toString());
    ts = null;

    // mergeWithNext() must yield the same fused entries as the previous case.
    ts = new TokenStream("to");
    ts.append("be", "or", "not", "to", "be");
    ts.mergeWithNext();
    ts.seekEnd();
    ts.previous();
    ts.previous();
    ts.mergeWithNext();
    sorted = getSortedMap(ts.getTokenMap());
    assertEquals("[not, or, to be]", sorted.keySet().toString());
    assertEquals("[1, 1, 2]", sorted.values().toString());
    ts = null;
  }
  /**
   * Drains parsed documents from {@code queue}, tokenizes them on a fixed-size
   * thread pool, and feeds the resulting token maps to the per-field indexer
   * runners. Blocks until tokenization and indexing have both completed, then
   * shuts the pool down.
   *
   * @param properties configuration; must supply {@code NUM_TOKENIZER_THREADS}
   * @param queue shared producer/consumer queue of parsed Wikipedia documents
   * @throws InterruptedException if this thread is interrupted while waiting
   */
  private static void tokenizeAndIndex(
      Properties properties, ConcurrentLinkedQueue<WikipediaDocument> queue)
      throws InterruptedException {
    /*
     * Pseudo-code:
     * 		1. Create a thread executor
     * 		2. For each runner, initialize the tokenizer as needed
     * 		3. Keep calling and putting as required
     */
    ExecutorService threadPool =
        Executors.newFixedThreadPool(
            // parseInt avoids the Integer.valueOf(...) boxing detour
            Integer.parseInt(properties.get(IndexerConstants.NUM_TOKENIZER_THREADS).toString()));
    CompletionService<IndexableDocument> pool =
        new ExecutorCompletionService<IndexableDocument>(threadPool);
    // Cast is needed to poll getTaskCount() while the tokenizer keeps submitting.
    ThreadPoolExecutor tpe = (ThreadPoolExecutor) threadPool;

    tokenizerThread = new Thread(new TokenizerRunner(queue, pool, properties));
    tokenizerThread.start();
    new Thread(new ParserChecker(queue)).start();

    // Give the tokenizer a head start so getTaskCount() is non-zero below.
    Thread.sleep(2000);

    long completed = 0, totalTasks = tpe.getTaskCount();
    long remaining = totalTasks - completed;

    IndexableDocument idoc = null;
    SharedDictionary docDict = new SharedDictionary(properties, INDEXFIELD.LINK);
    int currDocId;
    ThreadedIndexerRunner termRunner = new ThreadedIndexerRunner(properties);
    SingleIndexerRunner authIdxer =
        new SingleIndexerRunner(properties, INDEXFIELD.AUTHOR, INDEXFIELD.LINK, docDict, false);
    // CATEGORY and LINK indexers are intentionally disabled for now; the
    // switch below keeps their cases as placeholders.
    Map<String, Integer> tokenmap;

    try {
      while (remaining > 0) {
        // Blocks until the next tokenized document is available.
        idoc = pool.take().get();
        if (idoc != null) {
          currDocId = docDict.lookup(idoc.getDocumentIdentifier());
          TokenStream stream;
          try {
            for (INDEXFIELD fld : INDEXFIELD.values()) {
              stream = idoc.getStream(fld);

              if (stream != null) {
                tokenmap = stream.getTokenMap();

                if (tokenmap != null) {
                  switch (fld) {
                    case TERM:
                      termRunner.addToIndex(tokenmap, currDocId);
                      break;
                    case AUTHOR:
                      authIdxer.processTokenMap(currDocId, tokenmap);
                      break;
                    case CATEGORY:
                      // category indexing disabled (see note above)
                      break;
                    case LINK:
                      // link indexing disabled (see note above)
                      break;
                  }
                }
              }
            }
          } catch (IndexerException e) {
            // Indexing failure for one document: log and continue with the rest.
            e.printStackTrace();
          }
        }

        completed++;

        // While the tokenizer is still submitting, the task count keeps growing.
        if (tokenizerThread.isAlive()) totalTasks = tpe.getTaskCount();

        remaining = totalTasks - completed;
      }
    } catch (ExecutionException e1) {
      // A tokenizer task failed; surface the cause but still run cleanup below.
      e1.printStackTrace();
    }

    try {
      termRunner.cleanup();
      authIdxer.cleanup();
      docDict.writeToDisk();
      docDict.cleanUp();
    } catch (IndexerException e) {
      e.printStackTrace();
    } finally {
      // Wait until BOTH runners report completion. The original condition was
      // inverted (looped while they WERE finished), which either exited before
      // indexing was done or spun forever once it was.
      while (!(termRunner.isFinished() && authIdxer.isFinished())) {
        Thread.sleep(1000);
      }

      // Always release the pool, even if cleanup threw.
      threadPool.shutdown();
    }
  }