Example #1
    private static Map<INDEXFIELD, Tokenizer> initMap(Properties props) {
      HashMap<INDEXFIELD, Tokenizer> map =
          new HashMap<INDEXFIELD, Tokenizer>(INDEXFIELD.values().length);
      TokenizerFactory fact = TokenizerFactory.getInstance(props);
      for (INDEXFIELD fld : INDEXFIELD.values()) {
        map.put(fld, fact.getTokenizer(fld));
      }

      return map;
    }
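
Since the keys in Example #1 come from an enum, java.util.EnumMap is an idiomatic alternative to a capacity-hinted HashMap: it is backed by an array with one slot per enum constant, so no size hint is needed. A minimal, self-contained sketch of the same initialization pattern (the Field enum and the lambda "tokenizers" below are stand-ins, not the project's INDEXFIELD or Tokenizer types):

    import java.util.EnumMap;
    import java.util.Map;
    import java.util.function.UnaryOperator;

    public class EnumMapSketch {
      // Stand-in for the project's INDEXFIELD enum.
      enum Field { TERM, AUTHOR, CATEGORY, LINK }

      public static void main(String[] args) {
        // EnumMap allocates exactly Field.values().length slots up front.
        Map<Field, UnaryOperator<String>> map = new EnumMap<>(Field.class);
        for (Field fld : Field.values()) {
          map.put(fld, String::toLowerCase); // stand-in for fact.getTokenizer(fld)
        }
        System.out.println(map.get(Field.TERM).apply("Hello")); // prints "hello"
      }
    }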
Example #2
  private static void tokenizeAndIndex(
      Properties properties, ConcurrentLinkedQueue<WikipediaDocument> queue)
      throws InterruptedException {
    /*
     * Pseudo-code:
     *   1. Create a thread executor
     *   2. For each runner, initialize the tokenizer as needed
     *   3. Keep calling and putting as required
     */
    ExecutorService threadPool =
        Executors.newFixedThreadPool(
            Integer.valueOf(properties.get(IndexerConstants.NUM_TOKENIZER_THREADS).toString()));
    CompletionService<IndexableDocument> pool =
        new ExecutorCompletionService<IndexableDocument>(threadPool);
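    // Downcast to ThreadPoolExecutor to expose getTaskCount() for progress tracking.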
    ThreadPoolExecutor tpe = (ThreadPoolExecutor) threadPool;

    tokenizerThread = new Thread(new TokenizerRunner(queue, pool, properties));
    tokenizerThread.start();
    new Thread(new ParserChecker(queue)).start();

    // give the tokenizer a head start
    Thread.sleep(2000);

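    // getTaskCount() grows while the tokenizer thread is still submitting tasks,
    // so totalTasks is refreshed inside the loop below.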
    long completed = 0, totalTasks = tpe.getTaskCount();
    long remaining = totalTasks - completed;

    IndexableDocument idoc = null;
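    // Shared dictionary mapping document identifiers to the integer ids used by the indexers.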
    SharedDictionary docDict = new SharedDictionary(properties, INDEXFIELD.LINK);
    int currDocId;
    ThreadedIndexerRunner termRunner = new ThreadedIndexerRunner(properties);
    SingleIndexerRunner authIdxer =
        new SingleIndexerRunner(properties, INDEXFIELD.AUTHOR, INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner catIdxer = new SingleIndexerRunner(properties, INDEXFIELD.CATEGORY,
    // INDEXFIELD.LINK, docDict, false);
    // SingleIndexerRunner linkIdxer = new SingleIndexerRunner(properties, INDEXFIELD.LINK,
    // INDEXFIELD.LINK, docDict, true);
    Map<String, Integer> tokenmap;

    try {
      while (remaining > 0) {
        idoc = pool.take().get();
        if (idoc != null) {
          currDocId = docDict.lookup(idoc.getDocumentIdentifier());
          TokenStream stream;
          try {
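            // Route each field's token map to the indexer responsible for that field.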
            for (INDEXFIELD fld : INDEXFIELD.values()) {
              stream = idoc.getStream(fld);

              if (stream != null) {
                tokenmap = stream.getTokenMap();

                if (tokenmap != null) {
                  switch (fld) {
                    case TERM:
                      termRunner.addToIndex(tokenmap, currDocId);
                      break;
                    case AUTHOR:
                      authIdxer.processTokenMap(currDocId, tokenmap);
                      break;
                    case CATEGORY:
                      // catIdxer.processTokenMap(currDocId,
                      // tokenmap);
                      break;
                    case LINK:
                      // linkIdxer.processTokenMap(
                      // currDocId, tokenmap);
                      break;
                  }
                }
              }
            }
          } catch (IndexerException e) {
            // A failed document should not halt indexing; report it and move on.
            e.printStackTrace();
          }
        }

        completed++;

        if (tokenizerThread.isAlive()) {
          totalTasks = tpe.getTaskCount();
        }

        remaining = totalTasks - completed;
      }
    } catch (ExecutionException e1) {
      // A tokenizer task threw; report it and fall through to cleanup.
      e1.printStackTrace();
    }

    try {
      termRunner.cleanup();
      authIdxer.cleanup();
      // catIdxer.cleanup();
      // linkIdxer.cleanup();
      docDict.writeToDisk();
      docDict.cleanUp();
    } catch (IndexerException e) {
      // Report cleanup failures but continue shutting down.
      e.printStackTrace();
    }

    // Wait until both indexer runners have drained their queues.
    while (!termRunner.isFinished() || !authIdxer.isFinished()) {
      Thread.sleep(1000);
    }

    threadPool.shutdown();
  }
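
Example #2 relies on the producer/consumer contract of ExecutorCompletionService: a producer thread submits tokenization tasks while the consumer blocks on take().get() and receives results in completion order rather than submission order. A minimal, self-contained sketch of that pattern (the task bodies and names are stand-ins, not the project's classes):

    import java.util.concurrent.CompletionService;
    import java.util.concurrent.ExecutorCompletionService;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class CompletionServiceSketch {
      public static void main(String[] args) throws Exception {
        ExecutorService threadPool = Executors.newFixedThreadPool(4);
        CompletionService<String> pool = new ExecutorCompletionService<>(threadPool);

        // Producer: submit a batch of tasks (stand-ins for TokenizerRunner's work).
        int totalTasks = 8;
        for (int i = 0; i < totalTasks; i++) {
          final int id = i;
          pool.submit(() -> "tokenized-doc-" + id);
        }

        // Consumer: take() blocks until some task completes, mirroring the
        // pool.take().get() loop in Example #2.
        for (int completed = 0; completed < totalTasks; completed++) {
          System.out.println(pool.take().get());
        }

        threadPool.shutdown();
      }
    }

Because results arrive as they finish, the consumer never waits on a slow task while faster ones sit idle; this is the property Example #2 exploits to overlap tokenization with index writing.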