Esempio n. 1
0
 /**
  * This adds a document to the direct and document indexes, as well as it's terms to the lexicon.
  * Handled internally by the methods indexFieldDocument and indexNoFieldDocument.
  *
  * @param docProperties Map<String,String> properties of the document
  * @param _termsInDocument DocumentPostingList the terms in the document.
  */
 protected void indexDocument(
     Map<String, String> docProperties, DocumentPostingList _termsInDocument) throws Exception {
   /* add words to lexicontree */
   lexiconBuilder.addDocumentTerms(_termsInDocument);
   /* add doc postings to the direct index */
   BitIndexPointer dirIndexPost =
       directIndexBuilder.writePostings(_termsInDocument.getPostings2());
   /* add doc to documentindex */
   DocumentIndexEntry die = _termsInDocument.getDocumentStatistics();
   die.setBitIndexPointer(dirIndexPost);
   docIndexBuilder.addEntryToBuffer(die);
   /** add doc metadata to index */
   metaBuilder.writeDocumentEntry(docProperties);
 }
Esempio n. 2
0
  // TODO if this class extends BasicIndexer, then this method could be inherited
  public void createDirectIndex(Collection[] collections) {
    logger.info(
        "BlockIndexer creating direct index"
            + (Boolean.parseBoolean(
                    ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
                ? " delimited-block indexing enabled"
                : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    lexiconBuilder =
        FieldScore.FIELDS_COUNT > 0
            ? new LexiconBuilder(
                currentIndex,
                "lexicon",
                new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
                FieldLexiconEntry.class.getName())
            : new LexiconBuilder(
                currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
    // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
    try {
      directIndexBuilder =
          FieldScore.FIELDS_COUNT > 0
              ? new BlockFieldDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION)
              : new BlockDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry =
        (FieldScore.FIELDS_COUNT > 0)
            ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
            : new BasicDocumentIndexEntry();

    // int LexiconCount = 0;
    int numberOfDocuments = 0;
    // int numberOfTokens = 0;
    // long startBunchOfDocuments = System.currentTimeMillis();
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
      Collection collection = collections[collectionNo];
      long startCollection = System.currentTimeMillis();
      boolean notLastDoc = false;
      // while(notLastDoc = collection.hasNext()) {
      while ((notLastDoc = collection.nextDocument())) {
        // get the next document from the collection

        // String docid = collection.getDocid();
        // Document doc = collection.next();
        Document doc = collection.getDocument();

        if (doc == null) continue;

        numberOfDocuments++;
        // setup for parsing
        createDocumentPostings();
        String term;
        numOfTokensInDocument = 0;
        numOfTokensInBlock = 0;
        blockId = 0;
        // get each term in the document
        while (!doc.endOfDocument()) {
          if ((term = doc.getNextTerm()) != null && !term.equals("")) {
            termFields = doc.getFields();
            // pass term into TermPipeline (stop, stem etc)
            pipeline_first.processTerm(term);
            // the term pipeline will eventually add the term to this
            // object.
          }
          if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break;
        }
        // if we didn't index all tokens from document,
        // we need to get to the end of the document.
        while (!doc.endOfDocument()) doc.getNextTerm();
        // we now have all terms in the DocumentTree

        pipeline_first.reset();
        // process DocumentTree (tree of terms)
        try {
          if (termsInDocument.getDocumentLength() == 0) {
            // this document is empty, add the
            // minimum to the document index
            indexEmpty(doc.getAllProperties());
          } else {
              /* index this docuent */
            // numberOfTokens += numOfTokensInDocument;
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        } catch (Exception ioe) {
          logger.error("Failed to index " + doc.getProperty("docno"), ioe);
        }
        if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
          stopIndexing = true;
          break;
        }

        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
          stopIndexing = true;
          break;
        }
      }
      long endCollection = System.currentTimeMillis();
      long secs = ((endCollection - startCollection) / 1000);
      logger.info(
          "Collection #"
              + collectionNo
              + " took "
              + secs
              + "seconds to index "
              + "("
              + numberOfDocuments
              + " documents)\n");
      if (secs > 3600)
        logger.info(
            "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");

      if (!notLastDoc) {
        try {
          collection.close();
        } catch (IOException e) {
          logger.warn("Couldnt close collection", e);
        }
      }
    }

    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    currentIndex.addIndexStructure(
        "direct",
        "org.terrier.structures.BlockDirectIndex",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
    currentIndex.setIndexProperty(
        "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "document-factory",
          FieldDocumentIndexEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    } else {
      currentIndex.addIndexStructure(
          "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
      metaBuilder.close();
    } catch (IOException ioe) {
      logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "lexicon-valuefactory",
          FieldLexiconEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes.*/
    TermCodes.reset();
    System.gc();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      logger.error("Could not flush index properties: ", ioe);
    }
  }