/**
 * Adds one document to the lexicon, the direct index, the document index and the meta index.
 * Called by createDirectIndex for every document whose posting list is not empty.
 *
 * @param docProperties properties of the document ({@code Map<String, String>}), written to the
 *     meta index
 * @param _termsInDocument the posting list holding the terms of the document
 * @throws Exception propagated from any of the underlying index builders
 */
protected void indexDocument(
    Map<String, String> docProperties, DocumentPostingList _termsInDocument) throws Exception {
  // 1. register the document's terms with the lexicon
  lexiconBuilder.addDocumentTerms(_termsInDocument);

  // 2. write the document's postings to the direct index and remember where they were written
  final BitIndexPointer directIndexPointer =
      directIndexBuilder.writePostings(_termsInDocument.getPostings2());

  // 3. attach that pointer to the document's statistics and buffer the document index entry
  final DocumentIndexEntry documentEntry = _termsInDocument.getDocumentStatistics();
  documentEntry.setBitIndexPointer(directIndexPointer);
  docIndexBuilder.addEntryToBuffer(documentEntry);

  // 4. record the document's metadata
  metaBuilder.writeDocumentEntry(docProperties);
}
// TODO if this class extends BasicIndexer, then this method could be inherited public void createDirectIndex(Collection[] collections) { logger.info( "BlockIndexer creating direct index" + (Boolean.parseBoolean( ApplicationSetup.getProperty("block.delimiters.enabled", "false")) ? " delimited-block indexing enabled" : "")); currentIndex = Index.createNewIndex(path, prefix); lexiconBuilder = FieldScore.FIELDS_COUNT > 0 ? new LexiconBuilder( currentIndex, "lexicon", new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT), FieldLexiconEntry.class.getName()) : new LexiconBuilder( currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName()); // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon"); try { directIndexBuilder = FieldScore.FIELDS_COUNT > 0 ? new BlockFieldDirectInvertedOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION) : new BlockDirectInvertedOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION); } catch (IOException ioe) { logger.error("Cannot make DirectInvertedOutputStream:", ioe); } docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? 
new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry(); // int LexiconCount = 0; int numberOfDocuments = 0; // int numberOfTokens = 0; // long startBunchOfDocuments = System.currentTimeMillis(); final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0; boolean stopIndexing = false; for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) { Collection collection = collections[collectionNo]; long startCollection = System.currentTimeMillis(); boolean notLastDoc = false; // while(notLastDoc = collection.hasNext()) { while ((notLastDoc = collection.nextDocument())) { // get the next document from the collection // String docid = collection.getDocid(); // Document doc = collection.next(); Document doc = collection.getDocument(); if (doc == null) continue; numberOfDocuments++; // setup for parsing createDocumentPostings(); String term; numOfTokensInDocument = 0; numOfTokensInBlock = 0; blockId = 0; // get each term in the document while (!doc.endOfDocument()) { if ((term = doc.getNextTerm()) != null && !term.equals("")) { termFields = doc.getFields(); // pass term into TermPipeline (stop, stem etc) pipeline_first.processTerm(term); // the term pipeline will eventually add the term to this // object. } if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break; } // if we didn't index all tokens from document, // we need to get to the end of the document. 
while (!doc.endOfDocument()) doc.getNextTerm(); // we now have all terms in the DocumentTree pipeline_first.reset(); // process DocumentTree (tree of terms) try { if (termsInDocument.getDocumentLength() == 0) { // this document is empty, add the // minimum to the document index indexEmpty(doc.getAllProperties()); } else { /* index this docuent */ // numberOfTokens += numOfTokensInDocument; indexDocument(doc.getAllProperties(), termsInDocument); } } catch (Exception ioe) { logger.error("Failed to index " + doc.getProperty("docno"), ioe); } if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) { stopIndexing = true; break; } if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) { stopIndexing = true; break; } } long endCollection = System.currentTimeMillis(); long secs = ((endCollection - startCollection) / 1000); logger.info( "Collection #" + collectionNo + " took " + secs + "seconds to index " + "(" + numberOfDocuments + " documents)\n"); if (secs > 3600) logger.info( "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour"); if (!notLastDoc) { try { collection.close(); } catch (IOException e) { logger.warn("Couldnt close collection", e); } } } /* end of the collection has been reached */ finishedDirectIndexBuild(); currentIndex.addIndexStructure( "direct", "org.terrier.structures.BlockDirectIndex", "org.terrier.structures.Index,java.lang.String,java.lang.Class", "index,structureName," + (FieldScore.FIELDS_COUNT > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass)); currentIndex.addIndexStructureInputStream( "direct", "org.terrier.structures.BlockDirectIndexInputStream", "org.terrier.structures.Index,java.lang.String,java.lang.Class", "index,structureName," + (FieldScore.FIELDS_COUNT > 0 ? 
fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass)); currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT); currentIndex.setIndexProperty( "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ",")); if (FieldScore.FIELDS_COUNT > 0) { currentIndex.addIndexStructure( "document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}"); } else { currentIndex.addIndexStructure( "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", ""); } /* flush the index buffers */ directIndexBuilder.close(); docIndexBuilder.finishedCollections(); /* and then merge all the temporary lexicons */ lexiconBuilder.finishedDirectIndexBuild(); try { metaBuilder.close(); } catch (IOException ioe) { logger.error("Could not finish MetaIndexBuilder: ", ioe); } if (FieldScore.FIELDS_COUNT > 0) { currentIndex.addIndexStructure( "lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}"); } /* reset the in-memory mapping of terms to term codes.*/ TermCodes.reset(); System.gc(); try { currentIndex.flush(); } catch (IOException ioe) { logger.error("Could not flush index properties: ", ioe); } }