@Override
public String getInfo() {
    return this.getClass().getSimpleName()
        + "[" + ArrayUtils.join(wModelNames, ",")
        + "," + ArrayUtils.join(dsmNames, ",") + "]";
}
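// Illustrative sketch (not from the original source): shows the bracketed, comma-joined string
// getInfo() is expected to produce. The array contents and the class name printed here are
// hypothetical stand-ins for getClass().getSimpleName() and the real field values; only
// ArrayUtils.join(String[], String) is taken from the code above.
import org.terrier.utility.ArrayUtils;

public class GetInfoExample {
    public static void main(String[] args) {
        String[] wModelNames = { "BM25", "PL2" };              // hypothetical weighting model names
        String[] dsmNames = { "DependenceScoreModifier" };     // hypothetical score modifier names
        String info = "ExampleMatching"
            + "[" + ArrayUtils.join(wModelNames, ",")
            + "," + ArrayUtils.join(dsmNames, ",") + "]";
        // prints ExampleMatching[BM25,PL2,DependenceScoreModifier]
        System.out.println(info);
    }
}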
/** Makes a human-readable form of this posting */
@Override
public String toString() {
    String F = (fieldsCount > 0) ? ",F[" + ArrayUtils.join(fields, ",") + "]" : "";
    String B = (hasBlocks > 0) ? ",B[" + ArrayUtils.join(blocks, ",") + "]" : "";
    return "(" + id + "," + tf + F + B + ")";
}
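// Worked example (not from the original source, values are hypothetical): for a posting with
// id = 42, tf = 3, per-field frequencies fields = {2, 1} and block positions blocks = {5, 9, 17},
// the format above renders as "(42,3,F[2,1],B[5,9,17])"; without fields and blocks it is "(42,3)".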
static String[] getModelNames() throws Exception {
    String[] modelNames = ArrayUtils.parseCommaDelimitedString(
        ApplicationSetup.getProperty("fat.featured.scoring.matching.features", ""));
    if (modelNames.length == 1 && modelNames[0].equals("FILE")) {
        String filename = ApplicationSetup.getProperty("fat.featured.scoring.matching.features.file", null);
        if (filename == null)
            throw new IllegalArgumentException();
        filename = ApplicationSetup.makeAbsolute(filename, ApplicationSetup.TERRIER_ETC);
        String line = null;
        final BufferedReader br = Files.openFileReader(filename);
        final List<String> models = new ArrayList<String>();
        while ((line = br.readLine()) != null) {
            // ignore lines starting with comments
            if (line.startsWith("#"))
                continue;
            // remove trailing comments
            line = line.replaceAll("#.+$", "");
            models.add(line.trim());
        }
        br.close();
        modelNames = models.toArray(new String[models.size()]);
    }
    return modelNames;
}
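# Illustrative configuration sketch (not from the original source). Only the two property names,
# the FILE sentinel, the TERRIER_ETC-relative resolution and the '#' comment handling come from
# the code above; the feature names are hypothetical examples.
#
# Option 1: list the features inline, comma-delimited:
#   fat.featured.scoring.matching.features=WMODEL:BM25,WMODEL:PL2
#
# Option 2: set the property to FILE and point to a file (resolved relative to the etc directory
# unless absolute), one feature per line, with '#' starting a comment:
#   fat.featured.scoring.matching.features=FILE
#   fat.featured.scoring.matching.features.file=features.list
#
# Example features.list contents:
#   # weighting model features
#   WMODEL:BM25
#   WMODEL:PL2     # trailing comments are stripped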
/**
 * Loads a chunk (tfs, fields and, optionally, blocks) and decompresses it.
 *
 * @throws IOException
 */
protected final void decompress() throws IOException {
    tfsCodec.decompress(input, tfs, chunkSize);
    if (fieldsCount > 0) {
        for (int j = 0; j < fieldsCount; j++) {
            fieldsCodec.decompress(input, fieldsMatrix[j], chunkSize);
        }
    }
    if (hasBlocks > 0) {
        // if (hasBlocks > 1) {
        //     tfsCodec.decompress(input, bfs, chunkSize);
        // }
        tfsCodec.decompress(input, bfs, chunkSize);
        int numBlocks = 0;
        for (int i = 0; i < chunkSize; i++)
            numBlocks += bfs[i];
        blocksMatrix = ArrayUtils.growOrCreate(blocksMatrix, numBlocks);
        blocksCodec.decompress(input, blocksMatrix, numBlocks);
    }
    decompressed = true;
}
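// Illustrative sketch (not from the original source): the block payload is sized by summing the
// per-posting block counts decoded into bfs, exactly as the loop above does. The chunk size and
// counts below are hypothetical values.
public class BlockCountExample {
    public static void main(String[] args) {
        int chunkSize = 4;
        int[] bfs = { 2, 0, 3, 1 };   // hypothetical per-posting block counts for one chunk
        int numBlocks = 0;
        for (int i = 0; i < chunkSize; i++)
            numBlocks += bfs[i];
        // blocksMatrix must be able to hold numBlocks (= 6) positions before
        // blocksCodec.decompress(input, blocksMatrix, numBlocks) is called
        System.out.println("numBlocks = " + numBlocks);
    }
}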
/**
 * Constructs an instance of TRECQuery that reads and stores all the queries from the files
 * defined in the trec.topics property.
 */
public TRECQuery() {
    // this(ApplicationSetup.getProperty("trec.topics", null));
    try {
        String files[] = ArrayUtils.parseCommaDelimitedString(
            ApplicationSetup.getProperty("trec.topics", ""));
        assert files.length > 0;
        Vector<String> vecStringQueries = new Vector<String>();
        Vector<String> vecStringQueryIDs = new Vector<String>();
        Vector<String> vecStringFiles = new Vector<String>();
        for (int i = 0; i < files.length; i++) {
            if (this.extractQuery(files[i], vecStringQueries, vecStringQueryIDs)) {
                vecStringFiles.add(files[i]);
            }
        }
        // record the files that actually contributed topics, not the query strings themselves
        this.topicFiles = vecStringFiles.toArray(new String[0]);
        this.queries = vecStringQueries.toArray(new String[0]);
        this.query_ids = vecStringQueryIDs.toArray(new String[0]);
        this.index = 0;
    } catch (Exception ioe) {
        logger.error("Problem getting trec.topics property:", ioe);
        return;
    }
}
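// Illustrative usage sketch (not from the original source): only the trec.topics property and the
// no-argument constructor come from the code above. ApplicationSetup.setProperty, the import
// packages and the topic-file path are assumptions made for the example.
import org.terrier.applications.batchquerying.TRECQuery;
import org.terrier.utility.ApplicationSetup;

public class TRECQueryExample {
    public static void main(String[] args) {
        // point the reader at one or more topic files (comma-delimited)
        ApplicationSetup.setProperty("trec.topics", "/path/to/topics.401-450");
        // the constructor parses every listed file and keeps the queries and query ids in memory
        TRECQuery topics = new TRECQuery();
    }
}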
// TODO if this class extends BasicIndexer, then this method could be inherited
public void createDirectIndex(Collection[] collections) {
    logger.info("BlockIndexer creating direct index"
        + (Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
            ? " delimited-block indexing enabled" : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    lexiconBuilder = FieldScore.FIELDS_COUNT > 0
        ? new LexiconBuilder(currentIndex, "lexicon",
            new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
            FieldLexiconEntry.class.getName())
        : new LexiconBuilder(currentIndex, "lexicon",
            new BlockLexiconMap(),
            BlockLexiconEntry.class.getName());
    // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
    try {
        directIndexBuilder = FieldScore.FIELDS_COUNT > 0
            ? new BlockFieldDirectInvertedOutputStream(
                currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
                + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION)
            : new BlockDirectInvertedOutputStream(
                currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
                + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
        logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0)
        ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
        : new BasicDocumentIndexEntry();

    // int LexiconCount = 0;
    int numberOfDocuments = 0;
    // int numberOfTokens = 0;
    // long startBunchOfDocuments = System.currentTimeMillis();
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
        Collection collection = collections[collectionNo];
        long startCollection = System.currentTimeMillis();
        boolean notLastDoc = false;
        // while(notLastDoc = collection.hasNext()) {
        while ((notLastDoc = collection.nextDocument())) {
            // get the next document from the collection
            // String docid = collection.getDocid();
            // Document doc = collection.next();
            Document doc = collection.getDocument();
            if (doc == null)
                continue;
            numberOfDocuments++;
            // setup for parsing
            createDocumentPostings();
            String term;
            numOfTokensInDocument = 0;
            numOfTokensInBlock = 0;
            blockId = 0;
            // get each term in the document
            while (!doc.endOfDocument()) {
                if ((term = doc.getNextTerm()) != null && !term.equals("")) {
                    termFields = doc.getFields();
                    // pass term into TermPipeline (stop, stem etc)
                    pipeline_first.processTerm(term);
                    // the term pipeline will eventually add the term to this object.
                }
                if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
                    break;
            }
            // if we didn't index all tokens from document,
            // we need to get to the end of the document.
            while (!doc.endOfDocument())
                doc.getNextTerm();
            // we now have all terms in the DocumentTree
            pipeline_first.reset();
            // process DocumentTree (tree of terms)
            try {
                if (termsInDocument.getDocumentLength() == 0) {
                    // this document is empty, add the minimum to the document index
                    indexEmpty(doc.getAllProperties());
                } else {
                    /* index this document */
                    // numberOfTokens += numOfTokensInDocument;
                    indexDocument(doc.getAllProperties(), termsInDocument);
                }
            } catch (Exception ioe) {
                logger.error("Failed to index " + doc.getProperty("docno"), ioe);
            }
            if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
                stopIndexing = true;
                break;
            }
            if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
                stopIndexing = true;
                break;
            }
        }
        long endCollection = System.currentTimeMillis();
        long secs = ((endCollection - startCollection) / 1000);
        logger.info("Collection #" + collectionNo + " took " + secs + " seconds to index "
            + "(" + numberOfDocuments + " documents)\n");
        if (secs > 3600)
            logger.info("Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");
        if (!notLastDoc) {
            try {
                collection.close();
            } catch (IOException e) {
                logger.warn("Couldn't close collection", e);
            }
        }
    }
    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    currentIndex.addIndexStructure(
        "direct",
        "org.terrier.structures.BlockDirectIndex",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
    currentIndex.setIndexProperty("index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    if (FieldScore.FIELDS_COUNT > 0) {
        currentIndex.addIndexStructure(
            "document-factory",
            FieldDocumentIndexEntry.Factory.class.getName(),
            "java.lang.String",
            "${index.direct.fields.count}");
    } else {
        currentIndex.addIndexStructure(
            "document-factory",
            BasicDocumentIndexEntry.Factory.class.getName(),
            "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
        metaBuilder.close();
    } catch (IOException ioe) {
        logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0) {
        currentIndex.addIndexStructure(
            "lexicon-valuefactory",
            FieldLexiconEntry.Factory.class.getName(),
            "java.lang.String",
            "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes. */
    TermCodes.reset();
    System.gc();
    try {
        currentIndex.flush();
    } catch (IOException ioe) {
        logger.error("Could not flush index properties: ", ioe);
    }
}
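// Illustrative driver sketch (not from the original source): createDirectIndex(Collection[]) is
// the only call taken from the code above; the BlockIndexer(path, prefix) constructor, the
// TRECCollection document source, the import packages and the paths are assumptions made for
// the example.
import org.terrier.indexing.Collection;
import org.terrier.indexing.TRECCollection;
import org.terrier.indexing.BlockIndexer;

public class DirectIndexExample {
    public static void main(String[] args) {
        // where the index files will be written, and the filename prefix they share
        BlockIndexer indexer = new BlockIndexer("/path/to/index", "data");
        // a TRECCollection is assumed to read the documents listed in collection.spec
        Collection[] collections = { new TRECCollection() };
        // builds the direct (document -> terms) index, including block positions
        indexer.createDirectIndex(collections);
    }
}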