/**
 * Describes this object as {@code SimpleClassName[names]}, where {@code names} is the
 * comma-separated entries of {@code wModelNames} followed by the comma-separated entries of
 * {@code dsmNames}.
 *
 * @return the description string
 */
@Override
 public String getInfo() {
   // Join each list on its own; previously the joined dsm names were (mistakenly) passed as the
   // delimiter for wModelNames, so they were repeated between model names and never appended.
   final String wModels = ArrayUtils.join(wModelNames, ",");
   final String dsms = ArrayUtils.join(dsmNames, ',');
   // Only emit the separating comma when both halves contribute something.
   final String separator = (wModels.isEmpty() || dsms.isEmpty()) ? "" : ",";
   return this.getClass().getSimpleName() + "[" + wModels + separator + dsms + "]";
 }
  /**
   * Renders this posting as {@code (id,tf)}, followed by an {@code F[...]} section when
   * {@code fieldsCount > 0} and a {@code B[...]} section when {@code hasBlocks > 0}.
   */
  @Override
  public String toString() {
    String fieldsPart = "";
    if (fieldsCount > 0) {
      fieldsPart = ",F[" + ArrayUtils.join(fields, ",") + "]";
    }

    String blocksPart = "";
    if (hasBlocks > 0) {
      blocksPart = ",B[" + ArrayUtils.join(blocks, ",") + "]";
    }

    return "(" + id + "," + tf + fieldsPart + blocksPart + ")";
  }
 /**
  * Returns the names listed in the {@code fat.featured.scoring.matching.features} property.
  *
  * <p>If that property consists of the single value {@code FILE}, the names are instead read, one
  * per line, from the file named by {@code fat.featured.scoring.matching.features.file} (resolved
  * against {@code ApplicationSetup.TERRIER_ETC}). Lines starting with {@code #} are skipped and
  * anything from a {@code #} to the end of a line is removed. Each remaining line is trimmed and
  * added as-is, so a blank line yields an empty name.
  *
  * @return the configured names, in order
  * @throws IllegalArgumentException if {@code FILE} is requested but no file property is set
  * @throws Exception if the file cannot be opened or read
  */
 static String[] getModelNames() throws Exception {
   final String[] modelNames =
       ArrayUtils.parseCommaDelimitedString(
           ApplicationSetup.getProperty("fat.featured.scoring.matching.features", ""));
   if (modelNames.length != 1 || !modelNames[0].equals("FILE")) {
     return modelNames;
   }

   String filename =
       ApplicationSetup.getProperty("fat.featured.scoring.matching.features.file", null);
   if (filename == null) {
     throw new IllegalArgumentException(
         "fat.featured.scoring.matching.features is FILE, but"
             + " fat.featured.scoring.matching.features.file is not set");
   }
   filename = ApplicationSetup.makeAbsolute(filename, ApplicationSetup.TERRIER_ETC);

   final List<String> models = new ArrayList<String>();
   final BufferedReader br = Files.openFileReader(filename);
   try {
     String line;
     while ((line = br.readLine()) != null) {
       // ignore lines starting with comments
       if (line.startsWith("#")) {
         continue;
       }
       // remove trailing comments, including a bare '#' with nothing after it
       final int commentStart = line.indexOf('#');
       if (commentStart >= 0) {
         line = line.substring(0, commentStart);
       }
       models.add(line.trim());
     }
   } finally {
     // always release the reader, even when readLine() fails part-way through
     br.close();
   }
   return models.toArray(new String[models.size()]);
 }
  /**
   * Loads one chunk from the input and decompresses it: first the tfs, then one array per field,
   * and, when blocks are present, the per-posting block counts followed by the blocks themselves.
   * Sets {@code decompressed} to true once done.
   *
   * @throws IOException propagated from the codec decompress calls
   */
  protected final void decompress() throws IOException {
    tfsCodec.decompress(input, tfs, chunkSize);

    // One compressed array per field; the loop simply does not run when there are no fields.
    int fieldIdx = 0;
    while (fieldIdx < fieldsCount) {
      fieldsCodec.decompress(input, fieldsMatrix[fieldIdx], chunkSize);
      fieldIdx++;
    }

    if (hasBlocks > 0) {
      tfsCodec.decompress(input, bfs, chunkSize);

      // The sum of the block counts in this chunk is the number of block entries to read next.
      int totalBlocks = 0;
      for (int pos = 0; pos < chunkSize; pos++) {
        totalBlocks += bfs[pos];
      }
      blocksMatrix = ArrayUtils.growOrCreate(blocksMatrix, totalBlocks);
      blocksCodec.decompress(input, blocksMatrix, totalBlocks);
    }

    decompressed = true;
  }
Beispiel #5
0
  /**
   * Constructs an instance of TRECQuery, that reads and stores all the queries from the files
   * defined in the trec.topics property.
   *
   * <p>The property is parsed as a comma-delimited list of files. Each file is passed to
   * {@code extractQuery}; files for which it returns true are recorded in {@code topicFiles}.
   * Any exception raised while doing so is logged and the constructor returns without setting
   * {@code topicFiles}, {@code queries}, {@code query_ids} or {@code index}.
   */
  public TRECQuery() {
    try {
      final String[] files =
          ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("trec.topics", ""));
      assert files.length > 0;
      final Vector<String> vecStringQueries = new Vector<String>();
      final Vector<String> vecStringQueryIDs = new Vector<String>();
      final Vector<String> vecStringFiles = new Vector<String>();
      for (String file : files) {
        if (this.extractQuery(file, vecStringQueries, vecStringQueryIDs)) {
          vecStringFiles.add(file);
        }
      }

      // topicFiles holds the files that were read, not the query text
      // (it was previously filled from vecStringQueries by mistake).
      this.topicFiles = vecStringFiles.toArray(new String[0]);
      this.queries = vecStringQueries.toArray(new String[0]);
      this.query_ids = vecStringQueryIDs.toArray(new String[0]);
      this.index = 0;
    } catch (Exception e) {
      logger.error("Problem getting trec.topics property:", e);
    }
  }
Beispiel #6
0
  // TODO if this class extends BasicIndexer, then this method could be inherited
  /**
   * Builds the direct index (plus the lexicon, document index and meta index builders that are
   * fed alongside it) from the given collections.
   *
   * <p>Steps, in order: a new index is created at {@code path}/{@code prefix}; the builders are
   * set up, choosing field-aware variants when {@code FieldScore.FIELDS_COUNT > 0}; every
   * document of every collection is tokenised through the term pipeline and handed to
   * {@code indexDocument} (or {@code indexEmpty} when it has no terms); finally the index
   * structures and properties are registered, the builders are closed and the index is flushed.
   *
   * <p>Indexing stops early once {@code MAX_DOCS_PER_BUILDER} documents have been processed (if
   * that limit is positive) or when a document whose "docno" is in
   * {@code BUILDER_BOUNDARY_DOCUMENTS} has been indexed.
   *
   * @param collections the collections to index, processed in array order
   */
  public void createDirectIndex(Collection[] collections) {
    logger.info(
        "BlockIndexer creating direct index"
            + (Boolean.parseBoolean(
                    ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
                ? " delimited-block indexing enabled"
                : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    // Field-aware lexicon map/entry classes are used only when fields are configured.
    lexiconBuilder =
        FieldScore.FIELDS_COUNT > 0
            ? new LexiconBuilder(
                currentIndex,
                "lexicon",
                new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
                FieldLexiconEntry.class.getName())
            : new LexiconBuilder(
                currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
    // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
    try {
      // The output file is <index path>/<index prefix>.direct<BitIn.USUAL_EXTENSION>.
      directIndexBuilder =
          FieldScore.FIELDS_COUNT > 0
              ? new BlockFieldDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION)
              : new BlockDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      // NOTE(review): the failure is only logged and indexing carries on; directIndexBuilder is
      // not assigned here in that case, yet it is closed unconditionally further down - confirm
      // whether this should abort instead.
      logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry =
        (FieldScore.FIELDS_COUNT > 0)
            ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
            : new BasicDocumentIndexEntry();

    // int LexiconCount = 0;
    // Running total across all collections (it is not reset per collection).
    int numberOfDocuments = 0;
    // int numberOfTokens = 0;
    // long startBunchOfDocuments = System.currentTimeMillis();
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
      Collection collection = collections[collectionNo];
      long startCollection = System.currentTimeMillis();
      // Stays true when the loop below is left via break, i.e. the collection was not exhausted.
      boolean notLastDoc = false;
      // while(notLastDoc = collection.hasNext()) {
      while ((notLastDoc = collection.nextDocument())) {
        // get the next document from the collection

        // String docid = collection.getDocid();
        // Document doc = collection.next();
        Document doc = collection.getDocument();

        // null documents are skipped and not counted
        if (doc == null) continue;

        numberOfDocuments++;
        // setup for parsing
        createDocumentPostings();
        String term;
        numOfTokensInDocument = 0;
        numOfTokensInBlock = 0;
        blockId = 0;
        // get each term in the document
        while (!doc.endOfDocument()) {
          if ((term = doc.getNextTerm()) != null && !term.equals("")) {
            termFields = doc.getFields();
            // pass term into TermPipeline (stop, stem etc)
            pipeline_first.processTerm(term);
            // the term pipeline will eventually add the term to this
            // object.
          }
          // stop reading once the per-document token limit (if positive) has been exceeded
          if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break;
        }
        // if we didn't index all tokens from document,
        // we need to get to the end of the document.
        while (!doc.endOfDocument()) doc.getNextTerm();
        // we now have all terms in the DocumentTree

        pipeline_first.reset();
        // process DocumentTree (tree of terms)
        try {
          if (termsInDocument.getDocumentLength() == 0) {
            // this document is empty, add the
            // minimum to the document index
            indexEmpty(doc.getAllProperties());
          } else {
            /* index this document */
            // numberOfTokens += numOfTokensInDocument;
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        } catch (Exception ioe) {
          // a failure on one document is logged and indexing continues with the next one
          logger.error("Failed to index " + doc.getProperty("docno"), ioe);
        }
        // stop after the configured maximum number of documents (if positive)
        if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
          stopIndexing = true;
          break;
        }

        // stop after indexing a document listed as a boundary document
        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
          stopIndexing = true;
          break;
        }
      }
      long endCollection = System.currentTimeMillis();
      long secs = ((endCollection - startCollection) / 1000);
      logger.info(
          "Collection #"
              + collectionNo
              + " took "
              + secs
              + "seconds to index "
              + "("
              + numberOfDocuments
              + " documents)\n");
      // the rate is only reported for collections that took more than an hour
      if (secs > 3600)
        logger.info(
            "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");

      // only close the collection when it was read to its end (nextDocument() returned false)
      if (!notLastDoc) {
        try {
          collection.close();
        } catch (IOException e) {
          logger.warn("Couldnt close collection", e);
        }
      }
    }

    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    // Register the "direct" structure and its input stream, passing the posting iterator class
    // that matches whether fields are in use.
    currentIndex.addIndexStructure(
        "direct",
        "org.terrier.structures.BlockDirectIndex",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
    currentIndex.setIndexProperty(
        "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    // Register the document entry factory matching the entry type chosen for emptyDocIndexEntry.
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "document-factory",
          FieldDocumentIndexEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    } else {
      currentIndex.addIndexStructure(
          "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
      metaBuilder.close();
    } catch (IOException ioe) {
      logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "lexicon-valuefactory",
          FieldLexiconEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes.*/
    TermCodes.reset();
    System.gc();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      logger.error("Could not flush index properties: ", ioe);
    }
  }