Example #1
  /**
   * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document
   * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
   * Complexity: O(d * t * t / 2) = O(d t^2), where d is the number of documents in the collection
   * and t is the average number of terms per document (i.e. t is the average document length).
   */
  public void build_full_cooccurencemap_docversion() throws IOException {
    PostingIndex di = index.getDirectIndex();
    DocumentIndex doi = index.getDocumentIndex();
    Lexicon<String> lex = index.getLexicon();
    for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
      if (docid % 1000 == 0)
        System.out.println(
            "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
      IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
      Vector<String> seenterms = new Vector<String>();
      while (postings.next() != IterablePosting.EOL) {
        Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
        String termw = lee.getKey();
        if (lee.getValue().getFrequency() < this.rarethreshold
            || lee.getValue().getFrequency() > this.topthreshold) continue;

        HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
        if (this.cooccurencemap.containsKey(termw)) {
          w_cooccurence = this.cooccurencemap.get(termw);
          this.cooccurencemap.remove(termw);
        }
        Iterator<String> it = seenterms.iterator();
        while (it.hasNext()) {
          String termu = it.next();
          int count = 1;
          if (w_cooccurence.containsKey(termu)) {
            count = count + w_cooccurence.get(termu);
            w_cooccurence.remove(termu);
          }
          w_cooccurence.put(termu, count);

          // System.out.println(termw + ": " + w_cooccurence);
          // now update the symmetric entry (termu -> termw)
          HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>();
          if (cooccurencemap.containsKey(termu)) {
            u_cooccurence = cooccurencemap.get(termu);
            cooccurencemap.remove(termu);
          }
          int countu = 1;
          if (u_cooccurence.containsKey(termw)) {
            countu = countu + u_cooccurence.get(termw);
            u_cooccurence.remove(termw);
          }
          u_cooccurence.put(termw, countu); // bug fix: was put(termw, count), which stored the wrong counter
          cooccurencemap.put(termu, u_cooccurence);
          // System.out.println(termu + ": " + u_cooccurence);
        }

        cooccurencemap.put(termw, w_cooccurence);
        seenterms.add(termw); // only termw values within the thresholds are added
      }
    }
  }
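A side note on the update pattern used above: the containsKey/remove/put dance can be collapsed
with Java's Map.merge, which increments a counter in place. A minimal sketch, assuming
cooccurencemap is a HashMap<String, HashMap<String, Integer>> as the surrounding code suggests
(the countPair helper itself is hypothetical, not part of the original class):

  /** Hypothetical helper: increments the co-occurrence count for the ordered pair (w, u). */
  private void countPair(HashMap<String, HashMap<String, Integer>> map, String w, String u) {
    // computeIfAbsent creates the inner map on first sight of w;
    // merge adds 1 to the existing count, or stores 1 if the key is absent
    map.computeIfAbsent(w, k -> new HashMap<>()).merge(u, 1, Integer::sum);
  }

Calling countPair(map, termw, termu) and countPair(map, termu, termw) keeps the map symmetric and
sidesteps the count/countu mix-up fixed above.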
Example #2
 /** Closes the used structures. */
 public void close() {
   try {
     index.close();
   } catch (IOException ioe) {
     logger.warn("Problem closing index", ioe);
   }
 }
Example #3
  /**
   * Creates the inverted index from the already created direct index, document index and lexicon.
   * It saves block information and possibly field information as well.
   *
   * @see org.terrier.indexing.Indexer#createInvertedIndex()
   */
  public void createInvertedIndex() {
    if (currentIndex == null) {
      currentIndex = Index.createIndex(path, prefix);
      if (currentIndex == null) {
        logger.error("No index at (" + path + "," + prefix + ") to build an inverted index for ");
        return; // without an index, the statistics lookups below would throw a NullPointerException
      }
    }
    long beginTimestamp = System.currentTimeMillis();

    if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0) {
      logger.error("Index has no terms. Inverted index creation aborted.");
      return;
    }
    if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0) {
      logger.error("Index has no documents. Inverted index creation aborted.");
      return;
    }

    logger.info("Started building the block inverted index...");
    invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex, "inverted");
    invertedIndexBuilder.createInvertedIndex();
    this.finishedInvertedIndexBuild();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      logger.error("Cannot flush index: ", ioe);
    }

    long endTimestamp = System.currentTimeMillis();
    logger.info("Finished building the block inverted index...");
    long seconds = (endTimestamp - beginTimestamp) / 1000;
    logger.info("Time elapsed for inverted file: " + seconds);
  }
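For context, here is a sketch of how the two build phases are typically chained. It assumes a
BlockIndexer-style class with Terrier's usual (path, prefix) constructor and an existing
Collection instance named collection; the driver itself is illustrative, not part of these
examples:

  // hypothetical driver: build the direct structures first, then invert them
  BlockIndexer indexer = new BlockIndexer(path, prefix);
  indexer.createDirectIndex(new Collection[] {collection}); // see Example #12
  indexer.createInvertedIndex(); // the method above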
Example #4
 /** Loads the index(es) from disk. */
 protected void loadIndex() {
   long startLoading = System.currentTimeMillis();
   index = Index.createIndex();
   if (index == null) {
     logger.fatal("Failed to load index. Perhaps index files are missing");
   }
   long endLoading = System.currentTimeMillis();
   if (logger.isInfoEnabled())
     logger.info("time to intialise index : " + ((endLoading - startLoading) / 1000.0D));
 }
Example #5
 // returns the set of documents in which term v occurs - binary version (presence only)
 Set<Integer> occursin_binary(String v) throws IOException {
   Set<Integer> vecv = new HashSet<Integer>();
   Lexicon<String> lex = index.getLexicon();
   LexiconEntry le = lex.getLexiconEntry(v);
   if (le == null) return vecv; // guard: term not in the lexicon, return the empty set
   IterablePosting postings = inv.getPostings(le);
   while (postings.next() != IterablePosting.EOL) {
     vecv.add(postings.getId());
   }
   return vecv;
 }
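A usage sketch for occursin_binary: the document-level co-occurrence count of two terms is simply
the size of the intersection of their posting sets, which mirrors what Example #15 computes in its
inner loop. The term strings here are illustrative:

  // hypothetical usage: document co-occurrence of two terms
  Set<Integer> docsOfW = occursin_binary("terrier");
  Set<Integer> docsOfU = occursin_binary("retrieval");
  Set<Integer> both = new HashSet<Integer>(docsOfW); // copy, so docsOfW is left untouched
  both.retainAll(docsOfU); // set intersection
  int cooccurrences = both.size();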
Example #6
  /**
   * Prints the results
   *
   * @param pw PrintWriter the file to write the results to.
   * @param q SearchRequest the search request to get results from.
   */
  public void printResults(PrintWriter pw, SearchRequest q) throws IOException {
    ResultSet set = q.getResultSet();
    int[] docids = set.getDocids();
    double[] scores = set.getScores();
    int minimum = RESULTS_LENGTH;
    // if the requested number of documents is larger than
    // the number of documents in the result set, cap it at
    // the result set size
    if (minimum > set.getResultSize()) minimum = set.getResultSize();
    if (verbose)
      if (set.getResultSize() > 0)
        pw.write("\n\tDisplaying 1-" + set.getResultSize() + " results\n");
      else pw.write("\n\tNo results\n");
    if (set.getResultSize() == 0) return;

    int metaKeyId = 0;
    final int metaKeyCount = metaKeys.length;
    String[][] docNames = new String[metaKeyCount][];
    for (String metaIndexDocumentKey : metaKeys) {
      if (set.hasMetaItems(metaIndexDocumentKey)) {
        docNames[metaKeyId] = set.getMetaItems(metaIndexDocumentKey);
      } else {
        final MetaIndex metaIndex = index.getMetaIndex();
        docNames[metaKeyId] = metaIndex.getItems(metaIndexDocumentKey, docids);
      }
      metaKeyId++;
    }

    StringBuilder sbuffer = new StringBuilder();
    // the results are ordered in descending order of score:
    // the document with the highest score is at index 0 of
    // scores[] and docids[], so the loop below emits the top
    // `minimum` entries in rank order.
    int start = 0;
    int end = minimum;
    for (int i = start; i < end; i++) {
      if (scores[i] <= 0d) continue;
      sbuffer.append(i);
      sbuffer.append(" ");
      for (metaKeyId = 0; metaKeyId < metaKeyCount; metaKeyId++) {
        sbuffer.append(docNames[metaKeyId][i]);
        sbuffer.append(" ");
      }
      sbuffer.append(docids[i]);
      sbuffer.append(" ");
      sbuffer.append(scores[i]);
      sbuffer.append('\n');
    }
    // System.out.println(sbuffer.toString());
    pw.write(sbuffer.toString());
    pw.flush();
    // pw.write("finished outputting\n");
  }
Example #7
  // returns the documents in which term v occurs, mapped to v's within-document frequency
  HashMap<Integer, Integer> occursin(String v) throws IOException {
    HashMap<Integer, Integer> docsofv = new HashMap<Integer, Integer>();

    // MetaIndex meta = index.getMetaIndex();
    Lexicon<String> lex = index.getLexicon();
    LexiconEntry lev = lex.getLexiconEntry(v);
    if (lev == null) return docsofv; // guard: term not in the lexicon, return the empty map
    IterablePosting postings = inv.getPostings(lev);
    while (postings.next() != IterablePosting.EOL) {
      docsofv.put(postings.getId(), postings.getFrequency());
    }
    return docsofv;
  }
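With this frequency-aware variant, a natural per-document co-occurrence weight is the smaller of
the two within-document frequencies, which is exactly what the commented-out block in Example #15
computes. A minimal sketch, again with illustrative term strings:

  // hypothetical usage: frequency-weighted co-occurrence of terms v and u
  HashMap<Integer, Integer> docsOfV = occursin("terrier");
  HashMap<Integer, Integer> docsOfU = occursin("retrieval");
  int weight = 0;
  for (Map.Entry<Integer, Integer> e : docsOfV.entrySet()) {
    Integer tfU = docsOfU.get(e.getKey()); // null if u does not occur in this document
    if (tfU != null) weight += Math.min(e.getValue(), tfU);
  }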
Example #8
 /** Extracts the docnos for the prescribed result set. */
 protected String[] obtainDocnos(
     final String metaIndexDocumentKey, final SearchRequest q, final ResultSet set)
     throws IOException {
   String[] docnos;
   if (set.hasMetaItems(metaIndexDocumentKey)) {
     docnos = set.getMetaItems(metaIndexDocumentKey);
   } else {
     final MetaIndex metaIndex = index.getMetaIndex();
     docnos = metaIndex.getItems(metaIndexDocumentKey, set.getDocids());
   }
   return docnos;
 }
Example #9
  /**
   * Runs the actual query expansion
   *
   * @see
   *     org.terrier.querying.PostProcess#process(org.terrier.querying.Manager,org.terrier.querying.SearchRequest)
   */
  public void process(Manager manager, SearchRequest q) {
    Index index = getIndex(manager);
    lastIndex = index;
    documentIndex = index.getDocumentIndex();
    invertedIndex = index.getInvertedIndex();
    lexicon = index.getLexicon();
    collStats = index.getCollectionStatistics();
    directIndex = index.getDirectIndex();
    metaIndex = index.getMetaIndex();
    if (directIndex == null) {
      logger.error("This index does not have a direct index. Query expansion disabled!!");
      return;
    }
    logger.debug("Starting query expansion post-processing.");
    // get the query expansion model to use
    String qeModel = q.getControl("qemodel");
    if (qeModel == null || qeModel.length() == 0) {
      logger.warn(
          "qemodel control not set for QueryExpansion" + " post process. Using default model Bo1");
      qeModel = "Bo1";
    }
    setQueryExpansionModel(getQueryExpansionModel(qeModel));
    if (logger.isDebugEnabled()) {
      logger.debug("query expansion model: " + QEModel.getInfo());
    }
    MatchingQueryTerms queryTerms = ((Request) q).getMatchingQueryTerms();
    if (queryTerms == null) {
      logger.warn("No query terms for this query. Skipping QE");
      return;
    }
    // get the expanded query terms
    try {
      expandQuery(queryTerms, (Request) q);
    } catch (IOException ioe) {
      logger.error("IOException while expanding query, skipping QE", ioe);
      return;
    }
    if (logger.isDebugEnabled()) {
      logger.debug("query length after expansion: " + queryTerms.length());
      logger.debug("Expanded query: ");
    }
    final String[] newQueryTerms = queryTerms.getTerms();
    StringBuilder newQuery = new StringBuilder();
    for (int i = 0; i < newQueryTerms.length; i++) {
      try {
        if (logger.isDebugEnabled()) {
          logger.debug(
              (i + 1)
                  + ": "
                  + newQueryTerms[i]
                  + ", normalisedFrequency: "
                  + Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 4));
        }
        newQuery.append(newQueryTerms[i]);
        newQuery.append('^');
        newQuery.append(Rounding.toString(queryTerms.getTermWeight(newQueryTerms[i]), 9));
        newQuery.append(' ');
      } catch (NullPointerException npe) {
        logger.error("Nullpointer exception occured in Query Expansion dumping of new Query", npe);
      }
    }

    logger.debug("NEWQUERY " + q.getQueryID() + " " + newQuery.toString());
    lastExpandedQuery = newQuery.toString();
    q.setControl("QE.ExpandedQuery", newQuery.toString());
    final boolean no2ndPass =
        Boolean.parseBoolean(ApplicationSetup.getProperty("qe.no.2nd.matching", "false"));
    if (no2ndPass) {
      return;
    }

    // run retrieval process again for the expanded query
    logger.info("Accessing inverted file for expanded query " + q.getQueryID());
    manager.runMatching(q);
  }
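For reference, the qemodel control read by this post-process is normally set on the request before
matching runs. A sketch of the classic five-step Manager flow, assuming the pre-5.0 Terrier API
that these examples appear to target (query id, query text, and the qe control mapping are
illustrative assumptions):

  // hypothetical caller: select the Bo1 expansion model for one request
  Manager queryingManager = new Manager(index);
  SearchRequest srq = queryingManager.newSearchRequest("q1", "information retrieval");
  srq.setControl("qe", "on"); // enable the QueryExpansion post-process (default control mapping assumed)
  srq.setControl("qemodel", "Bo1"); // the control read by process() above
  queryingManager.runPreProcessing(srq);
  queryingManager.runMatching(srq);
  queryingManager.runPostProcessing(srq); // eventually calls process(manager, srq)
  queryingManager.runPostFilters(srq);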
Example #10
 public void readIndex() throws IOException {
   index = terrierIndexFactory.readIndex(indexId, language);
   documentIndex = index.getDocumentIndex();
   metaIndex = index.getMetaIndex();
 }
Example #11
  /**
   * @param args command-line arguments; see the usage message printed below
   * @throws IOException
   * @throws InterruptedException
   */
  public static void main(String[] args) throws IOException, InterruptedException {
    System.out.println("Usage: ");
    System.out.println("args[0]: path to terrier.home");
    System.out.println("args[1]: path to index");
    System.out.println("args[2]: path to trec query file");
    System.out.println("args[3]: path to result file (including name of result file)");

    System.setProperty("terrier.home", args[0]);
    Index index = Index.createIndex(args[1], "data");
    System.out.println(index.getEnd());

    HashMap<String, String> trecqueries = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new FileReader(args[2]));
    String line = null;
    while ((line = br.readLine()) != null) {
      String[] input = line.split(" ");
      String qid = input[0];
      String query = "";
      for (int i = 1; i < input.length; i++) query = query + " " + input[i];

      query = query.replaceAll("-", " ");
      query = query.replaceAll("\\p{Punct}", "");
      query = query.substring(1, query.length());
      trecqueries.put(qid, query.toLowerCase());
    }
    br.close();

    double[] muvalues = {100.0, 500.0, 1000.0, 1500.0, 2000.0, 2500.0, 3000.0, 3500.0, 4000.0};
    for (int i = 0; i < muvalues.length; i++) {
      double mu = muvalues[i];

      TranslationLMManager tlm = new TranslationLMManager(index);
      tlm.setTranslation("null");
      tlm.setDirMu(mu);
      TRECDocnoOutputFormat TRECoutput = new TRECDocnoOutputFormat(index);
      PrintWriter pt =
          new PrintWriter(new File(args[3] + "_dir_mu_" + String.valueOf(mu) + ".txt"));
      /*
      TranslationLMManager tlm_theory = new TranslationLMManager(index);
      tlm_theory.setTranslation("dir_theory");
      tlm_theory.setDirMu(mu);
      TRECDocnoOutputFormat TRECoutput_theory = new TRECDocnoOutputFormat(index);
      PrintWriter pt_theory = new PrintWriter(new File(args[3]+"_dir_theory_mu_" + String.valueOf(mu) + ".txt"));
      */
      for (String qid : trecqueries.keySet()) {
        String query = trecqueries.get(qid);
        System.out.println(query + " - " + qid);

        System.out.println("Scoring with Dir LM; mu=" + mu);
        // scoring with LM dir
        Request rq = new Request();
        rq.setOriginalQuery(query);
        rq.setIndex(index);
        rq.setQueryID(qid);
        rq = tlm.runMatching(rq, "null", "dir");
        /*
        DocumentIndex doi = index.getDocumentIndex();
        MetaIndex meta = index.getMetaIndex();
        int docid = 1247748; //docids are 0-based
        DocumentIndexEntry die = doi.getDocumentEntry(docid);
        System.out.println(meta.getItem("docno", docid) + ":" + die.getDocumentLength());
        die = doi.getDocumentEntry(docid+1);
        System.out.println(meta.getItem("docno", docid+1) + ":" + die.getDocumentLength());
        meta.
        die = doi.getDocumentEntry(docid+2);
        System.out.println(meta.getItem("docno", docid+2) + ":" + die.getDocumentLength());
        */
        TRECoutput.printResults(pt, rq, "dir", "Q0", 1000);
        /*
        Request rq_theory = new Request();
        rq_theory.setOriginalQuery(query);
        rq_theory.setIndex(index);
        rq_theory.setQueryID(qid);
        rq_theory = tlm_theory.runMatching(rq_theory, "dir_theory", "dir");
        TRECoutput_theory.printResults(pt_theory, rq_theory, "dir_theory", "Q0", 1000);
        */
      }
      pt.flush();
      pt.close();
      // pt_theory.flush();
      // pt_theory.close();
    }
  }
Example #12
  // TODO if this class extends BasicIndexer, then this method could be inherited
  public void createDirectIndex(Collection[] collections) {
    logger.info(
        "BlockIndexer creating direct index"
            + (Boolean.parseBoolean(
                    ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
                ? " delimited-block indexing enabled"
                : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    lexiconBuilder =
        FieldScore.FIELDS_COUNT > 0
            ? new LexiconBuilder(
                currentIndex,
                "lexicon",
                new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT),
                FieldLexiconEntry.class.getName())
            : new LexiconBuilder(
                currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
    // lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
    try {
      directIndexBuilder =
          FieldScore.FIELDS_COUNT > 0
              ? new BlockFieldDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION)
              : new BlockDirectInvertedOutputStream(
                  currentIndex.getPath()
                      + ApplicationSetup.FILE_SEPARATOR
                      + currentIndex.getPrefix()
                      + "."
                      + "direct"
                      + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry =
        (FieldScore.FIELDS_COUNT > 0)
            ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
            : new BasicDocumentIndexEntry();

    // int LexiconCount = 0;
    int numberOfDocuments = 0;
    // int numberOfTokens = 0;
    // long startBunchOfDocuments = System.currentTimeMillis();
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for (int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++) {
      Collection collection = collections[collectionNo];
      long startCollection = System.currentTimeMillis();
      boolean notLastDoc = false;
      // while(notLastDoc = collection.hasNext()) {
      while ((notLastDoc = collection.nextDocument())) {
        // get the next document from the collection

        // String docid = collection.getDocid();
        // Document doc = collection.next();
        Document doc = collection.getDocument();

        if (doc == null) continue;

        numberOfDocuments++;
        // setup for parsing
        createDocumentPostings();
        String term;
        numOfTokensInDocument = 0;
        numOfTokensInBlock = 0;
        blockId = 0;
        // get each term in the document
        while (!doc.endOfDocument()) {
          if ((term = doc.getNextTerm()) != null && !term.equals("")) {
            termFields = doc.getFields();
            // pass term into TermPipeline (stop, stem etc)
            pipeline_first.processTerm(term);
            // the term pipeline will eventually add the term to this
            // object.
          }
          if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT) break;
        }
        // if we didn't index all tokens from document,
        // we need to get to the end of the document.
        while (!doc.endOfDocument()) doc.getNextTerm();
        // we now have all terms in the DocumentTree

        pipeline_first.reset();
        // process DocumentTree (tree of terms)
        try {
          if (termsInDocument.getDocumentLength() == 0) {
            // this document is empty, add the
            // minimum to the document index
            indexEmpty(doc.getAllProperties());
          } else {
            /* index this document */
            // numberOfTokens += numOfTokensInDocument;
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        } catch (Exception ioe) {
          logger.error("Failed to index " + doc.getProperty("docno"), ioe);
        }
        if (MAX_DOCS_PER_BUILDER > 0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER) {
          stopIndexing = true;
          break;
        }

        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno"))) {
          stopIndexing = true;
          break;
        }
      }
      long endCollection = System.currentTimeMillis();
      long secs = ((endCollection - startCollection) / 1000);
      logger.info(
          "Collection #"
              + collectionNo
              + " took "
              + secs
              + " seconds to index "
              + "("
              + numberOfDocuments
              + " documents)");
      if (secs > 3600)
        logger.info(
            "Rate: " + ((double) numberOfDocuments / ((double) secs / 3600.0d)) + " docs/hour");

      if (!notLastDoc) {
        try {
          collection.close();
        } catch (IOException e) {
          logger.warn("Couldnt close collection", e);
        }
      }
    }

    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    currentIndex.addIndexStructure(
        "direct",
        "org.terrier.structures.BlockDirectIndex",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream",
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"
            + (FieldScore.FIELDS_COUNT > 0
                ? fieldDirectIndexPostingIteratorClass
                : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", "" + FieldScore.FIELDS_COUNT);
    currentIndex.setIndexProperty(
        "index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "document-factory",
          FieldDocumentIndexEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    } else {
      currentIndex.addIndexStructure(
          "document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
      metaBuilder.close();
    } catch (IOException ioe) {
      logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0) {
      currentIndex.addIndexStructure(
          "lexicon-valuefactory",
          FieldLexiconEntry.Factory.class.getName(),
          "java.lang.String",
          "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes.*/
    TermCodes.reset();
    System.gc();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      logger.error("Could not flush index properties: ", ioe);
    }
  }
Example #13
 /** {@inheritDoc} */
 public void setIndex(Index index) {
   metaIndex = index.getMetaIndex();
 }
Example #14
 public void set_index(String index_path, String index_prefix) {
   this.index = Index.createIndex(index_path, index_prefix);
   this.inv = index.getInvertedIndex();
 }
Example #15
  /**
   * Builds a CooccurenceMap by iterating over the vocabulary of the collection. It counts document
   * co-occurrence, i.e. it does not consider the frequency of two terms within a document.
   * Complexity: O(n^2 * d), where n is the number of terms in the vocabulary and d is the number
   * of documents, since each of the n^2 term pairs requires a posting-list intersection costing up
   * to O(d). Note: this currently runs out of heap space on DOTGOV with 5GB of RAM allocated to
   * the JVM.
   */
  void build_full_cooccurencemap() throws IOException {

    Lexicon<String> lex = index.getLexicon();
    Iterator<Entry<String, LexiconEntry>> itw = lex.iterator();
    int prcount = 1;
    // iterating over all possible w
    while (itw.hasNext()) {
      Entry<String, LexiconEntry> lw = itw.next();
      String termw = lw.getKey();
      if (lw.getValue().getFrequency() < this.rarethreshold
          || lw.getValue().getFrequency() > this.topthreshold) continue;

      if (prcount % 1000 == 0)
        System.out.println(
            "Processing... "
                + 100.0
                    * ((double) prcount)
                    / this.index.getCollectionStatistics().getNumberOfUniqueTerms()
                + "%");
      prcount++;

      // LexiconEntry lew = lw.getValue();
      // System.out.println("analysing " + termw);
      HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
      if (cooccurencemap.containsKey(termw)) {
        w_cooccurence = cooccurencemap.get(termw);
        cooccurencemap.remove(termw);
      }

      Set<Integer> docsofw = occursin_binary(termw);
      Iterator<Entry<String, LexiconEntry>> itu = lex.iterator();
      while (itu.hasNext()) {
        Entry<String, LexiconEntry> lu = itu.next();
        String termu = lu.getKey();
        if (lu.getValue().getFrequency() < this.rarethreshold
            || lu.getValue().getFrequency() > this.topthreshold) continue;

        // System.out.println("\tmeasuring co-occurence with " + termu);
        // LexiconEntry leu = lu.getValue();
        Set<Integer> docsofu = occursin_binary(termu);

        Set<Integer> intersection = new HashSet<Integer>(docsofw); // use the copy constructor
        intersection.retainAll(docsofu);
        int count = intersection.size();
        if (w_cooccurence.containsKey(termu)) {
          count = count + w_cooccurence.get(termu);
          w_cooccurence.remove(termu);
        }
        w_cooccurence.put(termu, count);
        // System.out.println("\t\t"+termw + " " + termu + " = " + count);
        // System.out.println(docsofw.size() + " " + docsofu.size() + " " + diff.entriesInCommon());

        // The next bit of code instead counts frequencies
        /*
        if (docsofw.size() <= docsofu.size()) {
          for (Integer docidw : docsofw.keySet()) {
            if (docsofu.containsKey(docidw)) {
              // then w and u co-occur
              Integer count = Math.min(docsofw.get(docidw), docsofu.get(docidw));
              if (w_cooccurence.containsKey(termu)) {
                count = count + w_cooccurence.get(termu);
                w_cooccurence.remove(termu);
              }
              w_cooccurence.put(termu, count);
              System.out.println("\t\t" + termw + " " + termu + " = " + count);
            }
          }
        } else {
          for (Integer docidu : docsofu.keySet()) {
            if (docsofw.containsKey(docidu)) {
              // then w and u co-occur
              Integer count = Math.min(docsofw.get(docidu), docsofu.get(docidu));
              if (w_cooccurence.containsKey(termu)) {
                count = count + w_cooccurence.get(termu);
                w_cooccurence.remove(termu);
              }
              w_cooccurence.put(termu, count);
              System.out.println("\t\t" + termw + " " + termu + " = " + count);
            }
          }
        }
        */

      }

      cooccurencemap.put(termw, w_cooccurence);
      // System.out.println(termw + ": " + w_cooccurence);
    }
  }
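A back-of-the-envelope comparison of the two builds, with illustrative numbers: for d = 1,000,000
documents of average length t = 300, the document-major build of Example #1 performs on the order
of d * t^2 / 2 = 4.5 * 10^10 map updates, whereas this vocabulary-major build with n = 1,000,000
unique terms performs n^2 = 10^12 posting-list intersections, each costing up to O(d) and each
materialising a posting set on the heap. That gap is consistent with the out-of-memory note in the
Javadoc above: when a direct index is available, the document-major build of Example #1 is the
cheaper route.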