// returns the set of ids of the documents where term v occurs - binary version
// (presence only; term frequencies are ignored — see occursin for the weighted variant)
 Set<Integer> occursin_binary(String v) throws IOException {
   Set<Integer> vecv = new HashSet<Integer>();
   Lexicon<String> lex = index.getLexicon();
   LexiconEntry le = lex.getLexiconEntry(v);
   // term not in the vocabulary: return the empty set instead of passing null to getPostings
   if (le == null) return vecv;
   IterablePosting postings = inv.getPostings(le);
   try {
     while (postings.next() != IterablePosting.EOL) {
       vecv.add(postings.getId());
     }
   } finally {
     postings.close(); // IterablePosting is Closeable; don't leak the underlying stream
   }
   return vecv;
 }
  /**
   * Builds a CooccurenceMap by iterating over the documents of the collection. It counts document
   * co-occurence, i.e. it doesn't consider the frequency of two terms in a document. Complexity:
   * O(d * t *t/2) = O(d t^2) where d is the number of documents in the collection and t is the
   * average number of terms per documents. Note that t = avg doc len
   *
   * <p>Terms whose collection frequency falls outside [rarethreshold, topthreshold] are skipped.
   * The map is kept symmetric: count(w,u) is always updated together with count(u,w).
   *
   * @throws IOException if the direct index cannot be read
   */
  public void build_full_cooccurencemap_docversion() throws IOException {
    PostingIndex di = index.getDirectIndex();
    DocumentIndex doi = index.getDocumentIndex();
    Lexicon<String> lex = index.getLexicon();
    for (int docid = 0; docid < doi.getNumberOfDocuments(); docid++) {
      if (docid % 1000 == 0)
        System.out.println(
            "Processing... " + 100.0 * ((double) docid) / doi.getNumberOfDocuments() + "%");
      IterablePosting postings = di.getPostings(doi.getDocumentEntry(docid));
      // terms of this document processed so far (only those within the thresholds)
      Vector<String> seenterms = new Vector<String>();
      while (postings.next() != IterablePosting.EOL) {
        Map.Entry<String, LexiconEntry> lee = lex.getLexiconEntry(postings.getId());
        String termw = lee.getKey();
        // skip terms that are too rare or too frequent to be informative
        if (lee.getValue().getFrequency() < this.rarethreshold
            || lee.getValue().getFrequency() > this.topthreshold) continue;

        // fetch (or create) the co-occurrence row of termw; put() below overwrites,
        // so the remove-before-put of the original is unnecessary
        HashMap<String, Integer> w_cooccurence = new HashMap<String, Integer>();
        if (this.cooccurencemap.containsKey(termw)) {
          w_cooccurence = this.cooccurencemap.get(termw);
        }
        Iterator<String> it = seenterms.iterator();
        while (it.hasNext()) {
          String termu = it.next();
          // increment count(termw, termu)
          int count = 1;
          if (w_cooccurence.containsKey(termu)) {
            count = count + w_cooccurence.get(termu);
          }
          w_cooccurence.put(termu, count);

          // and now the symmetric entry count(termu, termw)
          HashMap<String, Integer> u_cooccurence = new HashMap<String, Integer>();
          if (cooccurencemap.containsKey(termu)) {
            u_cooccurence = cooccurencemap.get(termu);
          }
          int countu = 1;
          if (u_cooccurence.containsKey(termw)) {
            countu = countu + u_cooccurence.get(termw);
          }
          // FIX: original stored `count` here, leaving `countu` dead; correct only by
          // accident when the map is perfectly symmetric — store the symmetric tally.
          u_cooccurence.put(termw, countu);
          cooccurencemap.put(termu, u_cooccurence);
        }

        cooccurencemap.put(termw, w_cooccurence);
        seenterms.add(termw); // I add only the termw that are within the thresholds
      }
      postings.close(); // release the direct-index posting iterator for this document
    }
  }
  // returns the documents were w occurs in
  HashMap<Integer, Integer> occursin(String v) throws IOException {
    HashMap<Integer, Integer> docsofv = new HashMap<Integer, Integer>();

    // MetaIndex meta = index.getMetaIndex();
    Lexicon<String> lex = index.getLexicon();
    LexiconEntry lev = lex.getLexiconEntry(v);
    IterablePosting postings = inv.getPostings(lev);
    while (postings.next() != IterablePosting.EOL) {
      docsofv.put(postings.getId(), postings.getFrequency());
    }
    return docsofv;
  }
// NOTE(review): "示例#4" / "0" below were extraction artifacts from a snippet-aggregator
// site ("Example #4" plus its vote count), not Java — neutralized into this comment.
  /**
   * Scores the postings of query term {@code i} into the accumulator result set, maintaining the
   * running count of retrieved documents and marking term occurrence via a per-term bit mask.
   *
   * @param i index of the query term being scored (only terms 0..15 get an occurrence bit)
   * @param rs accumulator result set receiving scores and occurrence masks
   * @param postings posting list of term i over the inverted index
   * @throws IOException if the posting list cannot be traversed
   */
  protected void assignScores(int i, AccumulatorResultSet rs, final IterablePosting postings)
      throws IOException {
    // occurrence bit for this term; terms beyond the 16 bits of a short leave the mask at 0
    final short mask = (i < 16) ? (short) (1 << i) : 0;

    while (postings.next() != IterablePosting.EOL) {
      final double score = plm.score(i);
      final int docid = postings.getId();
      final boolean alreadySeen = rs.scoresMap.contains(docid);
      // a first positive contribution adds a retrieved document; a negative one on a
      // previously seen document retracts it
      if (!alreadySeen && score > 0.0d) {
        numberOfRetrievedDocuments++;
      } else if (alreadySeen && score < 0.0d) {
        numberOfRetrievedDocuments--;
      }

      rs.scoresMap.adjustOrPutValue(docid, score, score);
      rs.occurrencesMap.put(docid, (short) (rs.occurrencesMap.get(docid) | mask));
    }
  }
 /** Print a list of the postings to standard out */
 @Override
 public void print() {
   try {
     int entryIndex = 0;
     while (this.hasNext()) {
       IterablePosting posting = this.next();
       entryIndex += this.getEntriesSkipped();
       // assemble one line per entry: "<index> <posting> <posting> ... "
       StringBuilder line = new StringBuilder();
       line.append(entryIndex).append(' ');
       while (posting.next() != IterablePosting.EOL) {
         line.append(posting.toString()).append(' ');
       }
       System.out.println(line);
       entryIndex++;
     }
   } catch (Exception e) {
     logger.error("Error during print()", e);
   }
 }
  /**
   * Builds the co-occurrence map for an index, writes it to disk, and prints a few sanity checks
   * comparing stored co-occurrence counts against posting-list intersections.
   *
   * @param args args[0] = terrier.home, args[1] = index path, args[2] = output map file
   * @throws IOException if the index or output file cannot be accessed
   * @throws ClassNotFoundException if a serialized map cannot be deserialized
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    CooccurenceMap coccmap = new CooccurenceMap();
    System.setProperty("terrier.home", args[0]);

    coccmap.set_index(args[1], "data");
    coccmap.setRarethreshold(500);
    // cap at 0.1% of the collection size: very frequent terms carry little signal
    coccmap.setTopthreshold(coccmap.index.getCollectionStatistics().getNumberOfDocuments() / 1000);
    coccmap.build_full_cooccurencemap_docversion();
    coccmap.writemap(args[2]);
    System.out.println("Size written " + coccmap.cooccurencemap.size());

    CooccurenceMap coccmapr = coccmap;

    // This is just a testing loop: will only examine the first 5 terms
    int count = 5;
    Lexicon<String> lex = coccmapr.index.getLexicon();
    for (String w : coccmapr.cooccurencemap.keySet()) {
      if (count <= 0) break; // nothing left to examine; no need to finish the iteration
      count--;
      System.out.println(w);

      // vecw depends only on w — compute it once per w instead of once per (w, u) pair
      Set<Integer> vecw = new HashSet<Integer>();
      LexiconEntry le = lex.getLexiconEntry(w);
      IterablePosting postings = coccmapr.inv.getPostings(le);
      while (postings.next() != IterablePosting.EOL) {
        vecw.add(postings.getId());
      }

      HashMap<String, Integer> w_cooccurence = coccmapr.cooccurencemap.get(w);
      for (String u : w_cooccurence.keySet()) {
        System.out.println("\t" + u + ": " + w_cooccurence.get(u));

        Set<Integer> vecu = new HashSet<Integer>();
        LexiconEntry leu = lex.getLexiconEntry(u);
        IterablePosting postingsu = coccmapr.inv.getPostings(leu);
        while (postingsu.next() != IterablePosting.EOL) {
          vecu.add(postingsu.getId());
        }
        Set<Integer> intersection = new HashSet<Integer>(vecw); // use the copy constructor
        intersection.retainAll(vecu);
        System.out.println(
            "\tintersection: "
                + intersection.size()
                + " size w: "
                + vecw.size()
                + " size u: "
                + vecu.size());
      }
    }

    // FIX: labels used to say (fracture,doctor)/(doctor,fracture) while the calls query
    // (holiday,meeting)/(meeting,holiday) — make the printed labels match the arguments
    System.out.println("co-occurrence(holiday,meeting) = " + coccmap.get_w_u("holiday", "meeting"));
    System.out.println("co-occurrence(meeting,holiday) = " + coccmap.get_w_u("meeting", "holiday"));

    System.out.println("co-occurrence(risk,economy) = " + coccmap.get_w_u("risk", "economy"));
    System.out.println("co-occurrence(economy,risk) = " + coccmap.get_w_u("economy", "risk"));

    System.out.println("co-occurrence(dollar,million) = " + coccmap.get_w_u("dollar", "million"));
    System.out.println("co-occurrence(million,dollar) = " + coccmap.get_w_u("million", "dollar"));
  }