Beispiel #1
0
 /**
  * Collects the values that occur in both posting lists by walking them in lockstep.
  *
  * <p>At each step only the cursor pointing at the smaller value advances, so every common value
  * is found only when both inputs are sorted in ascending order; this method does not verify
  * that.
  *
  * @param l1 first posting list
  * @param l2 second posting list
  * @return a list obtained from {@code PostingList.createList()} holding the shared values in the
  *     order they were encountered; empty when either input is empty
  */
 public static List<Integer> intersectTwoPostingLists(List<Integer> l1, List<Integer> l2) {
   List<Integer> common = PostingList.createList();
   int left = 0;
   int right = 0;
   // The loop condition already covers empty inputs, so no separate guard is required.
   while (left < l1.size() && right < l2.size()) {
     Integer leftValue = l1.get(left);
     Integer rightValue = l2.get(right);
     int order = leftValue.compareTo(rightValue);
     if (order == 0) {
       // Same value on both sides: keep it and move both cursors forward.
       common.add(leftValue);
       left++;
       right++;
     } else if (order < 0) {
       left++;
     } else {
       right++;
     }
   }
   return common;
 }
Beispiel #2
0
  /**
   * Scores every document that contains at least one query term under two weighting schemes and
   * returns the best-scoring documents for each scheme.
   *
   * <p>For a term occurring {@code tf} times in a document, with {@code df} documents in the term's
   * posting list and {@code N} documents in the collection, the per-term contributions are:
   *
   * <pre>
   *   idf = log(N / df) / log(N)
   *   w1  = (0.4 + 0.6 * log(tf + 0.5) / log(maxTf + 0.5)) * idf
   *   w2  = 0.4 + 0.6 * (tf / (tf + 0.5 + 1.5 * docLen / avgDocLen)) * idf
   * </pre>
   *
   * A document's score is the sum of the contributions of all query terms it contains. Documents
   * are ordered by descending score; equal scores are ordered by ascending document id.
   *
   * <p>Query terms without a posting list are reported on standard output and skipped.
   *
   * @param query the raw query text; it is split into terms by {@code getQueryTerms}
   * @return a two-element array: index 0 holds the ranking by {@code w1}, index 1 the ranking by
   *     {@code w2}; each holds at most 10 results, fewer when fewer documents matched
   */
  public RankedResults[] retrieveRankedDocuments(String query) {
    final int maxResults = 10;

    String[] terms = getQueryTerms(query);
    Map<Integer, Doc> documentList = obIndex.getDocsList();

    double avgDocLen = obIndex.getAvgDocLength();
    long collectionSize = obIndex.getTotalNoFiles();
    // NOTE(review): a collection of size 1 makes this 0 and every idf NaN/Infinity.
    double logCollectionSize = Math.log(collectionSize);

    Map<Integer, Double> weight1Map = new TreeMap<Integer, Double>();
    Map<Integer, Double> weight2Map = new TreeMap<Integer, Double>();

    for (String term : terms) {
      PostingList postingList = obIndex.getPostingList(term);

      if (postingList == null) {
        System.out.println("Term: " + term + " doesn't exist in document collection");
        continue;
      }

      int df = postingList.getNumberofDocs();

      // The idf factor depends only on the term, so compute it once per term. The cast forces
      // floating-point division; long / int would truncate the ratio before the logarithm.
      double idf = Math.log((double) collectionSize / df) / logCollectionSize;

      for (PostingListCell postingListCell : postingList.getDocsList()) {
        // Assumes every posting refers to a document present in the index's document map.
        Doc doc = documentList.get(postingListCell.getDocId());

        int maxTf = doc.getMaxTermFrequency();
        int docLen = doc.getTotalTerms();
        int tf = postingListCell.getStemFreq();

        double w1 = (0.4 + 0.6 * Math.log(tf + 0.5) / Math.log(maxTf + 0.5)) * idf;
        double w2 = 0.4 + 0.6 * (tf / (tf + 0.5 + 1.5 * docLen / avgDocLen)) * idf;

        int docId = doc.getDocumentId();
        accumulateWeight(weight1Map, docId, w1);
        accumulateWeight(weight2Map, docId, w2);
      }
    }

    // Highest score first; ties are broken by the smaller document id.
    Comparator<Map.Entry<Integer, Double>> comparator =
        new Comparator<Map.Entry<Integer, Double>>() {

          public int compare(Map.Entry<Integer, Double> o1, Map.Entry<Integer, Double> o2) {
            return o1.getValue().equals(o2.getValue())
                ? o1.getKey().compareTo(o2.getKey())
                : o2.getValue().compareTo(o1.getValue());
          }
        };

    List<Map.Entry<Integer, Double>> weight1List =
        new ArrayList<Map.Entry<Integer, Double>>(weight1Map.entrySet());
    List<Map.Entry<Integer, Double>> weight2List =
        new ArrayList<Map.Entry<Integer, Double>>(weight2Map.entrySet());

    Collections.sort(weight1List, comparator);
    Collections.sort(weight2List, comparator);

    return new RankedResults[] {
      topRankedResults(weight1List, documentList, maxResults),
      topRankedResults(weight2List, documentList, maxResults)
    };
  }

  /** Adds {@code weight} to the running total stored for {@code docId}, starting from zero. */
  private void accumulateWeight(Map<Integer, Double> totals, int docId, double weight) {
    Double current = totals.get(docId);
    totals.put(docId, (current == null ? 0.0 : current) + weight);
  }

  /**
   * Converts the first {@code limit} entries of an already sorted score list into results with
   * 1-based ranks; returns fewer results when the list is shorter than {@code limit}.
   */
  private RankedResults topRankedResults(
      List<Map.Entry<Integer, Double>> ranked, Map<Integer, Doc> documentList, int limit) {
    RankedResults results = new RankedResults();
    // Bound by the list size so short result lists never index past their end.
    int count = Math.min(limit, ranked.size());
    for (int i = 0; i < count; i++) {
      Map.Entry<Integer, Double> entry = ranked.get(i);
      Doc doc = documentList.get(entry.getKey());
      results.add(
          new Result(
              (i + 1),
              entry.getValue(),
              entry.getKey(),
              doc.getDocumentName(),
              doc.getDocumentTitle()));
    }
    return results;
  }