예제 #1
0
  /**
   * Scores candidate documents document-at-a-time against the cliques currently registered in the
   * MRF and keeps the best {@code mNumResults} of them.
   *
   * <p>Candidates come from {@code mDocSet} when it is set, otherwise from
   * {@code mMRF.getNextCandidate()} (only legal in cascade stage 0). Every clique must be a
   * {@code termWt} clique, because the term positions it exposes are stored in {@code keptDocs}
   * (and the document length in {@code keptDocLengths}) for re-use by later cascade stages.
   *
   * <p>Side effects: clears and refills {@code mSortedAccumulators}, sorts the MRF's clique list
   * in place with {@code mMaxScoreComparator}, fills the {@code term_to_*} lookup maps while
   * scoring the first document, writes {@code keptDocs}/{@code keptDocLengths}, and sets
   * {@code meanScore} to the mean score of the returned accumulators (0 when nothing is returned).
   * Fatal configuration problems terminate the JVM via {@code System.exit(-1)}.
   *
   * @return the retained accumulators, in the reverse of the order in which the priority queue
   *     releases them (presumably best score first; this depends on the queue's comparator)
   */
  public Accumulator_cascade[] rank_cascade() {

    // point to next position in keptDocs array that hasn't been filled
    int indexCntKeptDocs = 0;

    // Clear priority queue.
    mSortedAccumulators.clear();

    // Cliques associated with the MRF.
    List<Clique> cliques = mMRF.getCliques();

    if (cliques.size() == 0) {
      System.out.println("Shouldn't have size 0");
      System.exit(-1);
    }

    // Current accumulator.
    Accumulator_cascade a = mAccumulators[0];

    // Maximum possible score that this MRF can achieve.
    float mrfMaxScore = 0.0f;
    for (Clique c : cliques) {
      if (!((((Clique_cascade) c).getParamID()).equals("termWt"))) {
        System.out.println(
            "In this faster cascade implementation, first stage must be term in order to get positions[] values! "
                + ((Clique_cascade) c).getParamID());
        System.exit(-1);
      }
      mrfMaxScore += c.getMaxScore();
    }

    // Sort cliques according to their max scores.
    Collections.sort(cliques, mMaxScoreComparator);

    // Score that must be achieved to enter result set.
    double scoreThreshold = Double.NEGATIVE_INFINITY;

    // Offset into document set we're currently at (if applicable).
    int docsetOffset = 0;

    // True only while the first candidate document is being scored.
    boolean firstTime = true;

    while (true) {
      // Fetch the next candidate; Integer.MAX_VALUE signals that the candidates are exhausted.
      int docno;
      if (mDocSet != null) {
        docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
      } else {
        if (cascadeStage != 0) {
          System.out.println("Shouldn't happen. Cascade stage " + cascadeStage);
          System.exit(-1);
        }

        docno = mMRF.getNextCandidate();
      }

      if (docno >= Integer.MAX_VALUE) {
        break;
      }

      for (DocumentNode documentNode : mDocNodes) {
        documentNode.setDocno(docno);
      }

      // Document-at-a-time scoring.
      float docMaxScore = mrfMaxScore;
      boolean skipped = false;

      float score = 0.0f;

      // Lidan: accumulate document scores across the cascade stages
      if (mDocSet != null && cascadeStage != 0) {
        // docsetOffset was already advanced past the current document above.
        score = accumulated_scores[docsetOffset - 1];
      }

      // for each query term, a private copy of its positions in this document
      int[][] termPositions = new int[cliques.size()][];
      int document_length = -1;

      for (int i = 0; i < cliques.size(); i++) {

        // Current clique that we're scoring.
        Clique c = cliques.get(i);

        // Record, once, where each term sits in the sorted clique list plus its collection
        // statistics; later cascade stages look these up by term.
        if (firstTime) {
          String concept = c.getConcept().trim().toLowerCase();
          term_to_cliqueNumber.put(concept, i + "");
          term_to_termCollectionFrequency.put(
              concept, ((Clique_cascade) c).termCollectionCF() + "");
          term_to_termDF.put(concept, ((Clique_cascade) c).termCollectionDF() + "");
        }

        // If there's no way that this document can enter the result set
        // then exit.
        if (score + docMaxScore <= scoreThreshold) {
          // Advance postings readers (but don't score).
          for (int j = i; j < cliques.size(); j++) {
            cliques.get(j).setNextCandidate(docno + 1);
          }
          skipped = true;

          break;
        }

        // Document independent cliques do not affect the ranking.
        if (!c.isDocDependent()) {
          continue;
        }

        // Update document score.
        float cliqueScore = c.getPotential();
        score += c.getWeight() * cliqueScore;

        // Update the max score for the rest of the cliques.
        docMaxScore -= c.getMaxScore();

        // stuff needed for document evaluation in the next stage
        int[] p = ((Clique_cascade) c).getPositions();

        if (p != null) {
          // Copy once here; the copy is owned by this document from now on.
          termPositions[i] = Arrays.copyOf(p, p.length);

          document_length = ((Clique_cascade) c).getDocLen();
        }
      }

      firstTime = false;

      // Keep track of mNumResults best accumulators.
      if (!skipped && score > scoreThreshold) {
        a.docno = docno;
        a.score = score;
        a.index_into_keptDocs = indexCntKeptDocs;
        keptDocLengths[indexCntKeptDocs] = document_length;

        mSortedAccumulators.add(a);

        // save positional information for each query term in the document; termPositions
        // already holds private copies, so they can be handed over without copying again
        for (int j = 0; j < termPositions.length; j++) {
          if (termPositions[j] != null) {
            keptDocs[indexCntKeptDocs][j] = termPositions[j];
          }
        }

        if (mSortedAccumulators.size() == mNumResults + 1) {
          a = mSortedAccumulators.poll(); // Re-use the accumulator of the removed document

          // After maximum # docs been put into queue, each time a new document is added, an old
          // document will be ejected, use the spot freed by the ejected document to store the new
          // document positional info in keptDocs

          indexCntKeptDocs = a.index_into_keptDocs;
          keptDocs[indexCntKeptDocs] = new int[numQueryTerms][];

          scoreThreshold = mSortedAccumulators.peek().score;

        } else {
          // Next non-used accumulator in the accumulator pool
          a = mAccumulators[mSortedAccumulators.size()];
          indexCntKeptDocs++;
        }
      }
    }

    // Grab the accumulators off the queue, in (reverse) order.
    int resultCount = Math.min(mNumResults, mSortedAccumulators.size());
    Accumulator_cascade[] results = new Accumulator_cascade[resultCount];

    // Sum into a local so the mean never includes a value left in the field by an earlier run.
    float scoreSum = 0.0f;
    for (int i = resultCount - 1; i >= 0; i--) {
      results[i] = mSortedAccumulators.poll();
      scoreSum += results[i].score;
    }

    // An empty result set has no mean; use 0 rather than the NaN that 0/0 would produce.
    meanScore = resultCount > 0 ? scoreSum / resultCount : 0.0f;

    // Sorting by docno for further stages, when needed, is done by rank().
    return results;
  }
예제 #2
0
  /**
   * Runs the full cascade: stage 0 scores documents from the postings via
   * {@link #rank_cascade()}, and every later stage prunes the current result list and re-scores
   * the survivors from the term positions cached in {@code keptDocs}.
   *
   * <p>For each later stage the cliques whose cascade stage matches are collected, the documents
   * in {@code results} are filtered with that stage's pruner ({@code "rank"}, {@code "score"} or
   * {@code "mean-max"}), the stage's feature score is added to each surviving document's
   * accumulated score, and the survivors are re-ordered through {@code mSortedAccumulators}. The
   * estimated cost of each stage is added to {@code cascadeCost}.
   *
   * <p>Fatal configuration problems terminate the JVM via {@code System.exit(-1)}.
   *
   * @return the final accumulators, truncated to the first {@code mK} entries when more were
   *     retained; {@code null} if the MRF could not be initialized
   */
  public Accumulator[] rank() {

    if (mSavedResults != null) {
      // Seed the document set and the accumulated scores from previously saved results.
      mDocSet = new int[mSavedResults.length];
      accumulated_scores = new float[mSavedResults.length];

      for (int i = 0; i < mSavedResults.length; i++) {
        mDocSet[i] = (int) mSavedResults[i][0];
        accumulated_scores[i] = mSavedResults[i][1];
      }

      keptDocs = new int[mDocSet.length + 1][numQueryTerms][];
      keptDocLengths = new int[mDocSet.length + 1];
    }

    // Initialize the MRF ==> this will clear out postings readers cache!
    try {
      mMRF.initialize();
    } catch (ConfigurationException e) {
      sLogger.error("Error initializing MRF. Aborting ranking!");
      return null;
    }

    // Snapshot of all cliques; the MRF's own clique list is emptied and refilled below.
    cliques_all = new ArrayList<Clique>(mMRF.getCliques());

    // Cascade stage starts at 0
    cascadeStage = 0;

    // Number of cliques that have been assigned to a processed stage so far.
    cnt = 0;

    String pruner = null;
    float pruner_param = -1;

    while (cnt != cliques_all.size()) { // if not have gone thru all cascade stages

      float subTotal_cascadeCost = 0;

      if (cascadeStage < 1) { // only call once, then use keptDocs[][][]
        mMRF.removeAllCliques();

        for (Clique c : cliques_all) {
          Clique_cascade cc = (Clique_cascade) c;
          if (cascadeStage != cc.getCascadeStage()) {
            continue;
          }

          mMRF.addClique(c);
          cnt++;
          pruner = cc.getPruner();
          pruner_param = cc.getPruner_param();

          if (mDocSet == null) {
            int numDocs = Integer.MAX_VALUE;
            try {
              numDocs = cc.getNumberOfPostings();
            } catch (Exception ignored) {
              // getNumberOfPostings() is not supported for bigram postings readers; keep the
              // Integer.MAX_VALUE default in that case.
            }

            // (not) ignore cost of first stage from the cost model
            subTotal_cascadeCost += cc.mUnitCost * numDocs;
          } else {
            subTotal_cascadeCost += cc.mUnitCost;
          }
        }

        if (mDocSet != null) {

          // Lidan: mDocSet[] & accumulated_scores[] should be sorted by doc scores!
          // Lidan: this method operates on mDocSet[] & accumulated_scores[]!
          pruneDocuments(pruner, pruner_param);

          // Lidan: will score all documents in the retained document set
          mNumResults = mDocSet.length;

          sortDocumentsByDocnos();

          // Cost = cost of applying the feature on the retained documents after pruning
          subTotal_cascadeCost = subTotal_cascadeCost * mNumResults;

        } else {
          // Lidan: first cascade stage, just output a fixed number of documents
          mNumResults = numOutputs_firstStage;
        }

        // Create single pool of reusable accumulators.
        mAccumulators = new Accumulator_cascade[mNumResults + 1];
        for (int i = 0; i < mNumResults + 1; i++) {
          mAccumulators[i] = new Accumulator_cascade(0, 0.0f);
        }

        results = rank_cascade();

        cascadeStage++;
      } else {
        // The values below are overwritten by every clique of this stage, so the last one wins.
        // NOTE(review): this presumably relies on all cliques of a stage sharing the same
        // feature, scoring function and window size - confirm.
        String featureID = null;
        String scoringFunction = null;
        int mSize = -1;
        String[][] concepts_this_stage = new String[cliques_all.size()][];
        float[] clique_wgts = new float[concepts_this_stage.length];

        int cntConcepts = 0;

        for (Clique c : cliques_all) {
          Clique_cascade cc = (Clique_cascade) c;
          if (cascadeStage != cc.getCascadeStage()) {
            continue;
          }

          cnt++;
          pruner = cc.getPruner();
          pruner_param = cc.getPruner_param();

          featureID = cc.getParamID().trim(); // termWt, orderedWt, unorderedWt
          scoringFunction = cc.getScoringFunctionName(); // dirichlet, bm25

          mSize = cc.getWindowSize(); // window width
          if (mSize == -1 && !(featureID.equals("termWt"))) {
            System.out.println("Only term features don't support getWindowSize()! " + featureID);
            System.exit(-1);
          }
          concepts_this_stage[cntConcepts] = cc.getSingleTerms();
          clique_wgts[cntConcepts] = c.getWeight();

          cntConcepts++;
          subTotal_cascadeCost += cc.mUnitCost;
        }

        if (results.length == 0) {
          // Nothing survived the previous stage, so there is nothing to prune or score. Without
          // this guard results[0] below would throw ArrayIndexOutOfBoundsException. The stage
          // costs nothing because it is applied to zero documents.
          mSortedAccumulators.clear();
          meanScore = 0;
          stddev = 0;
          cascadeStage++;
          continue;
        }

        // for use in pruning

        // score-based
        float max_score = results[0].score;
        float min_score = results[results.length - 1].score;
        float score_threshold = (max_score - min_score) * pruner_param + min_score;

        float mean_max_score_threshold =
            pruner_param * max_score + (1.0f - pruner_param) * meanScore;

        // rank-based
        int retainSize = (int) ((1.0 - pruner_param) * ((double) (results.length)));

        // Number of documents that passed pruning in this stage.
        int size = 0;

        // Clear priority queue.
        mSortedAccumulators.clear();

        // Per-concept lookups resolved once, outside the per-document loop.
        float[] termCollectionFreqs = new float[cntConcepts];
        float[] termDFs = new float[cntConcepts];
        int[][] termIndexes = new int[cntConcepts][];

        float sumScore = 0;

        for (int j = 0; j < cntConcepts; j++) {

          String[] singleTerms = concepts_this_stage[j];

          // NOTE(review): this test uses indexOf while the scoring loop below uses equals; the
          // two only agree when featureID is exactly "termWt" or does not contain it.
          if (featureID.indexOf("termWt") != -1) {
            // Validate before indexing so a malformed concept is reported, not thrown.
            if (singleTerms.length != 1) {
              System.out.println("Should have length 1 " + singleTerms.length);
              System.exit(-1);
            }

            termCollectionFreqs[j] =
                Float.parseFloat((String) (term_to_termCollectionFrequency.get(singleTerms[0])));
            termDFs[j] = Float.parseFloat((String) (term_to_termDF.get(singleTerms[0])));

            termIndexes[j] = new int[1];
            termIndexes[j][0] =
                Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
          } else {
            if (singleTerms.length != 2) {
              System.out.println("Should have length 2 " + singleTerms.length);
              System.exit(-1);
            }

            termIndexes[j] = new int[2];
            termIndexes[j][0] =
                Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
            termIndexes[j][1] =
                Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[1])));
          }
        }

        // iterate over results documents, which are sorted in scores
        for (int i = 0; i < results.length; i++) {
          // pruning, if okay, scoring, update pruning stats for next cascade stage

          boolean aboveCutoff;
          if (pruner.equals("rank")) {
            aboveCutoff = i < retainSize;
          } else if (pruner.equals("score")) {
            aboveCutoff = results[i].score > score_threshold;
          } else if (pruner.equals("mean-max")) {
            aboveCutoff = results[i].score > mean_max_score_threshold;
          } else {
            // "z-score" and unknown pruners are not implemented: no document is retained.
            continue;
          }

          // Below the cutoff a document is still kept while fewer than mK documents have been
          // retained (unless mK is the default); otherwise pruning stops here.
          if (!aboveCutoff && !(size < mK && mK != defaultNumDocs)) {
            break;
          }

          size++;

          Accumulator_cascade doc = results[i];
          int docIndex = doc.index_into_keptDocs;
          int docLen = keptDocLengths[docIndex];

          float docScore_cascade = 0;

          for (int j = 0; j < cntConcepts; j++) {

            if (featureID.equals("termWt")) {

              int[] positions1 = keptDocs[docIndex][termIndexes[j][0]];

              // Term frequency is the number of cached positions (0 if the term is absent).
              int tf = 0;
              if (positions1 != null) {
                tf = positions1.length;
              }

              docScore_cascade +=
                  clique_wgts[j]
                      * getScore(tf, docLen, termCollectionFreqs[j], termDFs[j], scoringFunction);

            } else { // term proximity. Assume there are only two terms!!!

              int[] positions1 = keptDocs[docIndex][termIndexes[j][0]];
              int[] positions2 = keptDocs[docIndex][termIndexes[j][1]];

              int matches = 0;

              if (positions1 != null && positions2 != null) { // both query terms are in the doc
                matches = countWindowMatches(featureID, positions1, positions2, mSize);
              }

              float s =
                  getScore(
                      matches,
                      docLen,
                      RetrievalEnvironment.mDefaultCf,
                      (float) RetrievalEnvironment.mDefaultDf,
                      scoringFunction);
              docScore_cascade += clique_wgts[j] * s;
            }
          } // end for (each concept)

          // accumulate doc score across cascade stages
          doc.score += docScore_cascade;

          mSortedAccumulators.add(doc);

          sumScore += doc.score;
        } // end iterating over docs

        // order based on new scores in results[], put into priority queue

        if (size != mSortedAccumulators.size()) {
          System.out.println(
              "They should be equal right here " + size + " " + mSortedAccumulators.size());
          System.exit(-1);
        }

        results_tmp = new Accumulator_cascade[size];

        // update stats for use in pruning in next cascade stage; avoid 0/0 = NaN when nothing
        // passed pruning
        meanScore = size > 0 ? sumScore / (float) size : 0.0f;
        stddev = 0;

        for (int i = 0; i < results_tmp.length; i++) {
          Accumulator_cascade polled = mSortedAccumulators.poll();
          results_tmp[results_tmp.length - 1 - i] = polled;

          stddev += (polled.score - meanScore) * (polled.score - meanScore);
        }
        results = results_tmp;

        // NOTE(review): this is the square root of the SUM of squared deviations (not divided
        // by the count). Left as is; the only visible consumer would be the unimplemented
        // z-score pruner.
        stddev = (float) Math.sqrt(stddev);

        cascadeStage++;

        subTotal_cascadeCost = subTotal_cascadeCost * size;
      } // end if not first stage

      cascadeCost += subTotal_cascadeCost;
    } // end while

    Accumulator_cascade[] results_return = results;

    if (results.length > mK) {
      // Return fresh accumulators holding only docno and score for the first mK results.
      results_return = new Accumulator_cascade[mK];

      for (int i = 0; i < mK; i++) {
        results_return[i] = new Accumulator_cascade(results[i].docno, results[i].score);
      }
    }

    return results_return;
  }

  /**
   * Counts proximity matches between two terms of one document. The two sorted position lists
   * are merged into a single stream (ties go to the first term) and scanned with the ordered or
   * unordered window rule, depending on {@code featureID}. Assumes exactly two terms.
   *
   * @param featureID {@code "orderedWt"} or {@code "unorderedWt"}; anything else prints a
   *     diagnostic and terminates the JVM
   * @param positions1 positions of the first term in the document
   * @param positions2 positions of the second term in the document
   * @param windowSize maximum gap (ordered) or window width (unordered)
   * @return the number of matches found
   */
  private int countWindowMatches(
      String featureID, int[] positions1, int[] positions2, int windowSize) {
    int total = positions1.length + positions2.length;
    int[] positions = new int[total];
    int[] ids = new int[total]; // 0 = first term, 1 = second term

    int posA = 0;
    int posB = 0;
    for (int k = 0; k < total; k++) {
      if (posB == positions2.length
          || posA < positions1.length && positions1[posA] <= positions2[posB]) {
        positions[k] = positions1[posA];
        ids[k] = 0;
        posA++;
      } else {
        positions[k] = positions2[posB];
        ids[k] = 1;
        posB++;
      }
    }

    if (featureID.equals("orderedWt")) {
      return countOrderedWindowMatches(positions, ids, windowSize);
    }
    if (featureID.equals("unorderedWt")) {
      return countUnorderedWindowMatches(positions, ids, windowSize);
    }

    System.out.println("Invalid featureID " + featureID);
    System.exit(-1);
    return 0; // not reached; required by the compiler
  }

  /**
   * Counts start positions from which the other term follows in query order, with a gap from
   * the start position of at most {@code windowSize}.
   */
  private int countOrderedWindowMatches(int[] positions, int[] ids, int windowSize) {
    BitSet matchedIds = new BitSet(2); // Assume there are only two terms!!!
    int matches = 0;

    for (int start = 0; start < positions.length; start++) {
      matchedIds.clear();
      matchedIds.set(ids[start]);
      int matchedIDCounts = 1;
      int maxGap = 0;
      boolean ordered = true;
      int startID = ids[start];
      int startPos = positions[start];

      for (int next = start + 1; next < positions.length; next++) {
        int curID = ids[next];
        int curPos = positions[next];
        if (!matchedIds.get(curID)) {
          matchedIds.set(curID);
          matchedIDCounts++;
          if (curID < startID) {
            ordered = false;
          }
          if (curPos - startPos > maxGap) {
            maxGap = curPos - startPos;
          }
        }
        // stop looking if the maximum gap is too large
        // or the terms appear out of order
        if (maxGap > windowSize || !ordered) {
          break;
        }
        // did we match all the terms? (they are in order, or we would have stopped above)
        if (matchedIDCounts == 2) {
          matches++;
          break;
        }
      }
    }
    return matches;
  }

  /**
   * Counts start positions from which the other term occurs, in either order, inside a window
   * of at most {@code windowSize} positions (inclusive of both ends).
   */
  private int countUnorderedWindowMatches(int[] positions, int[] ids, int windowSize) {
    BitSet matchedIds = new BitSet(2); // Assume there are only two terms!!!
    int matches = 0;

    for (int start = 0; start < positions.length; start++) {
      matchedIds.clear();
      matchedIds.set(ids[start]);
      int matchedIDCounts = 1;
      int startPos = positions[start];

      for (int next = start + 1; next < positions.length; next++) {
        int curID = ids[next];
        int curWindow = positions[next] - startPos + 1;

        if (!matchedIds.get(curID)) {
          matchedIds.set(curID);
          matchedIDCounts++;
        }
        // stop looking if we've exceeded the maximum window size
        if (curWindow > windowSize) {
          break;
        }
        // did we match all the terms?
        if (matchedIDCounts == 2) {
          matches++;
          break;
        }
      }
    }
    return matches;
  }