/**
 * Scores the current cascade stage document-at-a-time and returns the top accumulators.
 * The first stage must consist of term cliques so that term positions and document lengths
 * can be cached in keptDocs[][][] and keptDocLengths[] for use in later stages.
 */
public Accumulator_cascade[] rank_cascade() {
  // Points to the next position in the keptDocs array that hasn't been filled.
  int indexCntKeptDocs = 0;

  // Clear priority queue.
  mSortedAccumulators.clear();

  // Cliques associated with the MRF.
  List<Clique> cliques = mMRF.getCliques();
  if (cliques.size() == 0) {
    System.out.println("Shouldn't have size 0");
    System.exit(-1);
  }

  // Current accumulator.
  Accumulator_cascade a = mAccumulators[0];

  /*
  // Initialize the MRF.
  try {
    mMRF.initialize();
  } catch (ConfigurationException e) {
    sLogger.error("Error initializing MRF. Aborting ranking!");
    return null;
  }
  */

  // Maximum possible score that this MRF can achieve.
  float mrfMaxScore = 0.0f;
  for (Clique c : cliques) {
    if (!((((Clique_cascade) c).getParamID()).equals("termWt"))) {
      System.out.println(
          "In this faster cascade implementation, first stage must be term in order to get positions[] values! "
              + ((Clique_cascade) c).getParamID());
      System.exit(-1);
    }
    mrfMaxScore += c.getMaxScore();
  }

  // Sort cliques according to their max scores.
  Collections.sort(cliques, mMaxScoreComparator);

  // Score that must be achieved to enter the result set.
  double scoreThreshold = Double.NEGATIVE_INFINITY;

  // Offset into the document set we're currently at (if applicable).
  int docsetOffset = 0;

  int docno = 0;
  if (mDocSet != null) {
    docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
  } else {
    if (cascadeStage != 0) {
      System.out.println("Shouldn't happen. Cascade stage " + cascadeStage);
      System.exit(-1);
    }
    docno = mMRF.getNextCandidate();
  }

  boolean firstTime = true;
  long startTime = System.currentTimeMillis();

  while (docno < Integer.MAX_VALUE) {
    for (DocumentNode documentNode : mDocNodes) {
      documentNode.setDocno(docno);
    }

    // Document-at-a-time scoring.
    float docMaxScore = mrfMaxScore;
    boolean skipped = false;
    float score = 0.0f;

    // Lidan: accumulate document scores across the cascade stages.
    if (mDocSet != null && cascadeStage != 0) {
      score = accumulated_scores[docsetOffset - 1];
    }

    // For each query term, its positions in the document.
    int[][] termPositions = new int[cliques.size()][];
    int document_length = -1;

    for (int i = 0; i < cliques.size(); i++) {
      // Current clique that we're scoring.
      Clique c = cliques.get(i);

      if (firstTime) {
        term_to_cliqueNumber.put(c.getConcept().trim().toLowerCase(), i + "");
        term_to_termCollectionFrequency.put(c.getConcept().trim().toLowerCase(),
            ((Clique_cascade) c).termCollectionCF() + "");
        term_to_termDF.put(c.getConcept().trim().toLowerCase(),
            ((Clique_cascade) c).termCollectionDF() + "");
      }

      // If there's no way that this document can enter the result set, then exit.
      if (score + docMaxScore <= scoreThreshold) {
        // Advance postings readers (but don't score).
        for (int j = i; j < cliques.size(); j++) {
          cliques.get(j).setNextCandidate(docno + 1);
        }
        skipped = true;
        break;
      }

      // Document-independent cliques do not affect the ranking.
      if (!c.isDocDependent()) {
        continue;
      }

      // Update document score.
      float cliqueScore = c.getPotential();
      score += c.getWeight() * cliqueScore;

      // Update the max score for the rest of the cliques.
      docMaxScore -= c.getMaxScore();

      // Positional information needed for document evaluation in the next cascade stage.
      int[] p = ((Clique_cascade) c).getPositions();
      if (p != null) {
        termPositions[i] = Arrays.copyOf(p, p.length);
        document_length = ((Clique_cascade) c).getDocLen();
      }
    }
    firstTime = false;

    // Keep track of the mNumResults best accumulators.
    if (!skipped && score > scoreThreshold) {
      a.docno = docno;
      a.score = score;
      a.index_into_keptDocs = indexCntKeptDocs;
      keptDocLengths[indexCntKeptDocs] = document_length;
      mSortedAccumulators.add(a);

      // Save positional information for each query term in the document.
      for (int j = 0; j < termPositions.length; j++) {
        if (termPositions[j] != null) {
          keptDocs[indexCntKeptDocs][j] = Arrays.copyOf(termPositions[j], termPositions[j].length);
        }
      }

      if (mSortedAccumulators.size() == mNumResults + 1) {
        // Re-use the accumulator of the removed document.
        a = mSortedAccumulators.poll();

        // After the maximum number of documents has been put into the queue, each time a new
        // document is added an old document is ejected; use the spot freed by the ejected
        // document to store the new document's positional info in keptDocs.
        indexCntKeptDocs = a.index_into_keptDocs;
        keptDocs[indexCntKeptDocs] = new int[numQueryTerms][];

        scoreThreshold = mSortedAccumulators.peek().score;
      } else {
        // Next unused accumulator in the accumulator pool.
        a = mAccumulators[mSortedAccumulators.size()];
        indexCntKeptDocs++;
      }
    }

    if (mDocSet != null) {
      docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
    } else {
      if (cascadeStage != 0) {
        System.out.println("Shouldn't happen. Cascade stage " + cascadeStage);
        System.exit(-1);
      }
      docno = mMRF.getNextCandidate();
    }
  }

  // Grab the accumulators off the queue, in (reverse) order.
  Accumulator_cascade[] results_tmp =
      new Accumulator_cascade[Math.min(mNumResults, mSortedAccumulators.size())];
  for (int i = 0; i < results_tmp.length; i++) {
    results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();
    meanScore += results_tmp[results_tmp.length - 1 - i].score;
  }
  meanScore /= results_tmp.length;

  Accumulator_cascade[] results = results_tmp;

  /* Do the sorting in rank()
  // If there are more stages, should sort by docno.
  if (cnt != cliques_all.size()) {
    int[] order = new int[results_tmp.length];
    double[] docnos = new double[results_tmp.length];
    for (int i = 0; i < order.length; i++) {
      order[i] = i;
      docnos[i] = results_tmp[i].docno;
    }
    ivory.smrf.model.constrained.ConstraintModel.Quicksort(docnos, order, 0, results.length - 1);
    results = new Accumulator_cascade[results_tmp.length];
    for (int i = 0; i < order.length; i++) {
      results[i] = results_tmp[order[i]];
    }
  }
  */

  long endTime = System.currentTimeMillis();

  return results;
}
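/*
 * Illustrative sketch (not part of the original code above): rank_cascade() keeps
 * mNumResults + 1 reusable accumulators in a min-oriented priority queue and ejects the
 * weakest entry once the queue is full, so scoreThreshold always tracks the k-th best score
 * seen so far. The helper below shows the same bounded top-k pattern on plain {docno, score}
 * pairs; its name, signature, and use of java.util.PriorityQueue are assumptions made for
 * illustration only.
 */
private static java.util.PriorityQueue<float[]> topKSketch(float[][] scoredDocs, int k) {
  // Min-heap on score: the head is always the weakest retained document.
  java.util.PriorityQueue<float[]> heap = new java.util.PriorityQueue<float[]>(k + 1,
      new java.util.Comparator<float[]>() {
        public int compare(float[] x, float[] y) {
          return Float.compare(x[1], y[1]);
        }
      });
  float threshold = Float.NEGATIVE_INFINITY;
  for (float[] doc : scoredDocs) { // doc = {docno, score}
    if (doc[1] <= threshold) {
      continue; // cannot enter the current top k
    }
    heap.add(doc);
    if (heap.size() == k + 1) {
      heap.poll(); // eject the current minimum
      threshold = heap.peek()[1]; // new entry bar: the k-th best score so far
    }
  }
  return heap;
}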
/**
 * Runs the full cascade: the first stage scores postings document-at-a-time via
 * rank_cascade(), and each later stage re-scores and prunes the retained documents using the
 * positional information cached in keptDocs[][][].
 */
public Accumulator[] rank() {
  if (mSavedResults != null) {
    mDocSet = new int[mSavedResults.length];
    accumulated_scores = new float[mSavedResults.length];

    for (int i = 0; i < mSavedResults.length; i++) {
      mDocSet[i] = (int) mSavedResults[i][0];
      accumulated_scores[i] = mSavedResults[i][1];
    }

    keptDocs = new int[mDocSet.length + 1][numQueryTerms][];
    keptDocLengths = new int[mDocSet.length + 1];
  }

  // Initialize the MRF ==> this will clear out the postings readers cache!
  try {
    mMRF.initialize();
  } catch (ConfigurationException e) {
    sLogger.error("Error initializing MRF. Aborting ranking!");
    return null;
  }

  // Cliques associated with the MRF.
  cliques_all = new ArrayList();
  List<Clique> cliques = mMRF.getCliques();
  for (int i = 0; i < cliques.size(); i++) {
    cliques_all.add(cliques.get(i));
  }

  // Cascade stage starts at 0.
  cascadeStage = 0;
  cnt = 0;

  String pruner = null;
  float pruner_param = -1;

  long startTime = System.currentTimeMillis();

  int termMatches = 0;

  while (cnt != cliques_all.size()) { // while we have not gone through all cascade stages
    float subTotal_cascadeCost = 0;
    long endTime = System.currentTimeMillis();
    startTime = System.currentTimeMillis();

    if (cascadeStage < 1) { // only call once, then use keptDocs[][][]
      mMRF.removeAllCliques();

      for (Clique c : cliques_all) {
        int cs = ((Clique_cascade) c).getCascadeStage();
        if (cascadeStage == cs) {
          // c.resetPostingsListReader();
          mMRF.addClique(c);
          cnt++;
          // mNumResults = c.getNumResults();
          pruner = ((Clique_cascade) c).getPruner();
          pruner_param = ((Clique_cascade) c).getPruner_param();

          if (cascadeStage == 0) {
            int numDocs = Integer.MAX_VALUE;
            if (mDocSet == null) {
              try {
                // c.getNumberOfPostings() is not supported for bigram postings readers.
                numDocs = ((Clique_cascade) c).getNumberOfPostings();
              } catch (Exception e) {
              }
              // (not) ignore cost of first stage from the cost model
              subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost * numDocs;
            } else {
              subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
            }
          } else {
            subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
          }
        }
      }

      if (mDocSet != null) {
        // Lidan: mDocSet[] & accumulated_scores[] should be sorted by doc scores!
        // Lidan: this method operates on mDocSet[] & accumulated_scores[]!
        pruneDocuments(pruner, pruner_param);

        // Lidan: will score all documents in the retained document set.
        mNumResults = mDocSet.length;

        sortDocumentsByDocnos();

        // Cost = cost of applying the feature on the retained documents after pruning.
        subTotal_cascadeCost = subTotal_cascadeCost * mNumResults;
      } else {
        // Lidan: first cascade stage, just output 20000 documents.
        mNumResults = numOutputs_firstStage; // 20000;

        if (cascadeStage != 0) {
          System.out.println("Should be the first stage here!");
          System.exit(-1);
        }
      }

      // Create single pool of reusable accumulators.
      mAccumulators = new Accumulator_cascade[mNumResults + 1];
      for (int i = 0; i < mNumResults + 1; i++) {
        mAccumulators[i] = new Accumulator_cascade(0, 0.0f);
      }

      results = rank_cascade();

      cascadeStage++;
    } else {
      String featureID = null;
      String scoringFunction = null;
      int mSize = -1;

      String[][] concepts_this_stage = new String[cliques_all.size()][];
      float[] clique_wgts = new float[concepts_this_stage.length];
      int cntConcepts = 0;

      for (Clique c : cliques_all) {
        int cs = ((Clique_cascade) c).getCascadeStage();
        if (cascadeStage == cs) {
          cnt++;
          pruner = ((Clique_cascade) c).getPruner();
          pruner_param = ((Clique_cascade) c).getPruner_param();

          featureID = ((Clique_cascade) c).getParamID().trim(); // termWt, orderedWt, unorderedWt
          scoringFunction = ((Clique_cascade) c).getScoringFunctionName(); // dirichlet, bm25
          mSize = ((Clique_cascade) c).getWindowSize(); // window width
          if (mSize == -1 && !(featureID.equals("termWt"))) {
            System.out.println("Only term features don't support getWindowSize()! " + featureID);
            System.exit(-1);
          }

          concepts_this_stage[cntConcepts] = ((Clique_cascade) c).getSingleTerms();
          clique_wgts[cntConcepts] = c.getWeight();
          cntConcepts++;

          subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
        }
      }

      // Thresholds for use in pruning.
      // Score-based:
      float max_score = results[0].score;
      float min_score = results[results.length - 1].score;
      float score_threshold = (max_score - min_score) * pruner_param + min_score;
      float mean_max_score_threshold = pruner_param * max_score + (1.0f - pruner_param) * meanScore;

      // Rank-based:
      int retainSize = (int) ((1.0 - pruner_param) * ((double) (results.length)));
      int keepVal = results.length;

      int size = 0;

      // Clear priority queue.
      mSortedAccumulators.clear();

      float[] termCollectionFreqs = new float[cntConcepts];
      float[] termDFs = new float[cntConcepts];
      int[][] termIndexes = new int[cntConcepts][];

      float sumScore = 0;

      for (int j = 0; j < cntConcepts; j++) {
        // String c = concepts_this_stage[j];
        String[] singleTerms = concepts_this_stage[j]; // c.split("\\s+");

        int termIndex1 = Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));

        if (featureID.indexOf("termWt") != -1) {
          float termCollectionFreq =
              Float.parseFloat((String) (term_to_termCollectionFrequency.get(singleTerms[0])));
          termCollectionFreqs[j] = termCollectionFreq;

          float termDF = Float.parseFloat((String) (term_to_termDF.get(singleTerms[0])));
          termDFs[j] = termDF;

          termIndexes[j] = new int[1];
          termIndexes[j][0] = termIndex1;

          if (singleTerms.length != 1) {
            System.out.println("Should have length 1 " + singleTerms.length);
            System.exit(-1);
          }
        } else {
          int termIndex2 = Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[1])));

          termIndexes[j] = new int[2];
          termIndexes[j][0] = termIndex1;
          termIndexes[j][1] = termIndex2;

          if (singleTerms.length != 2) {
            System.out.println("Should have length 2 " + singleTerms.length);
            System.exit(-1);
          }
        }
      }

      startTime = System.currentTimeMillis();

      // Iterate over the result documents, which are sorted by score.
      for (int i = 0; i < results.length; i++) {
        // Pruning: if the document passes, score it and update pruning stats for the next
        // cascade stage.
        boolean passedPruning = false;
        if (pruner.equals("rank")) {
          if (i < retainSize) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("score")) {
          if (results[i].score > score_threshold) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("mean-max")) {
          if (results[i].score > mean_max_score_threshold) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("z-score")) {
          float z_score = (results[i].score - meanScore) / stddev;
        } else {
          // System.out.println("Not supported pruner! " + pruner);
        }

        if (passedPruning) {
          size++;

          int docIndex = results[i].index_into_keptDocs;
          int docLen = keptDocLengths[docIndex];

          float docScore_cascade = 0;

          for (int j = 0; j < cntConcepts; j++) {
            // String c = concepts_this_stage[j];
            String[] singleTerms = concepts_this_stage[j]; // c.split("\\s+");

            if (featureID.equals("termWt")) {
              int termIndex1 = termIndexes[j][0]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
              int[] positions1 = keptDocs[docIndex][termIndex1];

              int tf = 0;
              if (positions1 != null) {
                tf = positions1.length;
              }

              float termCollectionFreq = termCollectionFreqs[j]; // Float.parseFloat((String) (term_to_termCollectionFrequency.get(singleTerms[0])));
              float termDF = termDFs[j];

              docScore_cascade +=
                  clique_wgts[j] * getScore(tf, docLen, termCollectionFreq, termDF, scoringFunction);
            } else { // term proximity feature
              // Merge the two position lists into a single stream and compute matches.
              // Assume there are only two terms!
              int termIndex1 = termIndexes[j][0]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
              int termIndex2 = termIndexes[j][1]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[1])));

              int[] positions1 = keptDocs[docIndex][termIndex1];
              int[] positions2 = keptDocs[docIndex][termIndex2];

              int matches = 0;

              if (positions1 != null && positions2 != null) { // both query terms are in the doc
                termMatches++;

                int[] ids = new int[positions1.length];
                Arrays.fill(ids, 0);
                int length = positions1.length;

                int length2 = positions2.length;

                int[] newPositions = new int[length + length2];
                int[] newIds = new int[length + length2];

                int posA = 0;
                int posB = 0;

                int ii = 0;
                while (ii < length + length2) {
                  if (posB == length2 || posA < length && positions1[posA] <= positions2[posB]) {
                    newPositions[ii] = positions1[posA];
                    newIds[ii] = ids[posA];
                    posA++;
                  } else {
                    newPositions[ii] = positions2[posB];
                    newIds[ii] = 1;
                    posB++;
                  }
                  ii++;
                }

                int[] positions = newPositions;
                ids = newIds;

                BitSet mMatchedIds = new BitSet(2); // Assume there are only two terms!

                if (featureID.equals("orderedWt")) {
                  for (ii = 0; ii < positions.length; ii++) {
                    mMatchedIds.clear();
                    int maxGap = 0;
                    boolean ordered = true;
                    mMatchedIds.set(ids[ii]);
                    int matchedIDCounts = 1;
                    int lastMatchedID = ids[ii];
                    int lastMatchedPos = positions[ii];

                    for (int jj = ii + 1; jj < positions.length; jj++) {
                      int curID = ids[jj];
                      int curPos = positions[jj];

                      if (!mMatchedIds.get(curID)) {
                        mMatchedIds.set(curID);
                        matchedIDCounts++;
                        if (curID < lastMatchedID) {
                          ordered = false;
                        }
                        if (curPos - lastMatchedPos > maxGap) {
                          maxGap = curPos - lastMatchedPos;
                        }
                      }

                      // Stop looking if the maximum gap is too large
                      // or the terms appear out of order.
                      if (maxGap > mSize || !ordered) {
                        break;
                      }

                      // Did we match all the terms, and in order?
                      if (matchedIDCounts == 2 && ordered) {
                        matches++;
                        break;
                      }
                    }
                  }
                } else if (featureID.equals("unorderedWt")) {
                  for (ii = 0; ii < positions.length; ii++) {
                    mMatchedIds.clear();
                    mMatchedIds.set(ids[ii]);
                    int matchedIDCounts = 1;
                    int startPos = positions[ii];

                    for (int jj = ii + 1; jj < positions.length; jj++) {
                      int curID = ids[jj];
                      int curPos = positions[jj];
                      int windowSize = curPos - startPos + 1;

                      if (!mMatchedIds.get(curID)) {
                        mMatchedIds.set(curID);
                        matchedIDCounts++;
                      }

                      // Stop looking if we've exceeded the maximum window size.
                      if (windowSize > mSize) {
                        break;
                      }

                      // Did we match all the terms?
                      if (matchedIDCounts == 2) {
                        matches++;
                        break;
                      }
                    }
                  }
                } else {
                  System.out.println("Invalid featureID " + featureID);
                  System.exit(-1);
                }
              } // end if this is a match, i.e., both query terms are in the doc

              float s = getScore(matches, docLen, RetrievalEnvironment.mDefaultCf,
                  (float) RetrievalEnvironment.mDefaultDf, scoringFunction);
              docScore_cascade += clique_wgts[j] * s;
            } // end else it's a proximity feature
          } // end for (each concept)

          // Accumulate the document score in results[i] across cascade stages.
          results[i].score += docScore_cascade;
          mSortedAccumulators.add(results[i]);
          sumScore += results[i].score;
        } // end if passed pruning
      } // end iterating over docs

      endTime = System.currentTimeMillis();

      // Order based on the new scores in results[], put into the priority queue.
      if (size != mSortedAccumulators.size()) {
        System.out.println(
            "They should be equal right here " + size + " " + mSortedAccumulators.size());
        System.exit(-1);
      }

      results_tmp = new Accumulator_cascade[size];

      // meanScore = 0;
      // Update stats for use in pruning in the next cascade stage.
      meanScore = sumScore / (float) size;
      stddev = 0;

      for (int i = 0; i < results_tmp.length; i++) {
        results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();
        // meanScore += results_tmp[results_tmp.length - 1 - i].score; // Lidan: before it was
        // like this, when not doing z-score
        stddev += (results_tmp[results_tmp.length - 1 - i].score - meanScore)
            * (results_tmp[results_tmp.length - 1 - i].score - meanScore);
      }
      results = results_tmp;

      stddev = (float) Math.sqrt(stddev);

      // Create single pool of reusable accumulators.
      // Use mNumResults from the previous iteration, since we don't know how many docs are kept
      // until we're done iterating through the documents.
      // int retainSize = 0;
      // mAccumulators = new Accumulator[mNumResults + 1];
      // for (int i = 0; i < mNumResults + 1; i++) {
      //   mAccumulators[i] = new Accumulator(0, 0.0f);
      // }

      cascadeStage++;

      subTotal_cascadeCost = subTotal_cascadeCost * size;
    } // end if not first stage

    cascadeCost += subTotal_cascadeCost;
  } // end while

  long endTime = System.currentTimeMillis();

  Accumulator_cascade[] results_return = results;

  if (results.length > mK) { // RetrievalEnvironment.mCascade_K
    results_return = new Accumulator_cascade[mK]; // RetrievalEnvironment.mCascade_K
    for (int i = 0; i < mK; i++) { // RetrievalEnvironment.mCascade_K
      results_return[i] = new Accumulator_cascade(results[i].docno, results[i].score);
      // results_return[i].docno = results[i].docno;
      // results_return[i].score = results[i].score;
    }
  }

  return results_return;
}
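/*
 * Illustrative sketch (not part of the original rank() method): between cascade stages the
 * pruners above shrink the result set. Given scores sorted in descending order, the "rank"
 * pruner keeps the top (1 - alpha) fraction, the "score" pruner keeps documents scoring above
 * min + alpha * (max - min), and the "mean-max" pruner keeps documents scoring above
 * alpha * max + (1 - alpha) * mean. The helper name and signature below are assumptions, and
 * it omits the mK safeguard that rank() applies when too few documents survive pruning.
 */
private static int retainedCountSketch(float[] sortedScores, String pruner, float alpha,
    float mean) {
  float max = sortedScores[0];
  float min = sortedScores[sortedScores.length - 1];
  int keep = 0;
  for (int i = 0; i < sortedScores.length; i++) {
    boolean pass;
    if ("rank".equals(pruner)) {
      pass = i < (int) ((1.0 - alpha) * sortedScores.length);
    } else if ("score".equals(pruner)) {
      pass = sortedScores[i] > (max - min) * alpha + min;
    } else { // "mean-max"
      pass = sortedScores[i] > alpha * max + (1.0f - alpha) * mean;
    }
    if (!pass) {
      break; // scores are sorted, so no later document can pass either
    }
    keep++;
  }
  return keep;
}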