/**
 * Scores the current cascade stage document-at-a-time and returns the top accumulators.
 * The first stage must consist of term cliques so that term positions and document lengths
 * can be cached in keptDocs[][][] and keptDocLengths[] for use in later stages.
 */
public Accumulator_cascade[] rank_cascade() {
  // Points to the next position in the keptDocs array that hasn't been filled.
  int indexCntKeptDocs = 0;

  // Clear priority queue.
  mSortedAccumulators.clear();

  // Cliques associated with the MRF.
  List<Clique> cliques = mMRF.getCliques();
  if (cliques.size() == 0) {
    System.out.println("Shouldn't have size 0");
    System.exit(-1);
  }

  // Current accumulator.
  Accumulator_cascade a = mAccumulators[0];

  /*
  // Initialize the MRF.
  try {
    mMRF.initialize();
  } catch (ConfigurationException e) {
    sLogger.error("Error initializing MRF. Aborting ranking!");
    return null;
  }
  */

  // Maximum possible score that this MRF can achieve.
  float mrfMaxScore = 0.0f;
  for (Clique c : cliques) {
    if (!((((Clique_cascade) c).getParamID()).equals("termWt"))) {
      System.out.println(
          "In this faster cascade implementation, first stage must be term in order to get positions[] values! "
              + ((Clique_cascade) c).getParamID());
      System.exit(-1);
    }
    mrfMaxScore += c.getMaxScore();
  }

  // Sort cliques according to their max scores.
  Collections.sort(cliques, mMaxScoreComparator);

  // Score that must be achieved to enter the result set.
  double scoreThreshold = Double.NEGATIVE_INFINITY;

  // Offset into the document set we're currently at (if applicable).
  int docsetOffset = 0;

  int docno = 0;
  if (mDocSet != null) {
    docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
  } else {
    if (cascadeStage != 0) {
      System.out.println("Shouldn't happen. Cascade stage " + cascadeStage);
      System.exit(-1);
    }
    docno = mMRF.getNextCandidate();
  }

  boolean firstTime = true;
  long startTime = System.currentTimeMillis();

  while (docno < Integer.MAX_VALUE) {
    for (DocumentNode documentNode : mDocNodes) {
      documentNode.setDocno(docno);
    }

    // Document-at-a-time scoring.
    float docMaxScore = mrfMaxScore;
    boolean skipped = false;
    float score = 0.0f;

    // Lidan: accumulate document scores across the cascade stages.
    if (mDocSet != null && cascadeStage != 0) {
      score = accumulated_scores[docsetOffset - 1];
    }

    // For each query term, its positions in the document.
    int[][] termPositions = new int[cliques.size()][];
    int document_length = -1;

    for (int i = 0; i < cliques.size(); i++) {
      // Current clique that we're scoring.
      Clique c = cliques.get(i);

      if (firstTime) {
        term_to_cliqueNumber.put(c.getConcept().trim().toLowerCase(), i + "");
        term_to_termCollectionFrequency.put(c.getConcept().trim().toLowerCase(),
            ((Clique_cascade) c).termCollectionCF() + "");
        term_to_termDF.put(c.getConcept().trim().toLowerCase(),
            ((Clique_cascade) c).termCollectionDF() + "");
      }

      // If there's no way that this document can enter the result set, then exit.
      if (score + docMaxScore <= scoreThreshold) {
        // Advance postings readers (but don't score).
        for (int j = i; j < cliques.size(); j++) {
          cliques.get(j).setNextCandidate(docno + 1);
        }
        skipped = true;
        break;
      }

      // Document-independent cliques do not affect the ranking.
      if (!c.isDocDependent()) {
        continue;
      }

      // Update document score.
      float cliqueScore = c.getPotential();
      score += c.getWeight() * cliqueScore;

      // Update the max score for the rest of the cliques.
      docMaxScore -= c.getMaxScore();

      // Positional information needed for document evaluation in the next cascade stage.
      int[] p = ((Clique_cascade) c).getPositions();
      if (p != null) {
        termPositions[i] = Arrays.copyOf(p, p.length);
        document_length = ((Clique_cascade) c).getDocLen();
      }
    }
    firstTime = false;

    // Keep track of the mNumResults best accumulators.
    if (!skipped && score > scoreThreshold) {
      a.docno = docno;
      a.score = score;
      a.index_into_keptDocs = indexCntKeptDocs;
      keptDocLengths[indexCntKeptDocs] = document_length;
      mSortedAccumulators.add(a);

      // Save positional information for each query term in the document.
      for (int j = 0; j < termPositions.length; j++) {
        if (termPositions[j] != null) {
          keptDocs[indexCntKeptDocs][j] = Arrays.copyOf(termPositions[j], termPositions[j].length);
        }
      }

      if (mSortedAccumulators.size() == mNumResults + 1) {
        // Re-use the accumulator of the removed document.
        a = mSortedAccumulators.poll();

        // After the maximum number of documents has been put into the queue, each time a new
        // document is added an old document is ejected; use the spot freed by the ejected
        // document to store the new document's positional info in keptDocs.
        indexCntKeptDocs = a.index_into_keptDocs;
        keptDocs[indexCntKeptDocs] = new int[numQueryTerms][];

        scoreThreshold = mSortedAccumulators.peek().score;
      } else {
        // Next unused accumulator in the accumulator pool.
        a = mAccumulators[mSortedAccumulators.size()];
        indexCntKeptDocs++;
      }
    }

    if (mDocSet != null) {
      docno = docsetOffset < mDocSet.length ? mDocSet[docsetOffset++] : Integer.MAX_VALUE;
    } else {
      if (cascadeStage != 0) {
        System.out.println("Shouldn't happen. Cascade stage " + cascadeStage);
        System.exit(-1);
      }
      docno = mMRF.getNextCandidate();
    }
  }

  // Grab the accumulators off the queue, in (reverse) order.
  Accumulator_cascade[] results_tmp =
      new Accumulator_cascade[Math.min(mNumResults, mSortedAccumulators.size())];
  for (int i = 0; i < results_tmp.length; i++) {
    results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();
    meanScore += results_tmp[results_tmp.length - 1 - i].score;
  }
  meanScore /= results_tmp.length;

  Accumulator_cascade[] results = results_tmp;

  /* Do the sorting in rank()
  // If there are more stages, should sort by docno.
  if (cnt != cliques_all.size()) {
    int[] order = new int[results_tmp.length];
    double[] docnos = new double[results_tmp.length];
    for (int i = 0; i < order.length; i++) {
      order[i] = i;
      docnos[i] = results_tmp[i].docno;
    }
    ivory.smrf.model.constrained.ConstraintModel.Quicksort(docnos, order, 0, results.length - 1);
    results = new Accumulator_cascade[results_tmp.length];
    for (int i = 0; i < order.length; i++) {
      results[i] = results_tmp[order[i]];
    }
  }
  */

  long endTime = System.currentTimeMillis();

  return results;
}
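/*
 * Illustrative sketch (not part of the original code above): rank_cascade() keeps
 * mNumResults + 1 reusable accumulators in a min-oriented priority queue and ejects the
 * weakest entry once the queue is full, so scoreThreshold always tracks the k-th best score
 * seen so far. The helper below shows the same bounded top-k pattern on plain {docno, score}
 * pairs; its name, signature, and use of java.util.PriorityQueue are assumptions made for
 * illustration only.
 */
private static java.util.PriorityQueue<float[]> topKSketch(float[][] scoredDocs, int k) {
  // Min-heap on score: the head is always the weakest retained document.
  java.util.PriorityQueue<float[]> heap = new java.util.PriorityQueue<float[]>(k + 1,
      new java.util.Comparator<float[]>() {
        public int compare(float[] x, float[] y) {
          return Float.compare(x[1], y[1]);
        }
      });
  float threshold = Float.NEGATIVE_INFINITY;
  for (float[] doc : scoredDocs) { // doc = {docno, score}
    if (doc[1] <= threshold) {
      continue; // cannot enter the current top k
    }
    heap.add(doc);
    if (heap.size() == k + 1) {
      heap.poll(); // eject the current minimum
      threshold = heap.peek()[1]; // new entry bar: the k-th best score so far
    }
  }
  return heap;
}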
/**
 * Runs the full cascade: the first stage scores postings document-at-a-time via
 * rank_cascade(), and each later stage re-scores and prunes the retained documents using the
 * positional information cached in keptDocs[][][].
 */
public Accumulator[] rank() {
  if (mSavedResults != null) {
    mDocSet = new int[mSavedResults.length];
    accumulated_scores = new float[mSavedResults.length];

    for (int i = 0; i < mSavedResults.length; i++) {
      mDocSet[i] = (int) mSavedResults[i][0];
      accumulated_scores[i] = mSavedResults[i][1];
    }

    keptDocs = new int[mDocSet.length + 1][numQueryTerms][];
    keptDocLengths = new int[mDocSet.length + 1];
  }

  // Initialize the MRF ==> this will clear out the postings readers cache!
  try {
    mMRF.initialize();
  } catch (ConfigurationException e) {
    sLogger.error("Error initializing MRF. Aborting ranking!");
    return null;
  }

  // Cliques associated with the MRF.
  cliques_all = new ArrayList();
  List<Clique> cliques = mMRF.getCliques();
  for (int i = 0; i < cliques.size(); i++) {
    cliques_all.add(cliques.get(i));
  }

  // Cascade stage starts at 0.
  cascadeStage = 0;
  cnt = 0;

  String pruner = null;
  float pruner_param = -1;

  long startTime = System.currentTimeMillis();

  int termMatches = 0;

  while (cnt != cliques_all.size()) { // while we have not gone through all cascade stages
    float subTotal_cascadeCost = 0;
    long endTime = System.currentTimeMillis();
    startTime = System.currentTimeMillis();

    if (cascadeStage < 1) { // only call once, then use keptDocs[][][]
      mMRF.removeAllCliques();

      for (Clique c : cliques_all) {
        int cs = ((Clique_cascade) c).getCascadeStage();
        if (cascadeStage == cs) {
          // c.resetPostingsListReader();
          mMRF.addClique(c);
          cnt++;
          // mNumResults = c.getNumResults();
          pruner = ((Clique_cascade) c).getPruner();
          pruner_param = ((Clique_cascade) c).getPruner_param();

          if (cascadeStage == 0) {
            int numDocs = Integer.MAX_VALUE;
            if (mDocSet == null) {
              try {
                // c.getNumberOfPostings() is not supported for bigram postings readers.
                numDocs = ((Clique_cascade) c).getNumberOfPostings();
              } catch (Exception e) {
              }
              // (not) ignore cost of first stage from the cost model
              subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost * numDocs;
            } else {
              subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
            }
          } else {
            subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
          }
        }
      }

      if (mDocSet != null) {
        // Lidan: mDocSet[] & accumulated_scores[] should be sorted by doc scores!
        // Lidan: this method operates on mDocSet[] & accumulated_scores[]!
        pruneDocuments(pruner, pruner_param);

        // Lidan: will score all documents in the retained document set.
        mNumResults = mDocSet.length;

        sortDocumentsByDocnos();

        // Cost = cost of applying the feature on the retained documents after pruning.
        subTotal_cascadeCost = subTotal_cascadeCost * mNumResults;
      } else {
        // Lidan: first cascade stage, just output 20000 documents.
        mNumResults = numOutputs_firstStage; // 20000;

        if (cascadeStage != 0) {
          System.out.println("Should be the first stage here!");
          System.exit(-1);
        }
      }

      // Create single pool of reusable accumulators.
      mAccumulators = new Accumulator_cascade[mNumResults + 1];
      for (int i = 0; i < mNumResults + 1; i++) {
        mAccumulators[i] = new Accumulator_cascade(0, 0.0f);
      }

      results = rank_cascade();

      cascadeStage++;
    } else {
      String featureID = null;
      String scoringFunction = null;
      int mSize = -1;

      String[][] concepts_this_stage = new String[cliques_all.size()][];
      float[] clique_wgts = new float[concepts_this_stage.length];
      int cntConcepts = 0;

      for (Clique c : cliques_all) {
        int cs = ((Clique_cascade) c).getCascadeStage();
        if (cascadeStage == cs) {
          cnt++;
          pruner = ((Clique_cascade) c).getPruner();
          pruner_param = ((Clique_cascade) c).getPruner_param();

          featureID = ((Clique_cascade) c).getParamID().trim(); // termWt, orderedWt, unorderedWt
          scoringFunction = ((Clique_cascade) c).getScoringFunctionName(); // dirichlet, bm25
          mSize = ((Clique_cascade) c).getWindowSize(); // window width
          if (mSize == -1 && !(featureID.equals("termWt"))) {
            System.out.println("Only term features don't support getWindowSize()! " + featureID);
            System.exit(-1);
          }

          concepts_this_stage[cntConcepts] = ((Clique_cascade) c).getSingleTerms();
          clique_wgts[cntConcepts] = c.getWeight();
          cntConcepts++;

          subTotal_cascadeCost += ((Clique_cascade) c).mUnitCost;
        }
      }

      // Thresholds for use in pruning.
      // Score-based:
      float max_score = results[0].score;
      float min_score = results[results.length - 1].score;
      float score_threshold = (max_score - min_score) * pruner_param + min_score;
      float mean_max_score_threshold = pruner_param * max_score + (1.0f - pruner_param) * meanScore;

      // Rank-based:
      int retainSize = (int) ((1.0 - pruner_param) * ((double) (results.length)));
      int keepVal = results.length;

      int size = 0;

      // Clear priority queue.
      mSortedAccumulators.clear();

      float[] termCollectionFreqs = new float[cntConcepts];
      float[] termDFs = new float[cntConcepts];
      int[][] termIndexes = new int[cntConcepts][];

      float sumScore = 0;

      for (int j = 0; j < cntConcepts; j++) {
        // String c = concepts_this_stage[j];
        String[] singleTerms = concepts_this_stage[j]; // c.split("\\s+");

        int termIndex1 = Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));

        if (featureID.indexOf("termWt") != -1) {
          float termCollectionFreq =
              Float.parseFloat((String) (term_to_termCollectionFrequency.get(singleTerms[0])));
          termCollectionFreqs[j] = termCollectionFreq;

          float termDF = Float.parseFloat((String) (term_to_termDF.get(singleTerms[0])));
          termDFs[j] = termDF;

          termIndexes[j] = new int[1];
          termIndexes[j][0] = termIndex1;

          if (singleTerms.length != 1) {
            System.out.println("Should have length 1 " + singleTerms.length);
            System.exit(-1);
          }
        } else {
          int termIndex2 = Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[1])));

          termIndexes[j] = new int[2];
          termIndexes[j][0] = termIndex1;
          termIndexes[j][1] = termIndex2;

          if (singleTerms.length != 2) {
            System.out.println("Should have length 2 " + singleTerms.length);
            System.exit(-1);
          }
        }
      }

      startTime = System.currentTimeMillis();

      // Iterate over the result documents, which are sorted by score.
      for (int i = 0; i < results.length; i++) {
        // Pruning: if the document passes, score it and update pruning stats for the next
        // cascade stage.
        boolean passedPruning = false;
        if (pruner.equals("rank")) {
          if (i < retainSize) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("score")) {
          if (results[i].score > score_threshold) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("mean-max")) {
          if (results[i].score > mean_max_score_threshold) {
            passedPruning = true;
          } else {
            if (size < mK && mK != defaultNumDocs) {
              passedPruning = true;
            } else {
              break;
            }
          }
        } else if (pruner.equals("z-score")) {
          float z_score = (results[i].score - meanScore) / stddev;
        } else {
          // System.out.println("Not supported pruner! " + pruner);
        }

        if (passedPruning) {
          size++;

          int docIndex = results[i].index_into_keptDocs;
          int docLen = keptDocLengths[docIndex];

          float docScore_cascade = 0;

          for (int j = 0; j < cntConcepts; j++) {
            // String c = concepts_this_stage[j];
            String[] singleTerms = concepts_this_stage[j]; // c.split("\\s+");

            if (featureID.equals("termWt")) {
              int termIndex1 = termIndexes[j][0]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
              int[] positions1 = keptDocs[docIndex][termIndex1];

              int tf = 0;
              if (positions1 != null) {
                tf = positions1.length;
              }

              float termCollectionFreq = termCollectionFreqs[j]; // Float.parseFloat((String) (term_to_termCollectionFrequency.get(singleTerms[0])));
              float termDF = termDFs[j];

              docScore_cascade +=
                  clique_wgts[j] * getScore(tf, docLen, termCollectionFreq, termDF, scoringFunction);
            } else { // term proximity feature
              // Merge the two position lists into a single stream and compute matches.
              // Assume there are only two terms!
              int termIndex1 = termIndexes[j][0]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[0])));
              int termIndex2 = termIndexes[j][1]; // Integer.parseInt((String) (term_to_cliqueNumber.get(singleTerms[1])));

              int[] positions1 = keptDocs[docIndex][termIndex1];
              int[] positions2 = keptDocs[docIndex][termIndex2];

              int matches = 0;

              if (positions1 != null && positions2 != null) { // both query terms are in the doc
                termMatches++;

                int[] ids = new int[positions1.length];
                Arrays.fill(ids, 0);
                int length = positions1.length;

                int length2 = positions2.length;

                int[] newPositions = new int[length + length2];
                int[] newIds = new int[length + length2];

                int posA = 0;
                int posB = 0;

                int ii = 0;
                while (ii < length + length2) {
                  if (posB == length2 || posA < length && positions1[posA] <= positions2[posB]) {
                    newPositions[ii] = positions1[posA];
                    newIds[ii] = ids[posA];
                    posA++;
                  } else {
                    newPositions[ii] = positions2[posB];
                    newIds[ii] = 1;
                    posB++;
                  }
                  ii++;
                }

                int[] positions = newPositions;
                ids = newIds;

                BitSet mMatchedIds = new BitSet(2); // Assume there are only two terms!

                if (featureID.equals("orderedWt")) {
                  for (ii = 0; ii < positions.length; ii++) {
                    mMatchedIds.clear();
                    int maxGap = 0;
                    boolean ordered = true;
                    mMatchedIds.set(ids[ii]);
                    int matchedIDCounts = 1;
                    int lastMatchedID = ids[ii];
                    int lastMatchedPos = positions[ii];

                    for (int jj = ii + 1; jj < positions.length; jj++) {
                      int curID = ids[jj];
                      int curPos = positions[jj];

                      if (!mMatchedIds.get(curID)) {
                        mMatchedIds.set(curID);
                        matchedIDCounts++;
                        if (curID < lastMatchedID) {
                          ordered = false;
                        }
                        if (curPos - lastMatchedPos > maxGap) {
                          maxGap = curPos - lastMatchedPos;
                        }
                      }

                      // Stop looking if the maximum gap is too large
                      // or the terms appear out of order.
                      if (maxGap > mSize || !ordered) {
                        break;
                      }

                      // Did we match all the terms, and in order?
                      if (matchedIDCounts == 2 && ordered) {
                        matches++;
                        break;
                      }
                    }
                  }
                } else if (featureID.equals("unorderedWt")) {
                  for (ii = 0; ii < positions.length; ii++) {
                    mMatchedIds.clear();
                    mMatchedIds.set(ids[ii]);
                    int matchedIDCounts = 1;
                    int startPos = positions[ii];

                    for (int jj = ii + 1; jj < positions.length; jj++) {
                      int curID = ids[jj];
                      int curPos = positions[jj];
                      int windowSize = curPos - startPos + 1;

                      if (!mMatchedIds.get(curID)) {
                        mMatchedIds.set(curID);
                        matchedIDCounts++;
                      }

                      // Stop looking if we've exceeded the maximum window size.
                      if (windowSize > mSize) {
                        break;
                      }

                      // Did we match all the terms?
                      if (matchedIDCounts == 2) {
                        matches++;
                        break;
                      }
                    }
                  }
                } else {
                  System.out.println("Invalid featureID " + featureID);
                  System.exit(-1);
                }
              } // end if this is a match, i.e., both query terms are in the doc

              float s = getScore(matches, docLen, RetrievalEnvironment.mDefaultCf,
                  (float) RetrievalEnvironment.mDefaultDf, scoringFunction);
              docScore_cascade += clique_wgts[j] * s;
            } // end else it's a proximity feature
          } // end for (each concept)

          // Accumulate the document score in results[i] across cascade stages.
          results[i].score += docScore_cascade;
          mSortedAccumulators.add(results[i]);
          sumScore += results[i].score;
        } // end if passed pruning
      } // end iterating over docs

      endTime = System.currentTimeMillis();

      // Order based on the new scores in results[], put into the priority queue.
      if (size != mSortedAccumulators.size()) {
        System.out.println(
            "They should be equal right here " + size + " " + mSortedAccumulators.size());
        System.exit(-1);
      }

      results_tmp = new Accumulator_cascade[size];

      // meanScore = 0;
      // Update stats for use in pruning in the next cascade stage.
      meanScore = sumScore / (float) size;
      stddev = 0;

      for (int i = 0; i < results_tmp.length; i++) {
        results_tmp[results_tmp.length - 1 - i] = mSortedAccumulators.poll();
        // meanScore += results_tmp[results_tmp.length - 1 - i].score; // Lidan: before it was
        // like this, when not doing z-score
        stddev += (results_tmp[results_tmp.length - 1 - i].score - meanScore)
            * (results_tmp[results_tmp.length - 1 - i].score - meanScore);
      }
      results = results_tmp;

      stddev = (float) Math.sqrt(stddev);

      // Create single pool of reusable accumulators.
      // Use mNumResults from the previous iteration, since we don't know how many docs are kept
      // until we're done iterating through the documents.
      // int retainSize = 0;
      // mAccumulators = new Accumulator[mNumResults + 1];
      // for (int i = 0; i < mNumResults + 1; i++) {
      //   mAccumulators[i] = new Accumulator(0, 0.0f);
      // }

      cascadeStage++;

      subTotal_cascadeCost = subTotal_cascadeCost * size;
    } // end if not first stage

    cascadeCost += subTotal_cascadeCost;
  } // end while

  long endTime = System.currentTimeMillis();

  Accumulator_cascade[] results_return = results;

  if (results.length > mK) { // RetrievalEnvironment.mCascade_K
    results_return = new Accumulator_cascade[mK]; // RetrievalEnvironment.mCascade_K
    for (int i = 0; i < mK; i++) { // RetrievalEnvironment.mCascade_K
      results_return[i] = new Accumulator_cascade(results[i].docno, results[i].score);
      // results_return[i].docno = results[i].docno;
      // results_return[i].score = results[i].score;
    }
  }

  return results_return;
}
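/*
 * Illustrative sketch (not part of the original rank() method): between cascade stages the
 * pruners above shrink the result set. Given scores sorted in descending order, the "rank"
 * pruner keeps the top (1 - alpha) fraction, the "score" pruner keeps documents scoring above
 * min + alpha * (max - min), and the "mean-max" pruner keeps documents scoring above
 * alpha * max + (1 - alpha) * mean. The helper name and signature below are assumptions, and
 * it omits the mK safeguard that rank() applies when too few documents survive pruning.
 */
private static int retainedCountSketch(float[] sortedScores, String pruner, float alpha,
    float mean) {
  float max = sortedScores[0];
  float min = sortedScores[sortedScores.length - 1];
  int keep = 0;
  for (int i = 0; i < sortedScores.length; i++) {
    boolean pass;
    if ("rank".equals(pruner)) {
      pass = i < (int) ((1.0 - alpha) * sortedScores.length);
    } else if ("score".equals(pruner)) {
      pass = sortedScores[i] > (max - min) * alpha + min;
    } else { // "mean-max"
      pass = sortedScores[i] > alpha * max + (1.0f - alpha) * mean;
    }
    if (!pass) {
      break; // scores are sorted, so no later document can pass either
    }
    keep++;
  }
  return keep;
}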