/**
  * Merges three weighted term maps into a single list of term queries.
  *
  * <p>Boosts from {@code queryTerms} are added to, and boosts from {@code irrelevantDocsTerms} are
  * subtracted from, the matching entries of {@code relevantDocsTerms}. The merge happens in place:
  * {@code relevantDocsTerms} gains every key it was missing, and the {@link TermQuery} instances
  * held by the maps are modified directly (a term found only in {@code irrelevantDocsTerms} has
  * its own boost negated before being stored).
  *
  * @param queryTerms query terms keyed by term text; their boosts are added
  * @param relevantDocsTerms relevant-document terms keyed by term text; used as the accumulator
  *     and modified by this call
  * @param irrelevantDocsTerms irrelevant-document terms keyed by term text; their boosts are
  *     subtracted
  * @return a new list containing the values of {@code relevantDocsTerms} after the merge
  */
 public List<TermQuery> combine(
     Map<String, TermQuery> queryTerms,
     Map<String, TermQuery> relevantDocsTerms,
     Map<String, TermQuery> irrelevantDocsTerms) {
   // Positive contribution: fold the query terms into the accumulator.
   for (Map.Entry<String, TermQuery> entry : queryTerms.entrySet()) {
     String text = entry.getKey();
     TermQuery incoming = entry.getValue();
     if (!relevantDocsTerms.containsKey(text)) {
       relevantDocsTerms.put(text, incoming);
       continue;
     }
     TermQuery existing = relevantDocsTerms.get(text);
     existing.setBoost(existing.getBoost() + incoming.getBoost());
   }
   // Negative contribution: subtract the terms of the irrelevant documents.
   for (Map.Entry<String, TermQuery> entry : irrelevantDocsTerms.entrySet()) {
     String text = entry.getKey();
     TermQuery penalty = entry.getValue();
     if (relevantDocsTerms.containsKey(text)) {
       TermQuery existing = relevantDocsTerms.get(text);
       existing.setBoost(existing.getBoost() - penalty.getBoost());
     } else {
       // Unknown so far: store the irrelevant term itself with its sign flipped.
       penalty.setBoost(-penalty.getBoost());
       relevantDocsTerms.put(text, penalty);
     }
   }
   return new ArrayList<>(relevantDocsTerms.values());
 }
  /**
   * Builds an expanded query from the feedback documents and the original query.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Term boosts are computed for the relevant documents (using {@code beta}) and for the
   *       irrelevant documents (using {@code gamma}) through {@code setBoost}.
   *   <li>The two sets are combined (relevant minus irrelevant) and sorted with {@code
   *       QueryBoostComparator}.
   *   <li>At most {@code maxExpandedQueryTerms} terms from the front of the sorted list are kept;
   *       the text of each kept term is printed to standard output followed by {@code ", "}.
   *   <li>The query terms, boosted with {@code alpha}, are added to the kept terms, the result is
   *       sorted again and passed to {@code mergeQueries}.
   * </ol>
   *
   * @param query original query whose terms are weighted with {@code alpha}
   * @param currentField field name forwarded to {@code setBoost}
   * @param alpha factor applied to the query terms
   * @param beta factor applied to the terms of the relevant documents
   * @param gamma factor applied to the terms of the irrelevant documents
   * @param decay decay factor forwarded to {@code setBoost} for both document sets
   * @param maxExpandedQueryTerms maximum number of feedback terms kept before the query terms are
   *     added
   * @return the query produced by {@code mergeQueries} from the final sorted term list
   * @throws IOException if one of the {@code setBoost} calls fails
   * @throws ParseException declared by the signature; the originating call is not visible in this
   *     block
   */
  public Query adjust(
      Query query,
      String currentField,
      float alpha,
      float beta,
      float gamma,
      float decay,
      int maxExpandedQueryTerms)
      throws IOException, ParseException {
    // Weight the feedback documents: beta for the relevant set, gamma for the irrelevant set.
    Map<String, TermQuery> relevantDocsTerms =
        setBoost(docsTermVectorReldocs, currentField, beta, decay);
    Map<String, TermQuery> irrelevantDocsTerms =
        setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);

    // First pass: relevant minus irrelevant, without any query contribution.
    List<TermQuery> feedbackTerms =
        combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrelevantDocsTerms);
    Comparator byBoost = new QueryBoostComparator();
    Collections.sort(feedbackTerms, byBoost);

    // Keep only the leading terms of the sorted list, reusing the relevant-terms map as storage.
    relevantDocsTerms.clear();
    int kept = Math.min(feedbackTerms.size(), maxExpandedQueryTerms);
    int index = 0;
    while (index < kept) {
      TermQuery selected = feedbackTerms.get(index++);
      String text = selected.getTerm().text();
      relevantDocsTerms.put(text, selected);
      System.out.print(text + ", ");
    }

    // Second pass: add the alpha-weighted query terms to the retained feedback terms.
    Map<String, TermQuery> queryTerms = setBoost(new TermFreqVector(query), currentField, alpha);
    List<TermQuery> expandedTerms =
        combine(queryTerms, relevantDocsTerms, new HashMap<String, TermQuery>());
    Collections.sort(expandedTerms, byBoost);

    // Integer.MAX_VALUE: no further cap is applied when the final query is assembled.
    return mergeQueries(expandedTerms, Integer.MAX_VALUE);
  }
 /**
  * Builds the feedback term vector for {@code currentField} as a map from term text to boost.
  *
  * <p>Beta, gamma and decay are read from {@code parameters}. Only the relevant-document terms
  * contribute to the result: both other arguments handed to {@code combine} are empty maps.
  *
  * @param currentField field name forwarded to {@code setBoost}
  * @return term text mapped to the boost of the corresponding term query
  * @throws IOException if one of the {@code setBoost} calls fails
  */
 public Map<String, Float> getRocchioVector(String currentField) throws IOException {
   float beta = parameters.get(RocchioQueryExpansion.ROCCHIO_BETA_FLD);
   float gamma = parameters.get(RocchioQueryExpansion.ROCCHIO_GAMMA_FLD);
   float decay = parameters.get(RocchioQueryExpansion.DECAY_FLD);

   Map<String, TermQuery> relevantDocsTerms =
       setBoost(docsTermVectorReldocs, currentField, beta, decay);
   // NOTE(review): the result of this call is never handed to combine(), so gamma currently has
   // no effect on the returned vector. The call is retained so that its index lookups and any
   // IOException stay as they were - confirm whether the irrelevant terms should be subtracted.
   setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);

   Map<String, TermQuery> nothingToAdd = new HashMap<String, TermQuery>();
   Map<String, TermQuery> nothingToSubtract = new HashMap<String, TermQuery>();
   Map<String, Float> out = new HashMap<>();
   for (TermQuery termQuery : combine(nothingToAdd, relevantDocsTerms, nothingToSubtract)) {
     out.put(termQuery.getTerm().text(), termQuery.getBoost());
   }
   return out;
 }
  /**
   * Creates one boosted {@link TermQuery} per distinct term text found in the given term vectors.
   *
   * <p>For every term of every vector the boost contribution is {@code factor * tf} when exactly
   * one vector is supplied and {@code factor} otherwise, multiplied by {@code 1 / vecsTerms.size()}.
   * Contributions for the same term text are summed; contributions equal to zero are skipped.
   *
   * <p>NOTE(review): a tf-idf weight and a decay adjustment are also calculated for every term
   * (including index lookups through {@code ir}), but neither value feeds into the boost - confirm
   * whether that is intended.
   *
   * @param vecsTerms term vectors mapped to the field name used for index statistics when {@code
   *     sourceField} equals {@code PatentQuery.all}
   * @param currentField field of the {@link Term} wrapped by each created query
   * @param factor adjustment factor (e.g. alpha or beta)
   * @param decayFactor per-vector decay step; only used for the unused decayed weight
   * @return term queries keyed by term text
   * @throws java.io.IOException if an index lookup fails
   */
  public Map<String, TermQuery> setBoost(
      Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor)
      throws IOException {
    Map<String, TermQuery> boosted = new HashMap<>();
    // Every vector gets an equal share of the final boost.
    float norm = (float) 1 / vecsTerms.size();
    int docIndex = 0;
    for (Map.Entry<TermFreqVector, String> vecEntry : vecsTerms.entrySet()) {
      TermFreqVector docTerms = vecEntry.getKey();
      String field = vecEntry.getValue();
      // Decay grows linearly with the position of the vector in iteration order.
      float decay = decayFactor * docIndex;
      for (String termTxt : docTerms.getTerms()) {
        Term term = new Term(currentField, termTxt);
        float tf = docTerms.getFreq(termTxt);

        // Index statistics come from the vector's own field when sourceField is
        // PatentQuery.all, and from sourceField otherwise.
        String statsField = sourceField.equals(PatentQuery.all) ? field : sourceField;
        int docs = ir.getDocCount(statsField);
        float idf =
            (float) Math.log10((double) docs / (ir.docFreq(new Term(statsField, termTxt)) + 1));

        // Decayed tf-idf weight; calculated here but not used by the boost below.
        float weight = tf * idf;
        weight = weight - (weight * decay);

        float boost = (vecsTerms.size() == 1) ? factor * tf : factor;
        if (boost == 0) {
          continue;
        }

        TermQuery termQuery = new TermQuery(term);
        termQuery.setBoost(boost * norm);
        TermQuery existing = boosted.get(termTxt);
        if (existing == null) {
          boosted.put(termTxt, termQuery);
        } else {
          // Same term text seen before: accumulate onto the stored query.
          existing.setBoost(existing.getBoost() + termQuery.getBoost());
        }
      }
      docIndex++;
    }
    return boosted;
  }
Пример #5
0
  /**
   * Pops scored terms off the queue until {@code pop()} returns {@code null} and appends one
   * {@code SHOULD} term clause per popped entry to the given boolean query.
   *
   * <p>When the {@code boost} flag is set, each clause is boosted by {@code boostFactor * score /
   * bestScore}, where {@code bestScore} is taken from the first popped entry. Processing stops
   * early if the query rejects a clause with {@code TooManyClauses}.
   *
   * @param q queue of scored terms; entries are removed as they are processed
   * @param query boolean query that receives the term clauses
   */
  private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
    float bestScore = -1;
    for (ScoreTerm current = q.pop(); current != null; current = q.pop()) {
      TermQuery termQuery = new TermQuery(new Term(current.topField, current.word));

      if (boost) {
        float score = current.score;
        // -1 marks "no score seen yet"; the first popped score becomes the reference value.
        if (bestScore == -1) {
          bestScore = score;
        }
        termQuery.setBoost(boostFactor * score / bestScore);
      }

      try {
        query.add(termQuery, BooleanClause.Occur.SHOULD);
      } catch (BooleanQuery.TooManyClauses ignore) {
        // Clause limit reached: keep what has been added so far and stop.
        break;
      }
    }
  }