/**
   * Sets the boost of each termClaimsDescriptionAbstractTitle: boost = weight = factor * (tf * idf).
   *
   * @param vecsTerms term-frequency vectors of the documents, mapped to their source field
   * @param currentField field the boosted TermQueries are created against
   * @param factor adjustment factor (e.g. alpha or beta)
   * @param decayFactor decay applied per document; the applied decay grows with document position
   * @return map from term text to its boosted TermQuery
   * @throws java.io.IOException if the index cannot be read
   */
  public Map<String, TermQuery> setBoost(
      Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor)
      throws IOException {
    Map<String, TermQuery> terms = new HashMap<>();
    // setBoost for each of the terms of each of the docs
    int i = 0;
    float norm = (float) 1 / vecsTerms.size();
    //        System.out.println("--------------------------");
    for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) {
      // Increase decay
      String field = e.getValue();
      TermFreqVector docTerms = e.getKey();
      float decay = decayFactor * i;
        // Populate terms with TermQueries and set their boost
      for (String termTxt : docTerms.getTerms()) {
        // Create Term
        Term term = new Term(currentField, termTxt);
        // Calculate weight
        float tf = docTerms.getFreq(termTxt);
        //                float idf = ir.docFreq(termTitle);
        int docs;
        float idf;
        if (sourceField.equals(PatentQuery.all)) {
          docs = ir.getDocCount(field);
          idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1));
        } else {
          docs = ir.getDocCount(sourceField);
          idf =
              (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1));
        }
        float weight = tf * idf;

        //                System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + "
        // tfidf= " + weight);
        // Adjust weight by decay factor
        weight = weight - (weight * decay);
        // Create TermQuery and add it to the collection
        TermQuery termQuery = new TermQuery(term);
        // Calculate and set boost
        float boost;
        if (vecsTerms.size() == 1) {
          boost = factor * tf;
        } else {
          boost = factor;
        }

        if (boost != 0) {
          termQuery.setBoost(boost * norm);
          if (terms.containsKey(termTxt)) {
            TermQuery tq = terms.get(termTxt);
            tq.setBoost(tq.getBoost() + termQuery.getBoost());
          } else {
            terms.put(termTxt, termQuery);
          }
        }
      }
      i++;
    }
    return terms;
  }
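// A minimal, self-contained sketch of the weighting setBoost applies above: the smoothed
// idf (log10(docs / (docFreq + 1))), the tf * idf weight, and the linear decay. The class,
// method and sample numbers below are hypothetical and are not part of the code above.
class TfIdfBoostSketch {

  static float decayedWeight(float tf, int totalDocs, int docFreq, float decayFactor, int docPosition) {
    float idf = (float) Math.log10((double) totalDocs / (docFreq + 1)); // smoothed idf
    float decay = decayFactor * docPosition; // decay grows with the document's position
    float weight = tf * idf;
    return weight - (weight * decay);
  }

  public static void main(String[] args) {
    // Hypothetical numbers: tf=3, 10,000 docs, term in 99 docs, decayFactor=0.1, third doc (i=2):
    // idf = log10(10000 / 100) = 2, weight = 6, decayed by 20% -> 4.8
    System.out.println(decayedWeight(3f, 10_000, 99, 0.1f, 2));
  }
}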
  public void handleRequest(SolrQueryRequest req, SolrQueryResponse rsp) {
    numRequests++;

    Query query = null;
    Filter filter = null;

    List<String> commands = StrUtils.splitSmart(req.getQueryString(), ';');

    String qs = commands.size() >= 1 ? commands.get(0) : "";
    query = QueryParsing.parseQuery(qs, req.getSchema());

    // If the first non-query, non-filter command is a simple sort on an indexed field, then
    // we can use the Lucene sort ability.
    Sort sort = null;
    if (commands.size() >= 2) {
      sort = QueryParsing.parseSort(commands.get(1), req);
    }

    try {

      int numHits;
      ScoreDoc[] scoreDocs;
      if (sort != null) {
        TopFieldDocs hits =
            req.getSearcher().search(query, filter, req.getStart() + req.getLimit(), sort);
        scoreDocs = hits.scoreDocs;
        numHits = hits.totalHits;
      } else {
        TopDocs hits = req.getSearcher().search(query, filter, req.getStart() + req.getLimit());
        scoreDocs = hits.scoreDocs;
        numHits = hits.totalHits;
      }

      int startRow = Math.min(numHits, req.getStart());
      int endRow = Math.min(numHits, req.getStart() + req.getLimit());
      int numRows = endRow - startRow;

      int[] ids = new int[numRows];
      Document[] data = new Document[numRows];
      for (int i = startRow; i < endRow; i++) {
        ids[i - startRow] = scoreDocs[i].doc;
        data[i - startRow] = req.getSearcher().doc(ids[i - startRow]);
      }

      rsp.add(null, new DocSlice(0, numRows, ids, null, numHits, 0.0f));

      /*
       * rsp.setResults(new DocSlice(0, numRows, ids, null, numHits));
       *
       * // Setting the actual document objects is optional
       * rsp.setResults(data);
       */
    } catch (IOException e) {
      rsp.setException(e);
      numErrors++;
      return;
    }
  }
  public ArrayList<String> filterSearchQueryString(ArrayList<String> qstring) throws IOException {
    Term vword = new Term("");
    int docFreq = indexReader.docFreq(vword);
    int numDocs = indexReader.maxDoc();
    float idf = (float) Math.log(numDocs / (double) (docFreq + 1) + 1.0);

    return qstring;
  }
 private String msg(String left, String right) {
   int size = Math.min(left.length(), right.length());
   StringBuilder builder = new StringBuilder("size: " + left.length() + " vs. " + right.length());
   builder.append(" content: <<");
   for (int i = 0; i < size; i++) {
     if (left.charAt(i) == right.charAt(i)) {
       builder.append(left.charAt(i));
     } else {
       builder
           .append(">> ")
           .append("until offset: ")
           .append(i)
           .append(" [")
           .append(left.charAt(i))
           .append(" vs.")
           .append(right.charAt(i))
           .append("] [")
           .append((int) left.charAt(i))
           .append(" vs.")
           .append((int) right.charAt(i))
           .append(']');
       return builder.toString();
     }
   }
   if (left.length() != right.length()) {
     int leftEnd = Math.max(size, left.length()) - 1;
     int rightEnd = Math.max(size, right.length()) - 1;
     builder
         .append(">> ")
         .append("until offset: ")
         .append(size)
         .append(" [")
         .append(left.charAt(leftEnd))
         .append(" vs.")
         .append(right.charAt(rightEnd))
         .append("] [")
         .append((int) left.charAt(leftEnd))
         .append(" vs.")
         .append((int) right.charAt(rightEnd))
         .append(']');
     return builder.toString();
   }
   return "";
 }
 /**
  * Merges <code>termClaimsDescriptionAbstractTitleQueries</code> into a single query. In the
  * future this method should probably live in the <code>Query</code> class. This is an awkward
  * way of doing it: the only merge-queries method available is mergeBooleanQueries, so otherwise
  * we would have to build a string of the form termClaimsDescriptionAbstractTitle1^boost1,
  * termClaimsDescriptionAbstractTitle2^boost2 and then parse it back into a query.
  *
  * @param termQueries term queries to merge
  * @param maxTerms maximum number of term queries to include
  * @return query created from termClaimsDescriptionAbstractTitleQueries, including boost parameters
  * @throws org.apache.lucene.queryparser.classic.ParseException
  */
 public Query mergeQueries(List<TermQuery> termQueries, int maxTerms) throws ParseException {
   BooleanQuery query = new BooleanQuery();
   // Select only the maxTerms number of terms
   int termCount = Math.min(termQueries.size(), maxTerms);
   for (int i = 0; i < termCount; i++) {
     TermQuery termQuery = termQueries.get(i);
     query.add(termQuery, BooleanClause.Occur.SHOULD);
   }
   return query;
 }
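 // Hypothetical usage of mergeQueries, assuming the same older Lucene API used above
 // (mutable BooleanQuery, Query#setBoost); the field and term names are made up for illustration.
 public Query mergeQueriesExample() throws ParseException {
   List<TermQuery> termQueries = new ArrayList<>();
   TermQuery battery = new TermQuery(new Term("claims", "battery"));
   battery.setBoost(2.0f);
   TermQuery anode = new TermQuery(new Term("claims", "anode"));
   anode.setBoost(1.5f);
   termQueries.add(battery);
   termQueries.add(anode);
   // Equivalent to parsing "battery^2.0 anode^1.5" against the "claims" field,
   // but without building and re-parsing a query string.
   return mergeQueries(termQueries, 10);
 }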
   /**
    * Adjusts termClaimsDescriptionAbstractTitle features of the docs according to Rocchio's
    * algorithm (alpha * query, beta * relevant docs, gamma * irrelevant docs) and assigns
    * weights/boosts to termClaimsDescriptionAbstractTitles (tf*idf).
    *
    * @param query original query to expand
    * @param currentField field the expanded term queries are created against
    * @param alpha weight of the original query terms
    * @param beta weight of terms from relevant documents
    * @param gamma weight of terms from irrelevant documents
    * @param decay decay factor applied per document
    * @param maxExpandedQueryTerms maximum number of termClaimsDescriptionAbstractTitles in the
    *     expanded query
    * @return expandedQuery with boost factors adjusted using Rocchio's algorithm
    * @throws IOException
    * @throws ParseException
    */
  public Query adjust(
      Query query,
      String currentField,
      float alpha,
      float beta,
      float gamma,
      float decay,
      int maxExpandedQueryTerms)
      throws IOException, ParseException {
    Query expandedQuery;
    // setBoost of docs terms
    Map<String, TermQuery> relevantDocsTerms =
        setBoost(docsTermVectorReldocs, currentField, beta, decay);
     Map<String, TermQuery> irrelevantDocsTerms =
         setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);
     //        Map<String, TermQuery> relevantDocsTerms = new HashMap<>();
     //        Map<String, TermQuery> irrelevantDocsTerms = new HashMap<>();
    // setBoost of query terms
    // Get queryTerms from the query

    // combine weights according to expansion formula
     List<TermQuery> expandedQueryTerms =
         combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrelevantDocsTerms);
    // Sort by boost=weight
    Comparator comparator = new QueryBoostComparator();
    Collections.sort(expandedQueryTerms, comparator);
    relevantDocsTerms.clear();
    int termCount = Math.min(expandedQueryTerms.size(), maxExpandedQueryTerms);
    for (int i = 0; i < termCount; i++) {
      TermQuery tq = expandedQueryTerms.get(i);
      relevantDocsTerms.put(tq.getTerm().text(), tq);
      System.out.print(tq.getTerm().text() + ", ");
    }
    TermFreqVector queryTermsVector = new TermFreqVector(query);
    Map<String, TermQuery> queryTerms;

    queryTerms = setBoost(queryTermsVector, currentField, alpha);

    //        List<TermQuery> queryTermsList=new ArrayList(queryTerms.values());
    //        Collections.sort(queryTermsList, comparator);
    //        queryTerms.clear();
    //        for(TermQuery tq:queryTermsList){
    //            queryTerms.put(tq.getTerm().text(), tq);
    //        }
    expandedQueryTerms = combine(queryTerms, relevantDocsTerms, new HashMap<String, TermQuery>());
    Collections.sort(expandedQueryTerms, comparator);
    // Create Expanded Query
    expandedQuery = mergeQueries(expandedQueryTerms, Integer.MAX_VALUE);

    return expandedQuery;
  }
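   // The combine(...) step used above is not shown here, but the javadoc says the boosts follow
   // Rocchio's algorithm, classically q' = alpha*q + beta*avg(relevant) - gamma*avg(irrelevant).
   // A minimal per-term sketch of that textbook formula (hypothetical helper, not the actual
   // combine implementation):
   static float rocchioBoost(
       float queryWeight, float relevantDocsWeight, float irrelevantDocsWeight,
       float alpha, float beta, float gamma) {
     // alpha=1.0, beta=0.75, gamma=0.15 are common textbook defaults
     return alpha * queryWeight + beta * relevantDocsWeight - gamma * irrelevantDocsWeight;
   }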
  /**
   * Create a PriorityQueue from a word->tf map.
   *
   * @param words a map of words keyed on the word(String) with Int objects as the values.
   * @param fieldNames an array of field names to override defaults.
   */
  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
      throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
      int tf = words.get(word).x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      // go through all the fields and find the largest document frequency
      String topField = fieldNames[0];
      int docFreq = 0;
      for (String fieldName : fieldNames) {
        int freq = ir.docFreq(new Term(fieldName, word));
        topField = (freq > docFreq) ? fieldName : topField;
        docFreq = (freq > docFreq) ? freq : docFreq;
      }

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, topField, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
    return queue;
  }
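  // The queue handling above is the usual bounded top-N idiom: FreqQ is a Lucene PriorityQueue
  // ordered so the lowest-scoring term sits on top, where it can be replaced in place once the
  // queue is full. A rough equivalent with plain java.util.PriorityQueue, shown only as a sketch:
  private static java.util.PriorityQueue<Float> topNScores(float[] scores, int limit) {
    java.util.PriorityQueue<Float> queue = new java.util.PriorityQueue<>(); // smallest on top
    for (float score : scores) {
      if (queue.size() < limit) {
        queue.add(score); // still space in the queue
      } else if (queue.peek() < score) {
        queue.poll(); // evict the current smallest...
        queue.add(score); // ...and keep the better-scoring entry
      }
    }
    return queue;
  }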
 @Override
 public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds)
     throws IOException {
   final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
   Integer[] luceneIds = ArrayUtils.toObject(scores.keys());
   Arrays.sort(
       luceneIds,
       new Comparator<Integer>() {
         @Override
         public int compare(Integer id1, Integer id2) {
           return Double.compare(scores.get(id2), scores.get(id1));
         }
       });
   DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
   for (int i = 0; i < result.numDocs(); i++) {
     result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
   }
   return normalize(result);
 }
 private TopDocs rerank(TopDocs docs, LireFeature feature, IndexReader reader)
     throws IOException, IllegalAccessException, InstantiationException {
   LireFeature tmp = new CEDD();
   ArrayList<ScoreDoc> res = new ArrayList<ScoreDoc>(docs.scoreDocs.length);
   float maxScore = 0f;
   for (int i = 0; i < docs.scoreDocs.length; i++) {
     // Fetch the stored CEDD feature bytes for this hit once and decode them.
     BytesRef featureBytes =
         reader
             .document(docs.scoreDocs[i].doc)
             .getField(DocumentBuilder.FIELD_NAME_CEDD)
             .binaryValue();
     tmp.setByteArrayRepresentation(featureBytes.bytes, featureBytes.offset, featureBytes.length);
     // Convert the distance to the query feature into a score (closer -> higher).
     float score = 1 / tmp.getDistance(feature);
     maxScore = Math.max(score, maxScore);
     res.add(new ScoreDoc(docs.scoreDocs[i].doc, score));
   }
   // sorting res ...
   Collections.sort(
       res,
       new Comparator<ScoreDoc>() {
         @Override
         public int compare(ScoreDoc o1, ScoreDoc o2) {
           return (int) Math.signum(o2.score - o1.score);
         }
       });
   return new TopDocs(numImagesEval, (ScoreDoc[]) res.toArray(new ScoreDoc[res.size()]), maxScore);
 }
  @Override
  public void collect(int parentDoc) throws IOException {
    // System.out.println("\nC parentDoc=" + parentDoc);
    totalHitCount++;

    float score = Float.NaN;

    if (trackMaxScore) {
      score = scorer.score();
      maxScore = Math.max(maxScore, score);
    }

    // TODO: we could sweep all joinScorers here and
    // aggregate total child hit count, so we can fill this
    // in getTopGroups (we wire it to 0 now)

    if (queueFull) {
      // System.out.println("  queueFull");
      // Fastmatch: return if this hit is not competitive
      for (int i = 0; ; i++) {
        final int c = reverseMul[i] * comparators[i].compareBottom(parentDoc);
        if (c < 0) {
          // Definitely not competitive.
          // System.out.println("    skip");
          return;
        } else if (c > 0) {
          // Definitely competitive.
          break;
        } else if (i == compEnd) {
          // Here c=0. If we're at the last comparator, this doc is not
          // competitive, since docs are visited in doc Id order, which means
          // this doc cannot compete with any other document in the queue.
          // System.out.println("    skip");
          return;
        }
      }

      // System.out.println("    competes!  doc=" + (docBase + parentDoc));

      // This hit is competitive - replace bottom element in queue & adjustTop
      for (int i = 0; i < comparators.length; i++) {
        comparators[i].copy(bottom.slot, parentDoc);
      }
      if (!trackMaxScore && trackScores) {
        score = scorer.score();
      }
      bottom.doc = docBase + parentDoc;
      bottom.readerContext = currentReaderContext;
      bottom.score = score;
      copyGroups(bottom);
      bottom = queue.updateTop();

      for (int i = 0; i < comparators.length; i++) {
        comparators[i].setBottom(bottom.slot);
      }
    } else {
      // Startup transient: queue is not yet full:
      final int comparatorSlot = totalHitCount - 1;

      // Copy hit into queue
      for (int i = 0; i < comparators.length; i++) {
        comparators[i].copy(comparatorSlot, parentDoc);
      }
      // System.out.println("  startup: new OG doc=" + (docBase+parentDoc));
      if (!trackMaxScore && trackScores) {
        score = scorer.score();
      }
      final OneGroup og =
          new OneGroup(comparatorSlot, docBase + parentDoc, score, joinScorers.length, trackScores);
      og.readerContext = currentReaderContext;
      copyGroups(og);
      bottom = queue.add(og);
      queueFull = totalHitCount == numParentHits;
      if (queueFull) {
        // End of startup transient: queue just filled up:
        for (int i = 0; i < comparators.length; i++) {
          comparators[i].setBottom(bottom.slot);
        }
      }
    }
  }
  @Test
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);
    }

    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();

    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
      document.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      if (markParentAsDeleted) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      if (filterMe) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("filter", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
        document.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        document.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));
        }
        indexWriter.addDocument(document);

        if (!markChildAsDeleted) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores;
          if (childValueToParentIds.containsKey(childValue)) {
            parentIdToChildScores = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
          }
          if (!markParentAsDeleted && !filterMe) {
            FloatArrayList childScores = parentIdToChildScores.get(parent);
            if (childScores == null) {
              parentIdToChildScores.put(parent, childScores = new FloatArrayList());
            }
            childScores.add(1f);
          }
        }
      }
    }

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));
    indexWriter.commit();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates =
            RandomInts.randomIntBetween(
                random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
        for (int j = 0; j < numberOfUpdates; j++) {
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          indexWriter.addDocument(document);
        }

        indexReader.close();
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
      }

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
      // leave min/max set to 0 half the time
      int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
      int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);

      QueryBuilder queryBuilder =
          hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
              .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
              .minChildren(minChildren)
              .maxChildren(maxChildren)
              .setShortCircuitCutoff(shortCircuitParentDocSet);
      // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
      // get live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);
      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      int numHits = 1 + random().nextInt(25);
      TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
      searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
      FixedBitSet actualResult = collector.getResult();

      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          DocsEnum docsEnum = null;
          for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
            int count = entry.getValue().elementsCount;
            if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
              TermsEnum.SeekStatus seekStatus =
                  termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
              if (seekStatus == TermsEnum.SeekStatus.FOUND) {
                docsEnum =
                    termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                expectedResult.set(docsEnum.nextDoc());
                scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
              } else if (seekStatus == TermsEnum.SeekStatus.END) {
                break;
              }
            }
          }
        }
        MockScorer mockScorer = new MockScorer(scoreType);
        final LeafCollector leafCollector =
            expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
        leafCollector.setScorer(mockScorer);
        for (int doc = expectedResult.nextSetBit(0);
            doc < slowLeafReader.maxDoc();
            doc =
                doc + 1 >= expectedResult.length()
                    ? DocIdSetIterator.NO_MORE_DOCS
                    : expectedResult.nextSetBit(doc + 1)) {
          mockScorer.scores = scores[doc];
          leafCollector.collect(doc);
        }
      }

      assertBitSet(actualResult, expectedResult, searcher);
      assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }
 @Override
 public float tf(float freq) {
   return (float) (1.0 + Math.log(freq));
 }
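 // This is the standard sublinear tf scaling, 1 + ln(freq): repeated occurrences of a term add
 // ever-smaller increments to the score. For example (values rounded):
 //   tf(1)   = 1 + ln(1)   = 1.0
 //   tf(10)  = 1 + ln(10)  ≈ 3.30
 //   tf(100) = 1 + ln(100) ≈ 5.61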
 private static long optionalSum(long left, long right) {
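   // -1 is treated as a "not available" sentinel here: if either operand is -1, the sum is
   // reported as -1 as well (for non-negative stats values, min(left, right) == -1 detects that).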
   return Math.min(left, right) == -1 ? -1 : left + right;
 }
  public InternalSearchResponse merge(
      ScoreDoc[] sortedDocs,
      AtomicArray<? extends QuerySearchResultProvider> queryResultsArr,
      AtomicArray<? extends FetchSearchResultProvider> fetchResultsArr) {

    List<? extends AtomicArray.Entry<? extends QuerySearchResultProvider>> queryResults =
        queryResultsArr.asList();
    List<? extends AtomicArray.Entry<? extends FetchSearchResultProvider>> fetchResults =
        fetchResultsArr.asList();

    if (queryResults.isEmpty()) {
      return InternalSearchResponse.empty();
    }

    QuerySearchResult firstResult = queryResults.get(0).value.queryResult();

    boolean sorted = false;
    int sortScoreIndex = -1;
    if (firstResult.topDocs() instanceof TopFieldDocs) {
      sorted = true;
      TopFieldDocs fieldDocs = (TopFieldDocs) firstResult.queryResult().topDocs();
      for (int i = 0; i < fieldDocs.fields.length; i++) {
        if (fieldDocs.fields[i].getType() == SortField.Type.SCORE) {
          sortScoreIndex = i;
        }
      }
    }

    // merge facets
    InternalFacets facets = null;
    if (!queryResults.isEmpty()) {
      // we rely on the fact that the order of facets is the same on all query results
      if (firstResult.facets() != null
          && firstResult.facets().facets() != null
          && !firstResult.facets().facets().isEmpty()) {
        List<Facet> aggregatedFacets = Lists.newArrayList();
        List<Facet> namedFacets = Lists.newArrayList();
        for (Facet facet : firstResult.facets()) {
          // aggregate each facet name into a single list, and aggregate it
          namedFacets.clear();
          for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
            for (Facet facet1 : entry.value.queryResult().facets()) {
              if (facet.getName().equals(facet1.getName())) {
                namedFacets.add(facet1);
              }
            }
          }
          if (!namedFacets.isEmpty()) {
            Facet aggregatedFacet =
                ((InternalFacet) namedFacets.get(0))
                    .reduce(new InternalFacet.ReduceContext(cacheRecycler, namedFacets));
            aggregatedFacets.add(aggregatedFacet);
          }
        }
        facets = new InternalFacets(aggregatedFacets);
      }
    }

    // count the total (we use the query result provider here, since we might not get any hits (we
    // scrolled past them))
    long totalHits = 0;
    float maxScore = Float.NEGATIVE_INFINITY;
    boolean timedOut = false;
    Boolean terminatedEarly = null;
    for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
      QuerySearchResult result = entry.value.queryResult();
      if (result.searchTimedOut()) {
        timedOut = true;
      }
      if (result.terminatedEarly() != null) {
        if (terminatedEarly == null) {
          terminatedEarly = result.terminatedEarly();
        } else if (result.terminatedEarly()) {
          terminatedEarly = true;
        }
      }
      totalHits += result.topDocs().totalHits;
      if (!Float.isNaN(result.topDocs().getMaxScore())) {
        maxScore = Math.max(maxScore, result.topDocs().getMaxScore());
      }
    }
    if (Float.isInfinite(maxScore)) {
      maxScore = Float.NaN;
    }

    // clean the fetch counter
    for (AtomicArray.Entry<? extends FetchSearchResultProvider> entry : fetchResults) {
      entry.value.fetchResult().initCounter();
    }

    // merge hits
    List<InternalSearchHit> hits = new ArrayList<>();
    if (!fetchResults.isEmpty()) {
      for (ScoreDoc shardDoc : sortedDocs) {
        FetchSearchResultProvider fetchResultProvider = fetchResultsArr.get(shardDoc.shardIndex);
        if (fetchResultProvider == null) {
          continue;
        }
        FetchSearchResult fetchResult = fetchResultProvider.fetchResult();
        int index = fetchResult.counterGetAndIncrement();
        if (index < fetchResult.hits().internalHits().length) {
          InternalSearchHit searchHit = fetchResult.hits().internalHits()[index];
          searchHit.score(shardDoc.score);
          searchHit.shard(fetchResult.shardTarget());

          if (sorted) {
            FieldDoc fieldDoc = (FieldDoc) shardDoc;
            searchHit.sortValues(fieldDoc.fields);
            if (sortScoreIndex != -1) {
              searchHit.score(((Number) fieldDoc.fields[sortScoreIndex]).floatValue());
            }
          }

          hits.add(searchHit);
        }
      }
    }

    // merge suggest results
    Suggest suggest = null;
    if (!queryResults.isEmpty()) {
      final Map<String, List<Suggest.Suggestion>> groupedSuggestions = new HashMap<>();
      boolean hasSuggestions = false;
      for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
        Suggest shardResult = entry.value.queryResult().queryResult().suggest();

        if (shardResult == null) {
          continue;
        }
        hasSuggestions = true;
        Suggest.group(groupedSuggestions, shardResult);
      }

      suggest =
          hasSuggestions
              ? new Suggest(Suggest.Fields.SUGGEST, Suggest.reduce(groupedSuggestions))
              : null;
    }

    // merge addAggregation
    InternalAggregations aggregations = null;
    if (!queryResults.isEmpty()) {
      if (firstResult.aggregations() != null && firstResult.aggregations().asList() != null) {
        List<InternalAggregations> aggregationsList = new ArrayList<>(queryResults.size());
        for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : queryResults) {
          aggregationsList.add((InternalAggregations) entry.value.queryResult().aggregations());
        }
        aggregations =
            InternalAggregations.reduce(
                aggregationsList, new ReduceContext(null, bigArrays, scriptService));
      }
    }

    InternalSearchHits searchHits =
        new InternalSearchHits(
            hits.toArray(new InternalSearchHit[hits.size()]), totalHits, maxScore);

    return new InternalSearchResponse(
        searchHits, facets, aggregations, suggest, timedOut, terminatedEarly);
  }
  private void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
    SortSpec ss = rb.getSortSpec();
    Sort sort = ss.getSort();

    SortField[] sortFields = null;
    if (sort != null) sortFields = sort.getSort();
    else {
      sortFields = new SortField[] {SortField.FIELD_SCORE};
    }

    SchemaField uniqueKeyField = rb.req.getSchema().getUniqueKeyField();

    // id to shard mapping, to eliminate any accidental dups
    HashMap<Object, String> uniqueDoc = new HashMap<Object, String>();

    // Merge the docs via a priority queue so we don't have to sort *all* of the
    // documents... we only need to order the top (rows+start)
    ShardFieldSortedHitQueue queue;
    queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());

    long numFound = 0;
    Float maxScore = null;
    for (ShardResponse srsp : sreq.responses) {
      SolrDocumentList docs =
          (SolrDocumentList) srsp.getSolrResponse().getResponse().get("response");

      // calculate global maxScore and numDocsFound
      if (docs.getMaxScore() != null) {
        maxScore = maxScore == null ? docs.getMaxScore() : Math.max(maxScore, docs.getMaxScore());
      }
      numFound += docs.getNumFound();

      NamedList sortFieldValues =
          (NamedList) (srsp.getSolrResponse().getResponse().get("sort_values"));

      // go through every doc in this response, construct a ShardDoc, and
      // put it in the priority queue so it can be ordered.
      for (int i = 0; i < docs.size(); i++) {
        SolrDocument doc = docs.get(i);
        Object id = doc.getFieldValue(uniqueKeyField.getName());

        String prevShard = uniqueDoc.put(id, srsp.getShard());
        if (prevShard != null) {
          // duplicate detected
          numFound--;

          // For now, just always use the first encountered since we can't currently
          // remove the previous one added to the priority queue.  If we switched
          // to the Java5 PriorityQueue, this would be easier.
          continue;
          // make which duplicate is used deterministic based on shard
          // if (prevShard.compareTo(srsp.shard) >= 0) {
          //  TODO: remove previous from priority queue
          //  continue;
          // }
        }

        ShardDoc shardDoc = new ShardDoc();
        shardDoc.id = id;
        shardDoc.shard = srsp.getShard();
        shardDoc.orderInShard = i;
        Object scoreObj = doc.getFieldValue("score");
        if (scoreObj != null) {
          if (scoreObj instanceof String) {
            shardDoc.score = Float.parseFloat((String) scoreObj);
          } else {
            shardDoc.score = (Float) scoreObj;
          }
        }

        shardDoc.sortFieldValues = sortFieldValues;

        queue.insertWithOverflow(shardDoc);
      } // end for-each-doc-in-response
    } // end for-each-response

    // The queue now has 0 -> queuesize docs, where queuesize <= start + rows
    // So we want to pop the last documents off the queue to get
    // the docs offset -> queuesize
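    // Worked example with hypothetical numbers: offset=10, count=10, and three shards returning
    // 8, 9 and 7 docs -> the queue holds min(24, 20) = 20 docs, so resultSize = 20 - 10 = 10.
    // Each pop() removes the worst remaining doc (the queue keeps its least element on top), so
    // positionInResponse is filled from 9 down to 0, leaving the docs ranked 10..19 in order.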
    int resultSize = queue.size() - ss.getOffset();
    resultSize = Math.max(0, resultSize); // there may not be any docs in range

    Map<Object, ShardDoc> resultIds = new HashMap<Object, ShardDoc>();
    for (int i = resultSize - 1; i >= 0; i--) {
      ShardDoc shardDoc = (ShardDoc) queue.pop();
      shardDoc.positionInResponse = i;
      // Need the toString() for correlation with other lists that must
      // be strings (like keys in highlighting, explain, etc)
      resultIds.put(shardDoc.id.toString(), shardDoc);
    }

    SolrDocumentList responseDocs = new SolrDocumentList();
    if (maxScore != null) responseDocs.setMaxScore(maxScore);
    responseDocs.setNumFound(numFound);
    responseDocs.setStart(ss.getOffset());
    // size appropriately
    for (int i = 0; i < resultSize; i++) responseDocs.add(null);

    // save these results in a private area so we can access them
    // again when retrieving stored fields.
    // TODO: use ResponseBuilder (w/ comments) or the request context?
    rb.resultIds = resultIds;
    rb._responseDocs = responseDocs;
  }
  /**
   * Accumulates groups for the BlockJoinQuery specified by its slot.
   *
   * @param slot Search query's slot
   * @param offset Parent docs offset
   * @param maxDocsPerGroup Upper bound on the number of documents per group
   * @param withinGroupOffset Offset within each group of child docs
   * @param withinGroupSort Sort criteria within groups
   * @param fillSortFields Specifies whether to add sort fields or not
   * @return TopGroups for the query specified by slot
   * @throws IOException if there is a low-level I/O error
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private TopGroups<Integer> accumulateGroups(
      int slot,
      int offset,
      int maxDocsPerGroup,
      int withinGroupOffset,
      Sort withinGroupSort,
      boolean fillSortFields)
      throws IOException {
    final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
    final FakeScorer fakeScorer = new FakeScorer();

    int totalGroupedHitCount = 0;
    // System.out.println("slot=" + slot);

    for (int groupIDX = offset; groupIDX < sortedGroups.length; groupIDX++) {
      final OneGroup og = sortedGroups[groupIDX];
      final int numChildDocs;
      if (slot == -1 || slot >= og.counts.length) {
        numChildDocs = 0;
      } else {
        numChildDocs = og.counts[slot];
      }

      // Number of documents in group should be bounded to prevent redundant memory allocation
      final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup));
      // System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG="
      // + maxDocsPerGroup);

      // At this point we hold all docs w/ in each group,
      // unsorted; we now sort them:
      final TopDocsCollector<?> collector;
      if (withinGroupSort == null) {
        // System.out.println("sort by score");
        // Sort by score
        if (!trackScores) {
          throw new IllegalArgumentException(
              "cannot sort by relevance within group: trackScores=false");
        }
        collector = TopScoreDocCollector.create(numDocsInGroup, true);
      } else {
        // Sort by fields
        collector =
            TopFieldCollector.create(
                withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
      }

      collector.setScorer(fakeScorer);
      collector.setNextReader(og.readerContext);
      for (int docIDX = 0; docIDX < numChildDocs; docIDX++) {
        // System.out.println("docIDX=" + docIDX + " vs " + og.docs[slot].length);
        final int doc = og.docs[slot][docIDX];
        fakeScorer.doc = doc;
        if (trackScores) {
          fakeScorer.score = og.scores[slot][docIDX];
        }
        collector.collect(doc);
      }
      totalGroupedHitCount += numChildDocs;

      final Object[] groupSortValues;

      if (fillSortFields) {
        groupSortValues = new Object[comparators.length];
        for (int sortFieldIDX = 0; sortFieldIDX < comparators.length; sortFieldIDX++) {
          groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot);
        }
      } else {
        groupSortValues = null;
      }

      final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);

      groups[groupIDX - offset] =
          new GroupDocs<>(
              og.score,
              topDocs.getMaxScore(),
              numChildDocs,
              topDocs.scoreDocs,
              og.doc,
              groupSortValues);
    }

    return new TopGroups<>(
        new TopGroups<>(
            sort.getSort(),
            withinGroupSort == null ? null : withinGroupSort.getSort(),
            0,
            totalGroupedHitCount,
            groups,
            maxScore),
        totalHitCount);
  }
 @Override
 public float idf(long docFreq, long numDocs) {
   return (float) Math.log(numDocs / (double) docFreq);
 }
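 // This is the unsmoothed textbook idf, log(numDocs / docFreq): a term present in every document
 // gets idf = 0, while a term present in 1% of documents gets ln(100) ≈ 4.61. Contrast with the
 // smoothed variant used in filterSearchQueryString above, log(numDocs / (docFreq + 1)) + 1.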