Exemplo n.º 1
0
  public void handleRequest(SolrQueryRequest req, SolrQueryResponse rsp) {
    numRequests++;

    Query query = null;
    Filter filter = null;

    List<String> commands = StrUtils.splitSmart(req.getQueryString(), ';');

    String qs = commands.size() >= 1 ? commands.get(0) : "";
    query = QueryParsing.parseQuery(qs, req.getSchema());

    // If the first non-query, non-filter command is a simple sort on an indexed field, then
    // we can use the Lucene sort ability.
    Sort sort = null;
    if (commands.size() >= 2) {
      sort = QueryParsing.parseSort(commands.get(1), req);
    }

    try {

      int numHits;
      ScoreDoc[] scoreDocs;
      if (sort != null) {
        TopFieldDocs hits =
            req.getSearcher().search(query, filter, req.getStart() + req.getLimit(), sort);
        scoreDocs = hits.scoreDocs;
        numHits = hits.totalHits;
      } else {
        TopDocs hits = req.getSearcher().search(query, filter, req.getStart() + req.getLimit());
        scoreDocs = hits.scoreDocs;
        numHits = hits.totalHits;
      }

      int startRow = Math.min(numHits, req.getStart());
      int endRow = Math.min(numHits, req.getStart() + req.getLimit());
      int numRows = endRow - startRow;

      int[] ids = new int[numRows];
      Document[] data = new Document[numRows];
      for (int i = startRow; i < endRow; i++) {
        ids[i] = scoreDocs[i].doc;
        data[i] = req.getSearcher().doc(ids[i]);
      }

      rsp.add(null, new DocSlice(0, numRows, ids, null, numHits, 0.0f));

      /**
       * ********************* rsp.setResults(new DocSlice(0,numRows,ids,null,numHits));
       *
       * <p>// Setting the actual document objects is optional rsp.setResults(data);
       * **********************
       */
    } catch (IOException e) {
      rsp.setException(e);
      numErrors++;
      return;
    }
  }
 /**
  * Merges <code>termClaimsDescriptionAbstractTitleQueries</code> into a single query. In the
  * future this method should probably be in <code>Query</code> class. This is akward way of doing
  * it; but only merge queries method that is available is mergeBooleanQueries; so actually have to
  * make a string termClaimsDescriptionAbstractTitle1^boost1,
  * termClaimsDescriptionAbstractTitle2^boost and then parse it into a query
  *
  * @param termQueries - to merge
  * @param maxTerms
  * @return query created from termClaimsDescriptionAbstractTitleQueries including boost parameters
  * @throws org.apache.lucene.queryparser.classic.ParseException
  */
 public Query mergeQueries(List<TermQuery> termQueries, int maxTerms) throws ParseException {
   BooleanQuery query = new BooleanQuery();
   // Select only the maxTerms number of terms
   int termCount = Math.min(termQueries.size(), maxTerms);
   for (int i = 0; i < termCount; i++) {
     TermQuery termQuery = termQueries.get(i);
     query.add(termQuery, BooleanClause.Occur.SHOULD);
   }
   return query;
 }
  /**
   * Adjust termClaimsDescriptionAbstractTitle features of the docs with alpha * query; and beta;
   * and assign weights/boost to termClaimsDescriptionAbstractTitles (tf*idf).
   *
   * @param query
   * @param currentField
   * @param alpha
   * @param beta - factor of the equation
   * @param gamma
   * @param decay
   * @param maxExpandedQueryTerms - maximum number of termClaimsDescriptionAbstractTitles in
   *     expanded query
   * @return expandedQuery with boost factors adjusted using Rocchio's algorithm
   * @throws IOException
   * @throws ParseException
   */
  public Query adjust(
      Query query,
      String currentField,
      float alpha,
      float beta,
      float gamma,
      float decay,
      int maxExpandedQueryTerms)
      throws IOException, ParseException {
    Query expandedQuery;
    // setBoost of docs terms
    Map<String, TermQuery> relevantDocsTerms =
        setBoost(docsTermVectorReldocs, currentField, beta, decay);
    Map<String, TermQuery> irrrelevantDocsTerms =
        setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);
    //        Map<String, TermQuery> relevantDocsTerms = new HashMap<>();
    //        Map<String, TermQuery> irrrelevantDocsTerms = new HashMap<>();
    // setBoost of query terms
    // Get queryTerms from the query

    // combine weights according to expansion formula
    List<TermQuery> expandedQueryTerms =
        combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrrelevantDocsTerms);
    // Sort by boost=weight
    Comparator comparator = new QueryBoostComparator();
    Collections.sort(expandedQueryTerms, comparator);
    relevantDocsTerms.clear();
    int termCount = Math.min(expandedQueryTerms.size(), maxExpandedQueryTerms);
    for (int i = 0; i < termCount; i++) {
      TermQuery tq = expandedQueryTerms.get(i);
      relevantDocsTerms.put(tq.getTerm().text(), tq);
      System.out.print(tq.getTerm().text() + ", ");
    }
    TermFreqVector queryTermsVector = new TermFreqVector(query);
    Map<String, TermQuery> queryTerms;

    queryTerms = setBoost(queryTermsVector, currentField, alpha);

    //        List<TermQuery> queryTermsList=new ArrayList(queryTerms.values());
    //        Collections.sort(queryTermsList, comparator);
    //        queryTerms.clear();
    //        for(TermQuery tq:queryTermsList){
    //            queryTerms.put(tq.getTerm().text(), tq);
    //        }
    expandedQueryTerms = combine(queryTerms, relevantDocsTerms, new HashMap<String, TermQuery>());
    Collections.sort(expandedQueryTerms, comparator);
    // Create Expanded Query
    expandedQuery = mergeQueries(expandedQueryTerms, Integer.MAX_VALUE);

    return expandedQuery;
  }
Exemplo n.º 4
0
  /**
   * Create a PriorityQueue from a word->tf map.
   *
   * @param words a map of words keyed on the word(String) with Int objects as the values.
   * @param fieldNames an array of field names to override defaults.
   */
  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
      throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
      int tf = words.get(word).x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      // go through all the fields and find the largest document frequency
      String topField = fieldNames[0];
      int docFreq = 0;
      for (String fieldName : fieldNames) {
        int freq = ir.docFreq(new Term(fieldName, word));
        topField = (freq > docFreq) ? fieldName : topField;
        docFreq = (freq > docFreq) ? freq : docFreq;
      }

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, topField, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
    return queue;
  }
 private String msg(String left, String right) {
   int size = Math.min(left.length(), right.length());
   StringBuilder builder = new StringBuilder("size: " + left.length() + " vs. " + right.length());
   builder.append(" content: <<");
   for (int i = 0; i < size; i++) {
     if (left.charAt(i) == right.charAt(i)) {
       builder.append(left.charAt(i));
     } else {
       builder
           .append(">> ")
           .append("until offset: ")
           .append(i)
           .append(" [")
           .append(left.charAt(i))
           .append(" vs.")
           .append(right.charAt(i))
           .append("] [")
           .append((int) left.charAt(i))
           .append(" vs.")
           .append((int) right.charAt(i))
           .append(']');
       return builder.toString();
     }
   }
   if (left.length() != right.length()) {
     int leftEnd = Math.max(size, left.length()) - 1;
     int rightEnd = Math.max(size, right.length()) - 1;
     builder
         .append(">> ")
         .append("until offset: ")
         .append(size)
         .append(" [")
         .append(left.charAt(leftEnd))
         .append(" vs.")
         .append(right.charAt(rightEnd))
         .append("] [")
         .append((int) left.charAt(leftEnd))
         .append(" vs.")
         .append((int) right.charAt(rightEnd))
         .append(']');
     return builder.toString();
   }
   return "";
 }
 @Override
 public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds)
     throws IOException {
   final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
   Integer luceneIds[] = ArrayUtils.toObject(scores.keys());
   Arrays.sort(
       luceneIds,
       new Comparator<Integer>() {
         @Override
         public int compare(Integer id1, Integer id2) {
           return -1 * new Double(scores.get(id1)).compareTo(scores.get(id2));
         }
       });
   DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
   for (int i = 0; i < result.numDocs(); i++) {
     result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
   }
   return normalize(result);
 }
Exemplo n.º 7
0
  @Test
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);
    }

    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();

    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
      document.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      if (markParentAsDeleted) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      if (filterMe) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("filter", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
        document.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        document.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));
        }
        indexWriter.addDocument(document);

        if (!markChildAsDeleted) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores;
          if (childValueToParentIds.containsKey(childValue)) {
            parentIdToChildScores = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
          }
          if (!markParentAsDeleted && !filterMe) {
            FloatArrayList childScores = parentIdToChildScores.get(parent);
            if (childScores == null) {
              parentIdToChildScores.put(parent, childScores = new FloatArrayList());
            }
            childScores.add(1f);
          }
        }
      }
    }

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));
    indexWriter.commit();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates =
            RandomInts.randomIntBetween(
                random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
        for (int j = 0; j < numberOfUpdates; j++) {
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          indexWriter.addDocument(document);
        }

        indexReader.close();
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
      }

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
      // leave min/max set to 0 half the time
      int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
      int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);

      QueryBuilder queryBuilder =
          hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
              .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
              .minChildren(minChildren)
              .maxChildren(maxChildren)
              .setShortCircuitCutoff(shortCircuitParentDocSet);
      // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
      // get live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);
      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      int numHits = 1 + random().nextInt(25);
      TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
      searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
      FixedBitSet actualResult = collector.getResult();

      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          DocsEnum docsEnum = null;
          for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
            int count = entry.getValue().elementsCount;
            if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
              TermsEnum.SeekStatus seekStatus =
                  termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
              if (seekStatus == TermsEnum.SeekStatus.FOUND) {
                docsEnum =
                    termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                expectedResult.set(docsEnum.nextDoc());
                scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
              } else if (seekStatus == TermsEnum.SeekStatus.END) {
                break;
              }
            }
          }
        }
        MockScorer mockScorer = new MockScorer(scoreType);
        final LeafCollector leafCollector =
            expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
        leafCollector.setScorer(mockScorer);
        for (int doc = expectedResult.nextSetBit(0);
            doc < slowLeafReader.maxDoc();
            doc =
                doc + 1 >= expectedResult.length()
                    ? DocIdSetIterator.NO_MORE_DOCS
                    : expectedResult.nextSetBit(doc + 1)) {
          mockScorer.scores = scores[doc];
          leafCollector.collect(doc);
        }
      }

      assertBitSet(actualResult, expectedResult, searcher);
      assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }
  /**
   * Accumulates groups for the BlockJoinQuery specified by its slot.
   *
   * @param slot Search query's slot
   * @param offset Parent docs offset
   * @param maxDocsPerGroup Upper bound of documents per group number
   * @param withinGroupOffset Offset within each group of child docs
   * @param withinGroupSort Sort criteria within groups
   * @param fillSortFields Specifies whether to add sort fields or not
   * @return TopGroups for the query specified by slot
   * @throws IOException if there is a low-level I/O error
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  private TopGroups<Integer> accumulateGroups(
      int slot,
      int offset,
      int maxDocsPerGroup,
      int withinGroupOffset,
      Sort withinGroupSort,
      boolean fillSortFields)
      throws IOException {
    final GroupDocs<Integer>[] groups = new GroupDocs[sortedGroups.length - offset];
    final FakeScorer fakeScorer = new FakeScorer();

    int totalGroupedHitCount = 0;
    // System.out.println("slot=" + slot);

    for (int groupIDX = offset; groupIDX < sortedGroups.length; groupIDX++) {
      final OneGroup og = sortedGroups[groupIDX];
      final int numChildDocs;
      if (slot == -1 || slot >= og.counts.length) {
        numChildDocs = 0;
      } else {
        numChildDocs = og.counts[slot];
      }

      // Number of documents in group should be bounded to prevent redundant memory allocation
      final int numDocsInGroup = Math.max(1, Math.min(numChildDocs, maxDocsPerGroup));
      // System.out.println("parent doc=" + og.doc + " numChildDocs=" + numChildDocs + " maxDocsPG="
      // + maxDocsPerGroup);

      // At this point we hold all docs w/ in each group,
      // unsorted; we now sort them:
      final TopDocsCollector<?> collector;
      if (withinGroupSort == null) {
        // System.out.println("sort by score");
        // Sort by score
        if (!trackScores) {
          throw new IllegalArgumentException(
              "cannot sort by relevance within group: trackScores=false");
        }
        collector = TopScoreDocCollector.create(numDocsInGroup, true);
      } else {
        // Sort by fields
        collector =
            TopFieldCollector.create(
                withinGroupSort, numDocsInGroup, fillSortFields, trackScores, trackMaxScore, true);
      }

      collector.setScorer(fakeScorer);
      collector.setNextReader(og.readerContext);
      for (int docIDX = 0; docIDX < numChildDocs; docIDX++) {
        // System.out.println("docIDX=" + docIDX + " vs " + og.docs[slot].length);
        final int doc = og.docs[slot][docIDX];
        fakeScorer.doc = doc;
        if (trackScores) {
          fakeScorer.score = og.scores[slot][docIDX];
        }
        collector.collect(doc);
      }
      totalGroupedHitCount += numChildDocs;

      final Object[] groupSortValues;

      if (fillSortFields) {
        groupSortValues = new Object[comparators.length];
        for (int sortFieldIDX = 0; sortFieldIDX < comparators.length; sortFieldIDX++) {
          groupSortValues[sortFieldIDX] = comparators[sortFieldIDX].value(og.slot);
        }
      } else {
        groupSortValues = null;
      }

      final TopDocs topDocs = collector.topDocs(withinGroupOffset, numDocsInGroup);

      groups[groupIDX - offset] =
          new GroupDocs<>(
              og.score,
              topDocs.getMaxScore(),
              numChildDocs,
              topDocs.scoreDocs,
              og.doc,
              groupSortValues);
    }

    return new TopGroups<>(
        new TopGroups<>(
            sort.getSort(),
            withinGroupSort == null ? null : withinGroupSort.getSort(),
            0,
            totalGroupedHitCount,
            groups,
            maxScore),
        totalHitCount);
  }
 private static long optionalSum(long left, long right) {
   return Math.min(left, right) == -1 ? -1 : left + right;
 }