Code Example #1
 /**
  * Returns a count of the documents in the set which do not have any terms for the specified
  * field.
  *
  * @see FacetParams#FACET_MISSING
  */
 public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
     throws IOException {
   SchemaField sf = searcher.getSchema().getField(fieldName);
   DocSet hasVal =
       searcher.getDocSet(sf.getType().getRangeQuery(null, sf, null, null, false, false));
   return docs.andNotSize(hasVal);
 }
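A minimal usage sketch for the helper above (hedged: it assumes a SolrQueryRequest named req is in scope, and the field name "price" is purely illustrative; the base DocSet here is simply every document in the index):

  SolrIndexSearcher searcher = req.getSearcher();
  DocSet allDocs = searcher.getDocSet(new MatchAllDocsQuery()); // org.apache.lucene.search.MatchAllDocsQuery
  int missingPriceCount = getFieldMissingCount(searcher, allDocs, "price");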
Code Example #2
 static Query getFieldMissingQuery(SolrIndexSearcher searcher, String fieldName)
     throws IOException {
   SchemaField sf = searcher.getSchema().getField(fieldName);
   Query hasVal = sf.getType().getRangeQuery(null, sf, null, null, false, false);
   BooleanQuery.Builder noVal = new BooleanQuery.Builder();
   noVal.add(hasVal, BooleanClause.Occur.MUST_NOT);
   return noVal.build();
 }
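The query above contains only a MUST_NOT clause, so running it directly through a plain Lucene IndexSearcher matches nothing; it is intended for Solr code paths that understand pure-negative queries. A hedged sketch of one such use (it assumes a SolrIndexSearcher searcher and a base DocSet docs are in hand, e.g. obtained as in the sketch after Code Example #1; "price" is illustrative), relying on SolrIndexSearcher.numDocs(Query, DocSet), the same call used in Code Example #4:

  // Count the docs in the base set that have no value for the field.
  Query missingQuery = getFieldMissingQuery(searcher, "price");
  int missingCount = searcher.numDocs(missingQuery, docs);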
Code Example #3
 @SuppressWarnings("unused")
 static DocSet getFieldMissing(SolrIndexSearcher searcher, DocSet docs, String fieldName)
     throws IOException {
   SchemaField sf = searcher.getSchema().getField(fieldName);
   DocSet hasVal =
       searcher.getDocSet(sf.getType().getRangeQuery(null, sf, null, null, false, false));
   DocSet answer = docs.andNot(hasVal);
   // hasVal.decref(); // OFF-HEAP
   return answer;
 }
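This variant differs from Code Example #1 only in what it returns: docs.andNot(hasVal) hands back the surviving DocSet itself rather than just its size, so the caller can keep working with it. A small hedged sketch (same assumed searcher and docs as above; recentDocs is a hypothetical second DocSet):

  DocSet missingDocs = getFieldMissing(searcher, docs, "price");
  int missingCount = missingDocs.size();                          // same number Code Example #1 returns
  DocSet missingAndRecent = missingDocs.intersection(recentDocs); // keep filtering the result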
Code Example #4
 /**
   * Computes the term->count counts for the specified term values relative to the docset
   * contained in <code>parsed</code>.
  *
  * @param field the name of the field to compute term counts against
  * @param parsed contains the docset to compute term counts relative to
  * @param terms a list of term values (in the specified field) to compute the counts for
  */
 protected NamedList<Integer> getListedTermCounts(
     String field, final ParsedParams parsed, List<String> terms) throws IOException {
   FieldType ft = searcher.getSchema().getFieldType(field);
   NamedList<Integer> res = new NamedList<>();
   for (String term : terms) {
     String internal = ft.toInternal(term);
     int count = searcher.numDocs(new TermQuery(new Term(field, internal)), parsed.docs);
     res.add(term, count);
   }
   return res;
 }
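Stripped of the SimpleFacets plumbing (ParsedParams), the pattern above is: convert each user-supplied value to its indexed form with FieldType.toInternal, then count matches restricted to a DocSet via SolrIndexSearcher.numDocs. A hedged standalone sketch (field name and term values are illustrative; searcher and docs are assumed to be available):

  FieldType ft = searcher.getSchema().getFieldType("category");
  NamedList<Integer> counts = new NamedList<>();
  for (String term : Arrays.asList("books", "music")) {           // java.util.Arrays
    String internal = ft.toInternal(term);
    counts.add(term, searcher.numDocs(new TermQuery(new Term("category", internal)), docs));
  }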
Code Example #5
  public static SolrReaderSetScorer createReaderSetScorer(
      Weight weight,
      AtomicReaderContext context,
      Bits acceptDocs,
      SolrIndexSearcher searcher,
      String authorities,
      AtomicReader reader)
      throws IOException {

    DocSet readableDocSet =
        (DocSet) searcher.cacheLookup(CacheConstants.ALFRESCO_READER_CACHE, authorities);

    if (readableDocSet == null) {

      String[] auths = authorities.substring(1).split(authorities.substring(0, 1));

      readableDocSet = new BitDocSet(new FixedBitSet(searcher.maxDoc()));

      BooleanQuery bQuery = new BooleanQuery();
      for (String current : auths) {
        bQuery.add(new TermQuery(new Term(QueryConstants.FIELD_READER, current)), Occur.SHOULD);
      }

      DocSet aclDocs = searcher.getDocSet(bQuery);

      BooleanQuery aQuery = new BooleanQuery();
      for (DocIterator it = aclDocs.iterator(); it.hasNext(); /**/ ) {
        int docID = it.nextDoc();
        // Obtain the ACL ID for this ACL doc.
        long aclID =
            searcher.getAtomicReader().getNumericDocValues(QueryConstants.FIELD_ACLID).get(docID);
        SchemaField schemaField = searcher.getSchema().getField(QueryConstants.FIELD_ACLID);
        Query query = schemaField.getType().getFieldQuery(null, schemaField, Long.toString(aclID));
        aQuery.add(query, Occur.SHOULD);

        if ((aQuery.clauses().size() > 999) || !it.hasNext()) {
          DocSet docsForAclId = searcher.getDocSet(aQuery);
          readableDocSet = readableDocSet.union(docsForAclId);

          aQuery = new BooleanQuery();
        }
      }
      // Exclude the ACL docs from the results, we only want real docs that match.
      // Probably not very efficient; what we really want is remove(docID)
      readableDocSet = readableDocSet.andNot(aclDocs);
      searcher.cacheInsert(CacheConstants.ALFRESCO_READER_CACHE, authorities, readableDocSet);
    }

    // TODO: cache the full set? e.g. searcher.cacheInsert(CacheConstants.ALFRESCO_READERSET_CACHE,
    // authorities, readableDocSet)
    // plus check of course, for presence in cache at start of method.
    return new SolrReaderSetScorer(weight, readableDocSet, context, acceptDocs, searcher);
  }
Code Example #6
  /**
   * Generates a list of highlighted query fragments for each item in a list of documents, or
   * returns null if highlighting is disabled.
   *
   * @param docs query results
   * @param query the query
   * @param req the current request
   * @param defaultFields default list of fields to summarize
   * @return NamedList containing a NamedList for each document, which in turn contains
   *     (field, summary) pairs.
   */
  @Override
  @SuppressWarnings("unchecked")
  public NamedList<Object> doHighlighting(
      DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params)) return null;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    NamedList fragments = new SimpleOrderedMap();
    String[] fieldNames = getHighlightFields(query, req, defaultFields);
    Set<String> fset = new HashSet<String>();

    {
      // pre-fetch documents using the Searcher's doc cache
      for (String f : fieldNames) {
        fset.add(f);
      }
      // fetch unique key if one exists.
      SchemaField keyField = schema.getUniqueKeyField();
      if (null != keyField) fset.add(keyField.getName());
    }

    // get FastVectorHighlighter instance out of the processing loop
    FastVectorHighlighter fvh =
        new FastVectorHighlighter(
            // FVH cannot process the hl.usePhraseHighlighter parameter on a per-field basis
            params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true),
            // FVH cannot process the hl.requireFieldMatch parameter on a per-field basis
            params.getBool(HighlightParams.FIELD_MATCH, false));
    fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, Integer.MAX_VALUE));
    FieldQuery fieldQuery = fvh.getFieldQuery(query, searcher.getIndexReader());

    // Highlight each document
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docs.size(); i++) {
      int docId = iterator.nextDoc();
      Document doc = searcher.doc(docId, fset);
      NamedList docSummaries = new SimpleOrderedMap();
      for (String fieldName : fieldNames) {
        fieldName = fieldName.trim();
        if (useFastVectorHighlighter(params, schema, fieldName))
          doHighlightingByFastVectorHighlighter(
              fvh, fieldQuery, req, docSummaries, docId, doc, fieldName);
        else doHighlightingByHighlighter(query, req, docSummaries, docId, doc, fieldName);
      }
      String printId = schema.printableUniqueKey(doc);
      fragments.add(printId == null ? null : printId, docSummaries);
    }
    return fragments;
  }
Code Example #7
  private void mockDocument(int docId, double price, double discount, boolean isCloseout)
      throws IOException {
    Document doc = PowerMockito.mock(Document.class);
    when(searcher.doc(eq(docId), any(Set.class))).thenReturn(doc);
    when(searcher.getSchema()).thenReturn(schema);

    IndexableField priceField = mock(IndexableField.class);
    when(doc.getField(FIELD_PRICE)).thenReturn(priceField);
    when(priceField.numericValue()).thenReturn(Double.valueOf(price));

    IndexableField discountField = mock(IndexableField.class);
    when(doc.getField(FIELD_DISCOUNT)).thenReturn(discountField);
    when(discountField.numericValue()).thenReturn(Double.valueOf(discount));

    IndexableField closeoutField = mock(IndexableField.class);
    when(doc.getField(FIELD_CLOSEOUT)).thenReturn(closeoutField);
    when(closeoutField.stringValue()).thenReturn(isCloseout ? "T" : "F");
  }
Code Example #8
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) return;

    SolrIndexSearcher searcher = rb.req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    if (schema.getUniqueKeyField() == null) return;

    ResultContext rc = (ResultContext) rb.rsp.getResponse();

    DocList docs = rc.getDocList();
    if (docs.hasScores()) {
      processScores(rb, docs, schema, searcher);
    } else {
      processIds(rb, docs, schema, searcher);
    }
  }
Code Example #9
  @Override
  public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
    final int off = readerContext.docBase;
    final LeafReader r;
    Object o = context.get("searcher");
    if (o instanceof SolrIndexSearcher) {
      SolrIndexSearcher is = (SolrIndexSearcher) o;
      SchemaField sf = is.getSchema().getFieldOrNull(field);
      if (sf != null
          && sf.hasDocValues() == false
          && sf.multiValued() == false
          && sf.getType().getNumericType() != null) {
        // it's a single-valued numeric field: we must currently create insanity :(
        List<LeafReaderContext> leaves = is.getIndexReader().leaves();
        LeafReader insaneLeaves[] = new LeafReader[leaves.size()];
        int upto = 0;
        for (LeafReaderContext raw : leaves) {
          insaneLeaves[upto++] = Insanity.wrapInsanity(raw.reader(), field);
        }
        r = SlowCompositeReaderWrapper.wrap(new MultiReader(insaneLeaves));
      } else {
        // reuse ordinalmap
        r = ((SolrIndexSearcher) o).getLeafReader();
      }
    } else {
      IndexReader topReader = ReaderUtil.getTopLevelContext(readerContext).reader();
      r = SlowCompositeReaderWrapper.wrap(topReader);
    }
    // if it's e.g. tokenized/multivalued, emulate old behavior of single-valued fc
    final SortedDocValues sindex =
        SortedSetSelector.wrap(DocValues.getSortedSet(r, field), SortedSetSelector.Type.MIN);
    final int end = sindex.getValueCount();

    return new IntDocValues(this) {
      @Override
      public int intVal(int doc) {
        return (end - sindex.getOrd(doc + off) - 1);
      }
    };
  }
Code Example #10
  private void doHighlightingByHighlighter(
      Query query,
      SolrQueryRequest req,
      NamedList docSummaries,
      int docId,
      Document doc,
      String fieldName)
      throws IOException {
    SolrParams params = req.getParams();
    String[] docTexts = doc.getValues(fieldName);
    // according to Document javadoc, doc.getValues() never returns null. check empty instead of
    // null
    if (docTexts.length == 0) return;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    try {
      TokenStream tvStream =
          TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
      if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
      }
    } catch (IllegalArgumentException e) {
      // No problem. But we can't use TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
      if (tots != null) {
        // if we're using TermOffsets optimization, then get the next
        // field value's TokenStream (i.e. get field j's TokenStream) from tots:
        tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
      } else {
        // fall back to analyzer
        tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
      }

      Highlighter highlighter;
      if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
        // TODO: this is not always necessary - eventually we would like to avoid this wrap
        //       when it is not needed.
        tstream = new CachingTokenFilter(tstream);

        // get highlighter
        highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

        // after highlighter initialization, reset tstream since construction of highlighter already
        // used it
        tstream.reset();
      } else {
        // use "the old way"
        highlighter = getHighlighter(query, fieldName, req);
      }

      int maxCharsToAnalyze =
          params.getFieldInt(
              fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
      if (maxCharsToAnalyze < 0) {
        highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
      } else {
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
      }

      try {
        TextFragment[] bestTextFragments =
            highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
        for (int k = 0; k < bestTextFragments.length; k++) {
          if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
            frags.add(bestTextFragments[k]);
          }
        }
      } catch (InvalidTokenOffsetsException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    // sort such that the fragments with the highest score come first
    Collections.sort(
        frags,
        new Comparator<TextFragment>() {
          public int compare(TextFragment arg0, TextFragment arg1) {
            return Math.round(arg1.getScore() - arg0.getScore());
          }
        });

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
      ArrayList<String> fragTexts = new ArrayList<String>();
      for (TextFragment fragment : frags) {
        if ((fragment != null) && (fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }
        if (fragTexts.size() >= numFragments) break;
      }
      summaries = fragTexts.toArray(new String[0]);
      if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }
    // no summaries were made; copy text from the alternate field
    if (summaries == null || summaries.length == 0) {
      alternateField(docSummaries, params, doc, fieldName);
    }
  }
Code Example #11
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
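A hedged call sketch matching the signature above (all argument values are illustrative; searcher, docs and params come from the enclosing SimpleFacets context):

  NamedList<Integer> counts =
      getFacetTermEnumCounts(searcher, docs, "category",
          0 /* offset */, 10 /* limit */, 1 /* mincount */, false /* missing */,
          "count" /* sort */, null /* prefix */, null /* contains */,
          false /* ignoreCase */, params);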
Code Example #12
  public static NamedList<Integer> getCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String fieldName,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix)
      throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<Integer>();

    final SortedSetDocValues si; // for term lookups only
    OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
    if (schemaField.multiValued()) {
      si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
      if (si instanceof MultiSortedSetDocValues) {
        ordinalMap = ((MultiSortedSetDocValues) si).mapping;
      }
    } else {
      SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
      si = single == null ? null : new SingletonSortedSetDocValues(single);
      if (single instanceof MultiSortedDocValues) {
        ordinalMap = ((MultiSortedDocValues) single).mapping;
      }
    }
    if (si == null) {
      return finalize(res, searcher, schemaField, docs, -1, missing);
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new UnsupportedOperationException(
          "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
    }

    final BytesRef br = new BytesRef();

    final BytesRef prefixRef;
    if (prefix == null) {
      prefixRef = null;
    } else if (prefix.length() == 0) {
      prefix = null;
      prefixRef = null;
    } else {
      prefixRef = new BytesRef(prefix);
    }

    int startTermIndex, endTermIndex;
    if (prefix != null) {
      startTermIndex = (int) si.lookupTerm(prefixRef);
      if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
      prefixRef.append(UnicodeUtil.BIG_TERM);
      endTermIndex = (int) si.lookupTerm(prefixRef);
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = -1;
      endTermIndex = (int) si.getValueCount();
    }

    final int nTerms = endTermIndex - startTermIndex;
    int missingCount = -1;
    final CharsRef charsRef = new CharsRef(10);
    if (nTerms > 0 && docs.size() >= mincount) {

      // count collection array only needs to be as big as the number of terms we are
      // going to collect counts for.
      final int[] counts = new int[nTerms];

      Filter filter = docs.getTopFilter();
      List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
      for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
        AtomicReaderContext leaf = leaves.get(subIndex);
        DocIdSet dis =
            filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
        DocIdSetIterator disi = null;
        if (dis != null) {
          disi = dis.iterator();
        }
        if (disi != null) {
          if (schemaField.multiValued()) {
            SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
            if (sub == null) {
              sub = SortedSetDocValues.EMPTY;
            }
            if (sub instanceof SingletonSortedSetDocValues) {
              // some codecs may optimize SORTED_SET storage for single-valued fields
              final SortedDocValues values =
                  ((SingletonSortedSetDocValues) sub).getSortedDocValues();
              accumSingle(counts, startTermIndex, values, disi, subIndex, ordinalMap);
            } else {
              accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
            }
          } else {
            SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
            if (sub == null) {
              sub = SortedDocValues.EMPTY;
            }
            accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        }
      }

      if (startTermIndex == -1) {
        missingCount = counts[0];
      }

      // IDEA: we could also maintain a count of "other"... everything that fell outside
      // of the top 'N'

      int off = offset;
      int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

      if (sort.equals(FacetParams.FACET_SORT_COUNT)
          || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
        int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
        maxsize = Math.min(maxsize, nTerms);
        LongPriorityQueue queue =
            new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

        int min = mincount - 1; // the smallest value in the top 'N' values
        for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
          int c = counts[i];
          if (c > min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // smaller term numbers sort higher, so subtract the term number instead
            long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
            boolean displaced = queue.insert(pair);
            if (displaced) min = (int) (queue.top() >>> 32);
          }
        }

        // if we are deep paging, we don't have to order the highest "offset" counts.
        int collectCount = Math.max(0, queue.size() - off);
        assert collectCount <= lim;

        // the start and end indexes of our list "sorted" (starting with the highest value)
        int sortedIdxStart = queue.size() - (collectCount - 1);
        int sortedIdxEnd = queue.size() + 1;
        final long[] sorted = queue.sort(collectCount);

        for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
          long pair = sorted[i];
          int c = (int) (pair >>> 32);
          int tnum = Integer.MAX_VALUE - (int) pair;
          si.lookupOrd(startTermIndex + tnum, br);
          ft.indexedToReadable(br, charsRef);
          res.add(charsRef.toString(), c);
        }

      } else {
        // add results in index order
        int i = (startTermIndex == -1) ? 1 : 0;
        if (mincount <= 0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i += off;
          off = 0;
        }

        for (; i < nTerms; i++) {
          int c = counts[i];
          if (c < mincount || --off >= 0) continue;
          if (--lim < 0) break;
          si.lookupOrd(startTermIndex + i, br);
          ft.indexedToReadable(br, charsRef);
          res.add(charsRef.toString(), c);
        }
      }
    }

    return finalize(res, searcher, schemaField, docs, missingCount, missing);
  }
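A hedged call sketch matching the signature above (values are illustrative; searcher and docs are assumed). Note that this appears to be an older variant of DocValuesFacets.getCounts than the one invoked in Code Example #15, which also passes contains/ignoreCase arguments:

  NamedList<Integer> counts =
      getCounts(searcher, docs, "category",
          0 /* offset */, 10 /* limit */, 1 /* mincount */,
          true /* missing */, FacetParams.FACET_SORT_COUNT, null /* prefix */);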
Code Example #13
  private void doHighlightingByHighlighter(
      Query query,
      SolrQueryRequest req,
      NamedList docSummaries,
      int docId,
      Document doc,
      String fieldName)
      throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null
        && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return;
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
      listFields.add(field.stringValue());
    }

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    String[] docTexts = (String[]) listFields.toArray(new String[listFields.size()]);

    // according to Document javadoc, doc.getValues() never returns null. check empty instead of
    // null
    if (docTexts.length == 0) return;

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    TokenStream tvStream =
        TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
      tots = new TermOffsetsTokenStream(tvStream);
    }

    for (int j = 0; j < docTexts.length; j++) {
      if (tots != null) {
        // if we're using TermOffsets optimization, then get the next
        // field value's TokenStream (i.e. get field j's TokenStream) from tots:
        tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
      } else {
        // fall back to analyzer
        tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
      }

      int maxCharsToAnalyze =
          params.getFieldInt(
              fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

      Highlighter highlighter;
      if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
        if (maxCharsToAnalyze < 0) {
          tstream = new CachingTokenFilter(tstream);
        } else {
          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
        }

        // get highlighter
        highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

        // after highlighter initialization, reset tstream since construction of highlighter already
        // used it
        tstream.reset();
      } else {
        // use "the old way"
        highlighter = getHighlighter(query, fieldName, req);
      }

      if (maxCharsToAnalyze < 0) {
        highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
      } else {
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
      }

      try {
        TextFragment[] bestTextFragments =
            highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
        for (int k = 0; k < bestTextFragments.length; k++) {
          if (preserveMulti) {
            if (bestTextFragments[k] != null) {
              frags.add(bestTextFragments[k]);
            }
          } else {
            if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
              frags.add(bestTextFragments[k]);
            }
          }
        }
      } catch (InvalidTokenOffsetsException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
      Collections.sort(
          frags,
          new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
              return Math.round(arg1.getScore() - arg0.getScore());
            }
          });
    }

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
      ArrayList<String> fragTexts = new ArrayList<String>();
      for (TextFragment fragment : frags) {
        if (preserveMulti) {
          if (fragment != null) {
            fragTexts.add(fragment.toString());
          }
        } else {
          if ((fragment != null) && (fragment.getScore() > 0)) {
            fragTexts.add(fragment.toString());
          }
        }

        if (fragTexts.size() >= numFragments && !preserveMulti) break;
      }
      summaries = fragTexts.toArray(new String[0]);
      if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }
    // no summaries were made; copy text from the alternate field
    if (summaries == null || summaries.length == 0) {
      alternateField(docSummaries, params, doc, fieldName);
    }
  }
Code Example #14
  public NamedList<Integer> getGroupedCounts(
      SolrIndexSearcher searcher,
      DocSet base,
      String field,
      boolean multiToken,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase)
      throws IOException {
    GroupingSpecification groupingSpecification = rb.getGroupingSpec();
    final String groupField =
        groupingSpecification != null ? groupingSpecification.getFields()[0] : null;
    if (groupField == null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Specify the group.field as parameter or local parameter");
    }

    BytesRef prefixBytesRef = prefix != null ? new BytesRef(prefix) : null;
    final TermGroupFacetCollector collector =
        TermGroupFacetCollector.createTermGroupFacetCollector(
            groupField, field, multiToken, prefixBytesRef, 128);

    SchemaField sf = searcher.getSchema().getFieldOrNull(groupField);

    if (sf != null
        && sf.hasDocValues() == false
        && sf.multiValued() == false
        && sf.getType().getNumericType() != null) {
      // it's a single-valued numeric field: we must currently create insanity :(
      // there isn't a GroupedFacetCollector that works on numerics right now...
      searcher.search(
          base.getTopFilter(),
          new FilterCollector(collector) {
            @Override
            public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
              LeafReader insane = Insanity.wrapInsanity(context.reader(), groupField);
              return in.getLeafCollector(insane.getContext());
            }
          });
    } else {
      searcher.search(base.getTopFilter(), collector);
    }

    boolean orderByCount =
        sort.equals(FacetParams.FACET_SORT_COUNT)
            || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY);
    TermGroupFacetCollector.GroupedFacetResult result =
        collector.mergeSegmentResults(
            limit < 0 ? Integer.MAX_VALUE : (offset + limit), mincount, orderByCount);

    CharsRefBuilder charsRef = new CharsRefBuilder();
    FieldType facetFieldType = searcher.getSchema().getFieldType(field);
    NamedList<Integer> facetCounts = new NamedList<>();
    List<TermGroupFacetCollector.FacetEntry> scopedEntries =
        result.getFacetEntries(offset, limit < 0 ? Integer.MAX_VALUE : limit);
    for (TermGroupFacetCollector.FacetEntry facetEntry : scopedEntries) {
      // :TODO:can we do contains earlier than this to make it more efficient?
      if (contains != null
          && !contains(facetEntry.getValue().utf8ToString(), contains, ignoreCase)) {
        continue;
      }
      facetFieldType.indexedToReadable(facetEntry.getValue(), charsRef);
      facetCounts.add(charsRef.toString(), facetEntry.getCount());
    }

    if (missing) {
      facetCounts.add(null, result.getTotalMissingCount());
    }

    return facetCounts;
  }
Code Example #15
  /**
   * Term counts for use in field faceting that respects the specified mincount - if mincount is
   * null, the "zeros" param is consulted for the appropriate backcompat default
   *
   * @see FacetParams#FACET_ZEROS
   */
  private NamedList<Integer> getTermCounts(String field, Integer mincount, ParsedParams parsed)
      throws IOException {
    final SolrParams params = parsed.params;
    final DocSet docs = parsed.docs;
    final int threads = parsed.threads;
    int offset = params.getFieldInt(field, FacetParams.FACET_OFFSET, 0);
    int limit = params.getFieldInt(field, FacetParams.FACET_LIMIT, 100);
    if (limit == 0) return new NamedList<>();
    if (mincount == null) {
      Boolean zeros = params.getFieldBool(field, FacetParams.FACET_ZEROS);
      // mincount = (zeros!=null && zeros) ? 0 : 1;
      mincount = (zeros != null && !zeros) ? 1 : 0;
      // current default is to include zeros.
    }
    boolean missing = params.getFieldBool(field, FacetParams.FACET_MISSING, false);
    // default to sorting if there is a limit.
    String sort =
        params.getFieldParam(
            field,
            FacetParams.FACET_SORT,
            limit > 0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
    String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);
    String contains = params.getFieldParam(field, FacetParams.FACET_CONTAINS);
    boolean ignoreCase = params.getFieldBool(field, FacetParams.FACET_CONTAINS_IGNORE_CASE, false);

    NamedList<Integer> counts;
    SchemaField sf = searcher.getSchema().getField(field);
    FieldType ft = sf.getType();

    // determine what type of faceting method to use
    final String methodStr = params.getFieldParam(field, FacetParams.FACET_METHOD);
    FacetMethod method = null;
    if (FacetParams.FACET_METHOD_enum.equals(methodStr)) {
      method = FacetMethod.ENUM;
    } else if (FacetParams.FACET_METHOD_fcs.equals(methodStr)) {
      method = FacetMethod.FCS;
    } else if (FacetParams.FACET_METHOD_fc.equals(methodStr)) {
      method = FacetMethod.FC;
    }

    if (method == FacetMethod.ENUM && TrieField.getMainValuePrefix(ft) != null) {
      // enum can't deal with trie fields that index several terms per value
      method = sf.multiValued() ? FacetMethod.FC : FacetMethod.FCS;
    }

    if (method == null && ft instanceof BoolField) {
      // Always use filters for booleans... we know the number of values is very small.
      method = FacetMethod.ENUM;
    }

    final boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();

    if (ft.getNumericType() != null && !sf.multiValued()) {
      // the per-segment approach is optimal for numeric field types since there
      // are no global ords to merge and no need to create an expensive
      // top-level reader
      method = FacetMethod.FCS;
    }

    if (method == null) {
      // TODO: default to per-segment or not?
      method = FacetMethod.FC;
    }

    if (method == FacetMethod.FCS && multiToken) {
      // only fc knows how to deal with multi-token fields
      method = FacetMethod.FC;
    }

    if (method == FacetMethod.ENUM && sf.hasDocValues()) {
      // only fc can handle docvalues types
      method = FacetMethod.FC;
    }

    if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {
      counts =
          getGroupedCounts(
              searcher,
              docs,
              field,
              multiToken,
              offset,
              limit,
              mincount,
              missing,
              sort,
              prefix,
              contains,
              ignoreCase);
    } else {
      assert method != null;
      switch (method) {
        case ENUM:
          assert TrieField.getMainValuePrefix(ft) == null;
          counts =
              getFacetTermEnumCounts(
                  searcher,
                  docs,
                  field,
                  offset,
                  limit,
                  mincount,
                  missing,
                  sort,
                  prefix,
                  contains,
                  ignoreCase,
                  params);
          break;
        case FCS:
          assert !multiToken;
          if (ft.getNumericType() != null && !sf.multiValued()) {
            // force numeric faceting
            if (prefix != null && !prefix.isEmpty()) {
              throw new SolrException(
                  ErrorCode.BAD_REQUEST,
                  FacetParams.FACET_PREFIX + " is not supported on numeric types");
            }
            if (contains != null && !contains.isEmpty()) {
              throw new SolrException(
                  ErrorCode.BAD_REQUEST,
                  FacetParams.FACET_CONTAINS + " is not supported on numeric types");
            }
            counts =
                NumericFacets.getCounts(
                    searcher, docs, field, offset, limit, mincount, missing, sort);
          } else {
            PerSegmentSingleValuedFaceting ps =
                new PerSegmentSingleValuedFaceting(
                    searcher,
                    docs,
                    field,
                    offset,
                    limit,
                    mincount,
                    missing,
                    sort,
                    prefix,
                    contains,
                    ignoreCase);
            Executor executor = threads == 0 ? directExecutor : facetExecutor;
            ps.setNumThreads(threads);
            counts = ps.getFacetCounts(executor);
          }
          break;
        case FC:
          counts =
              DocValuesFacets.getCounts(
                  searcher,
                  docs,
                  field,
                  offset,
                  limit,
                  mincount,
                  missing,
                  sort,
                  prefix,
                  contains,
                  ignoreCase);
          break;
        default:
          throw new AssertionError();
      }
    }

    return counts;
  }
Code Example #16
  @SuppressWarnings("unchecked")
  private static SimpleOrderedMap<Object> getIndexedFieldsInfo(
      final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms)
      throws Exception {

    IndexReader reader = searcher.getReader();
    IndexSchema schema = searcher.getSchema();

    // Walk the term enum and keep a priority queue for each map in our set
    Map<String, TopTermQueue> ttinfo = null;
    if (numTerms > 0) {
      ttinfo = getTopTerms(reader, fields, numTerms, null);
    }
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
    for (String fieldName : fieldNames) {
      if (fields != null && !fields.contains(fieldName)) {
        continue; // if specific fields were requested, only include those
      }

      SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

      SchemaField sfield = schema.getFieldOrNull(fieldName);
      FieldType ftype = (sfield == null) ? null : sfield.getType();

      f.add("type", (ftype == null) ? null : ftype.getTypeName());
      f.add("schema", getFieldFlags(sfield));
      if (sfield != null
          && schema.isDynamicField(sfield.getName())
          && schema.getDynamicPattern(sfield.getName()) != null) {
        f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
      }

      // If numTerms==0, the call is just asking for a quick field list
      if (ttinfo != null && sfield != null && sfield.indexed()) {
        Query q = new TermRangeQuery(fieldName, null, null, false, false);
        TopDocs top = searcher.search(q, 1);
        if (top.totalHits > 0) {
          // Find a document with this field
          try {
            Document doc = searcher.doc(top.scoreDocs[0].doc);
            Fieldable fld = doc.getFieldable(fieldName);
            if (fld != null) {
              f.add("index", getFieldFlags(fld));
            } else {
              // it is a non-stored field...
              f.add("index", "(unstored field)");
            }
          } catch (Exception ex) {
            log.warn("error reading field: " + fieldName);
          }
        }
        f.add("docs", top.totalHits);

        TopTermQueue topTerms = ttinfo.get(fieldName);
        if (topTerms != null) {
          f.add("distinct", topTerms.distinctTerms);

          // Include top terms
          f.add("topTerms", topTerms.toNamedList(searcher.getSchema()));

          // Add a histogram
          f.add("histogram", topTerms.histogram.toNamedList());
        }
      }

      // Add the field
      finfo.add(fieldName, f);
    }
    return finfo;
  }
Code Example #17
  public static SolrAuthoritySetScorer createAuthoritySetScorer(
      SolrIndexSearcher searcher, Similarity similarity, String authorities, SolrIndexReader reader)
      throws IOException {
    // Get hold of solr top level searcher
    // Execute query with caching
    // translate results to leaf docs
    // build ordered doc list

    Properties p = searcher.getSchema().getResourceLoader().getCoreProperties();
    boolean doPermissionChecks =
        Boolean.parseBoolean(p.getProperty("alfresco.doPermissionChecks", "true"));

    Query key = new SolrAuthoritySetQuery(authorities);

    DocSet answer =
        (DocSet) searcher.cacheLookup(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key);
    if (answer != null) {
      return new SolrAuthoritySetScorer(similarity, answer, reader);
    }

    HashSet<String> globalReaders =
        (HashSet<String>)
            searcher.cacheLookup(
                AlfrescoSolrEventListener.ALFRESCO_CACHE,
                AlfrescoSolrEventListener.KEY_GLOBAL_READERS);

    String[] auths = authorities.substring(1).split(authorities.substring(0, 1));

    boolean hasGlobalRead = false;

    for (String auth : auths) {
      if (globalReaders.contains(auth)) {
        hasGlobalRead = true;
        break;
      }
    }

    if (hasGlobalRead || (doPermissionChecks == false)) {
      // can read all
      OpenBitSet allLeafDocs =
          (OpenBitSet)
              searcher.cacheLookup(
                  AlfrescoSolrEventListener.ALFRESCO_CACHE,
                  AlfrescoSolrEventListener.KEY_ALL_LEAF_DOCS);
      return new SolrAuthoritySetScorer(similarity, new BitDocSet(allLeafDocs), reader);
    }

    DocSet readableDocSet = searcher.getDocSet(new SolrReaderSetQuery(authorities));

    if (globalReaders.contains(PermissionService.OWNER_AUTHORITY)) {
      DocSet authorityOwnedDocs = searcher.getDocSet(new SolrOwnerSetQuery(authorities));

      DocSet toCache = readableDocSet.union(authorityOwnedDocs);
      searcher.cacheInsert(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key, toCache);
      return new SolrAuthoritySetScorer(similarity, toCache, reader);
    } else {
      // for that docs I own that have owner Read rights
      DocSet ownerReadableDocSet =
          searcher.getDocSet(new SolrReaderSetQuery("|" + PermissionService.OWNER_AUTHORITY));
      DocSet authorityOwnedDocs = searcher.getDocSet(new SolrOwnerSetQuery(authorities));

      DocSet docsAuthorityOwnsAndCanRead = ownerReadableDocSet.intersection(authorityOwnedDocs);

      DocSet toCache = readableDocSet.union(docsAuthorityOwnsAndCanRead);
      searcher.cacheInsert(AlfrescoSolrEventListener.ALFRESCO_AUTHORITY_CACHE, key, toCache);
      return new SolrAuthoritySetScorer(similarity, toCache, reader);
    }
  }