  /**
   * Fills in the count for this bucket, materializing a DocSet for it only when stats or
   * sub-facets need one, then delegates to processStats and processSubs.
   */
  void fillBucket(SimpleOrderedMap<Object> bucket, Query q, DocSet result) throws IOException {
    boolean needDocSet = freq.getFacetStats().size() > 0 || freq.getSubFacets().size() > 0;

    // TODO: always collect counts or not???

    int count;

    if (result != null) {
      count = result.size();
    } else if (needDocSet) {
      if (q == null) {
        result = fcontext.base;
        // result.incref(); // OFF-HEAP
      } else {
        result = fcontext.searcher.getDocSet(q, fcontext.base);
      }
      count = result.size();
    } else {
      if (q == null) {
        count = fcontext.base.size();
      } else {
        count = fcontext.searcher.numDocs(q, fcontext.base);
      }
    }

    try {
      processStats(bucket, result, count);
      processSubs(bucket, q, result);
    } finally {
      if (result != null) {
        // result.decref(); // OFF-HEAP
        result = null;
      }
    }
  }
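  /**
   * Processes the sub-facets of the current request against the given domain and adds each
   * sub-facet's response to the parent response. A fresh FacetContext is created per sub-facet
   * since a sub-facet may change its domain.
   */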
  void processSubs(SimpleOrderedMap<Object> response, Query filter, DocSet domain)
      throws IOException {

    // TODO: what if a zero bucket has a sub-facet with an exclusion that would yield results?
    // should we check for domain-altering exclusions, or even ask the sub-facet for
    // its domain and then only skip it if it's 0?

    if (domain == null || (domain.size() == 0 && !freq.processEmpty)) {
      return;
    }

    for (Map.Entry<String, FacetRequest> sub : freq.getSubFacets().entrySet()) {
      // make a new context for each sub-facet since they can change the domain
      FacetContext subContext = fcontext.sub(filter, domain);
      FacetProcessor subProcessor = sub.getValue().createFacetProcessor(subContext);
      if (fcontext.getDebugInfo() != null) {
        // if fcontext.debugInfo != null, it means rb.debug() == true
        FacetDebugInfo fdebug = new FacetDebugInfo();
        subContext.setDebugInfo(fdebug);
        fcontext.getDebugInfo().addChild(fdebug);

        fdebug.setReqDescription(sub.getValue().getFacetDescription());
        fdebug.setProcessor(subProcessor.getClass().getSimpleName());
        if (subContext.filter != null) fdebug.setFilter(subContext.filter.toString());

        final RTimer timer = new RTimer();
        subProcessor.process();
        long timeElapsed = (long) timer.getTime();
        fdebug.setElapse(timeElapsed);
        fdebug.putInfoItem("domainSize", (long) subContext.base.size());
      } else {
        subProcessor.process();
      }

      response.add(sub.getKey(), subProcessor.getResponse());
    }
  }
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. For terms whose docFreq is at least the
   * {@link FacetParams#FACET_ENUM_CACHE_MINDF} threshold, the filterCache is used to get the
   * intersection count between <code>docs</code> and the DocSet for the term; otherwise the
   * term's postings are intersected with the doc set directly.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }
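    // HashDocSet provides constant-time exists() checks, which matters for the per-posting
    // membership tests below when the filterCache is not used for a term.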

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
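  // Usage sketch (hypothetical caller; the field name and parameter values below are
  // illustrative only, not taken from the surrounding code):
  //
  //   NamedList<Integer> counts =
  //       getFacetTermEnumCounts(searcher, docs, "category",
  //           0 /* offset */, 100 /* limit */, 1 /* mincount */, false /* missing */,
  //           "count" /* sort */, null /* prefix */, null /* contains */,
  //           false /* ignoreCase */, params);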
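  /**
   * Builds the group/detail response for the given fields over the base DocSet. When no crc key
   * was supplied, the base docs are iterated directly (up to limit_offset) and the top rows are
   * returned; otherwise rows previously cached by crc are resolved back to stored field values
   * read from the index and returned under "fdtcre".
   */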
  public NamedList get(String[] fields, DocSet baseDocs) throws IOException, ParseException {
    if (this.crcget == null) {
      this.container =
          this.parse.createContainer(fields, baseDocs, this.reader, this.searcher, this.req);
      DocIterator iter = baseDocs.iterator();
      this.recordCount.inc(baseDocs.size());
      Doclist res = new Doclist(this.parse.limit_offset);
      int doc = -1;
      while (iter.hasNext()) {
        doc = iter.nextDoc();
        res.add(doc);
        if (res.index >= this.parse.limit_offset) {
          break;
        }
      }
      PriorityQueue<SelectDetailRow> topItems = this.transGroupValue(res, fields);
      this.container.free();
      return this.toNameList(topItems);
    }
    String hostkey = String.valueOf(this.getkeyCrc());

    ConcurrentHashMap<Long, String> cache =
        MdrillUtils.CRC_CACHE_SIZE.remove(crcget + "@" + hostkey);

    NamedList rtn = new NamedList();
    Map<Long, String> crcvalue = new HashMap<Long, String>();
    rtn.add("fdtcre", crcvalue);
    if (cache != null) {
      MapFieldSelector selector = new MapFieldSelector(fields);
      FieldType[] ftlist = new FieldType[fields.length];
      IndexSchema schema = this.searcher.getSchema();

      for (int j = 0; j < fields.length; j++) {
        ftlist[j] = schema.getFieldType(fields[j]);
      }

      String crcliststr = params.get("mdrill.crc.key.get.crclist");
      if (crcliststr != null) {
        String[] crclist = crcliststr.split(",");
        for (String s : crclist) {
          Long crc = Long.parseLong(s);
          String v = cache.get(crc);
          if (v != null) {
            String[] cols = v.split(UniqConfig.GroupJoinString(), -1);
            if (cols.length >= 2) {
              int doc = Integer.parseInt(cols[0]);

              SortGroupVal buff = new SortGroupVal();
              buff.groupbuff.append("-");
              buff.groupbuff.append(UniqConfig.GroupJoinString());
              buff.groupbuff.append("-");
              Document docfields = this.reader.document(doc, selector);
              if (docfields == null) {
                for (int j = 0; j < fields.length; j++) {
                  buff.groupbuff.append(UniqConfig.GroupJoinString());
                  buff.groupbuff.append(EncodeUtils.encode("-"));
                }
                if (!crcvalue.containsKey(crc)) {
                  crcvalue.put(crc, buff.groupbuff.toString());
                }
              } else {

                for (int j = 0; j < fields.length; j++) {
                  buff.groupbuff.append(UniqConfig.GroupJoinString());

                  Fieldable fv = docfields.getFieldable(fields[j]);

                  if (fv != null) {
                    buff.groupbuff.append(ftlist[j].toExternal(fv));
                  } else {
                    buff.groupbuff.append(EncodeUtils.encode("-"));
                  }
                }
                crcvalue.put(crc, buff.groupbuff.toString());
              }
            }
          }
        }
      }
    }
    return rtn;
  }
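  /**
   * Computes per-term facet counts for the given field using doc values (SortedDocValues for
   * single-valued fields, SortedSetDocValues for multi-valued ones), accumulating counts per
   * segment and mapping segment ordinals to global ordinals via the OrdinalMap when needed.
   */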
  public static NamedList<Integer> getCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String fieldName,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix)
      throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<Integer>();

    final SortedSetDocValues si; // for term lookups only
    OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
    if (schemaField.multiValued()) {
      si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
      if (si instanceof MultiSortedSetDocValues) {
        ordinalMap = ((MultiSortedSetDocValues) si).mapping;
      }
    } else {
      SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
      si = single == null ? null : new SingletonSortedSetDocValues(single);
      if (single instanceof MultiSortedDocValues) {
        ordinalMap = ((MultiSortedDocValues) single).mapping;
      }
    }
    if (si == null) {
      return finalize(res, searcher, schemaField, docs, -1, missing);
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new UnsupportedOperationException(
          "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
    }

    final BytesRef br = new BytesRef();

    final BytesRef prefixRef;
    if (prefix == null) {
      prefixRef = null;
    } else if (prefix.length() == 0) {
      prefix = null;
      prefixRef = null;
    } else {
      prefixRef = new BytesRef(prefix);
    }

    int startTermIndex, endTermIndex;
    if (prefix != null) {
      startTermIndex = (int) si.lookupTerm(prefixRef);
      if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
      prefixRef.append(UnicodeUtil.BIG_TERM);
      endTermIndex = (int) si.lookupTerm(prefixRef);
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = -1;
      endTermIndex = (int) si.getValueCount();
    }
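    // With no prefix, startTermIndex is -1, so ordinals are shifted up by one when indexed into
    // the counts array below; documents with no value (ord -1) accumulate into counts[0], which
    // is reported later as the "missing" count.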

    final int nTerms = endTermIndex - startTermIndex;
    int missingCount = -1;
    final CharsRef charsRef = new CharsRef(10);
    if (nTerms > 0 && docs.size() >= mincount) {

      // count collection array only needs to be as big as the number of terms we are
      // going to collect counts for.
      final int[] counts = new int[nTerms];

      Filter filter = docs.getTopFilter();
      List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
      for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
        AtomicReaderContext leaf = leaves.get(subIndex);
        // Solr DocSets already exclude any deleted docs.
        DocIdSet dis = filter.getDocIdSet(leaf, null);
        DocIdSetIterator disi = null;
        if (dis != null) {
          disi = dis.iterator();
        }
        if (disi != null) {
          if (schemaField.multiValued()) {
            SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
            if (sub == null) {
              sub = SortedSetDocValues.EMPTY;
            }
            if (sub instanceof SingletonSortedSetDocValues) {
              // some codecs may optimize SORTED_SET storage for single-valued fields
              final SortedDocValues values =
                  ((SingletonSortedSetDocValues) sub).getSortedDocValues();
              accumSingle(counts, startTermIndex, values, disi, subIndex, ordinalMap);
            } else {
              accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
            }
          } else {
            SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
            if (sub == null) {
              sub = SortedDocValues.EMPTY;
            }
            accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        }
      }

      if (startTermIndex == -1) {
        missingCount = counts[0];
      }

      // IDEA: we could also maintain a count of "other"... everything that fell outside
      // of the top 'N'

      int off = offset;
      int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

      if (sort.equals(FacetParams.FACET_SORT_COUNT)
          || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
        int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
        maxsize = Math.min(maxsize, nTerms);
        LongPriorityQueue queue =
            new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

        int min = mincount - 1; // the smallest value in the top 'N' values
        for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
          int c = counts[i];
          if (c > min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // smaller term numbers sort higher, so subtract the term number instead
            long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
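            // e.g. c=5, i=3: pair = (5L << 32) + (Integer.MAX_VALUE - 3); comparing the packed
            // longs orders primarily by count, with ties broken toward smaller term numbers
            // (index order), since a smaller i yields a larger value in the low 32 bits.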
            boolean displaced = queue.insert(pair);
            if (displaced) min = (int) (queue.top() >>> 32);
          }
        }

        // if we are deep paging, we don't have to order the highest "offset" counts.
        int collectCount = Math.max(0, queue.size() - off);
        assert collectCount <= lim;

        // the start and end indexes of our list "sorted" (starting with the highest value)
        int sortedIdxStart = queue.size() - (collectCount - 1);
        int sortedIdxEnd = queue.size() + 1;
        final long[] sorted = queue.sort(collectCount);

        for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
          long pair = sorted[i];
          int c = (int) (pair >>> 32);
          int tnum = Integer.MAX_VALUE - (int) pair;
          si.lookupOrd(startTermIndex + tnum, br);
          ft.indexedToReadable(br, charsRef);
          res.add(charsRef.toString(), c);
        }

      } else {
        // add results in index order
        int i = (startTermIndex == -1) ? 1 : 0;
        if (mincount <= 0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i += off;
          off = 0;
        }

        for (; i < nTerms; i++) {
          int c = counts[i];
          if (c < mincount || --off >= 0) continue;
          if (--lim < 0) break;
          si.lookupOrd(startTermIndex + i, br);
          ft.indexedToReadable(br, charsRef);
          res.add(charsRef.toString(), c);
        }
      }
    }

    return finalize(res, searcher, schemaField, docs, missingCount, missing);
  }
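  // Usage sketch (hypothetical caller; the field name and parameter values are illustrative
  // only, not taken from the surrounding code):
  //
  //   NamedList<Integer> counts =
  //       getCounts(searcher, docs, "manu_exact",
  //           0 /* offset */, 10 /* limit */, 1 /* mincount */, false /* missing */,
  //           FacetParams.FACET_SORT_COUNT, null /* prefix */);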