Example #1
    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      startIdx = -startIdx - 1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
      } else {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null; // should be non-null since it's in the index
      }

      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }
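The "-startIdx - 1" step above relies on how Arrays.binarySearch encodes a miss: it returns
-(insertionPoint) - 1. A minimal standalone sketch of that decode-and-back-up pattern, using a
made-up sorted array and target rather than the real indexedTermsArray:

import java.util.Arrays;

public class BinarySearchDecodeSketch {
  public static void main(String[] args) {
    // hypothetical sorted index, standing in for indexedTermsArray above
    String[] indexed = {"apple", "mango", "zebra"};

    int idx = Arrays.binarySearch(indexed, "banana");
    if (idx >= 0) {
      // exact hit: idx is the position of the key
      System.out.println("found at " + idx);
    } else {
      // miss: decode the insertion point...
      int insertionPoint = -idx - 1;
      // ...then back up one slot, as seekCeil does, to the block that can contain the target
      int blockStart = insertionPoint - 1;
      System.out.println("scan forward from block " + blockStart); // prints 0
    }
  }
}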
Example #2
      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
        super();
        // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
        this.fst = dict;
        this.fstReader = fst.getBytesReader();
        this.fstOutputs = dict.outputs;
        this.fsa = compiled.runAutomaton;
        this.level = -1;
        this.stack = new Frame[16];
        for (int i = 0; i < stack.length; i++) {
          this.stack[i] = new Frame();
        }

        Frame frame;
        frame = loadVirtualFrame(newFrame());
        this.level++;
        frame = loadFirstFrame(newFrame());
        pushFrame(frame);

        this.meta = null;
        this.metaUpto = 1;
        this.decoded = false;
        this.pending = false;

        if (startTerm == null) {
          pending = isAccept(topFrame());
        } else {
          doSeekCeil(startTerm);
          pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
        }
      }
 @Override
 public boolean equals(Object obj) {
   if (this == obj) return true;
   if (!super.equals(obj)) return false;
   if (getClass() != obj.getClass()) return false;
   TermRangeQuery other = (TermRangeQuery) obj;
   if (includeLower != other.includeLower) return false;
   if (includeUpper != other.includeUpper) return false;
   if (lowerTerm == null) {
     if (other.lowerTerm != null) return false;
   } else if (!lowerTerm.equals(other.lowerTerm)) return false;
   if (upperTerm == null) {
     if (other.upperTerm != null) return false;
   } else if (!upperTerm.equals(other.upperTerm)) return false;
   return true;
 }
Example #4
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   if (!target.equals(term)) {
     state.copyFrom(otherState);
     term = BytesRef.deepCopyOf(target);
     seekPending = true;
   }
 }
 /**
  * Returns <code>true</code> iff the length and the checksums are the same, otherwise
  * <code>false</code>.
  */
 public boolean isSame(StoreFileMetaData other) {
   if (checksum == null || other.checksum == null) {
      // we can't tell if either one is null, so we return false in this case! this is why we
      // don't use equals for this!
     return false;
   }
   return length == other.length && checksum.equals(other.checksum) && hash.equals(other.hash);
 }
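The comment in isSame above reflects a deliberate design choice: an unknown (null) checksum means
"cannot verify", so the method answers false, whereas java.util.Objects.equals would treat two
nulls as equal. A small hypothetical sketch of that distinction (sameChecksum is an illustrative
helper, not part of StoreFileMetaData):

import java.util.Objects;

class ChecksumCompareSketch {
  // simplified stand-in for the null handling in isSame above
  static boolean sameChecksum(String a, String b) {
    if (a == null || b == null) {
      return false; // can't tell, so don't claim the files are the same
    }
    return a.equals(b);
  }

  public static void main(String[] args) {
    System.out.println(Objects.equals(null, null)); // true
    System.out.println(sameChecksum(null, null));   // false
  }
}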
 @Override
 public boolean equals(Object obj) {
   if (sameClassAs(obj) == false) {
     return false;
   }
   TypeQuery that = (TypeQuery) obj;
   return type.equals(that.type);
 }
  @Override
  public boolean equals(Object obj) {
    if (obj == this) return true;

    if (obj instanceof Token) {
      final Token other = (Token) obj;
      return (flags == other.flags
          && (payload == null ? other.payload == null : payload.equals(other.payload))
          && super.equals(obj));
    } else return false;
  }
 @Override
 public boolean equals(Object obj) {
   if (this == obj) return true;
   if (obj == null) return false;
   if (getClass() != obj.getClass()) return false;
   Term other = (Term) obj;
   if (field == null) {
     if (other.field != null) return false;
   } else if (!field.equals(other.field)) return false;
   if (bytes == null) {
     if (other.bytes != null) return false;
   } else if (!bytes.equals(other.bytes)) return false;
   return true;
 }
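The field-by-field null guards in the equals methods above (lowerTerm/upperTerm, field/bytes)
follow the classic hand-written pattern. For comparison, a minimal sketch of the equivalent shape
using java.util.Objects.equals on a hypothetical two-field class; this is not how the Lucene
classes are written, just the same null semantics in a more compact form:

import java.util.Objects;

class NullSafeEqualsSketch {
  private final String field;
  private final String text;

  NullSafeEqualsSketch(String field, String text) {
    this.field = field;
    this.text = text;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null || getClass() != obj.getClass()) return false;
    NullSafeEqualsSketch other = (NullSafeEqualsSketch) obj;
    // Objects.equals returns true when both are null, otherwise delegates to equals()
    return Objects.equals(field, other.field) && Objects.equals(text, other.text);
  }

  @Override
  public int hashCode() {
    return Objects.hash(field, text);
  }
}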
 @Override
 public void finishTerm(BytesRef text, TermStats stats) throws IOException {
   assert state == TermsConsumerState.START;
   state = TermsConsumerState.INITIAL;
   assert text.equals(lastTerm);
   assert stats.docFreq > 0; // otherwise, this method should not be called.
   assert stats.docFreq == lastPostingsConsumer.docFreq;
   sumDocFreq += stats.docFreq;
   if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
     assert stats.totalTermFreq == -1;
   } else {
     assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
     sumTotalTermFreq += stats.totalTermFreq;
   }
   in.finishTerm(text, stats);
 }
 @Override
 public String next() throws IOException {
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END)) {
       current = null;
       return null;
     }
     if (StringHelper.startsWith(scratch, FIELD)) {
       return current =
           new String(
               scratch.bytes,
               scratch.offset + FIELD.length,
               scratch.length - FIELD.length,
               "UTF-8");
     }
   }
 }
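next() above decodes only the bytes after the FIELD prefix as UTF-8. A tiny standalone sketch of
that slice-decoding step using StandardCharsets (the byte buffer and "field " prefix here are made
up for illustration):

import java.nio.charset.StandardCharsets;

public class Utf8SliceDecodeSketch {
  public static void main(String[] args) {
    // hypothetical line buffer with a "field " prefix, mirroring scratch/FIELD above
    byte[] line = "field title".getBytes(StandardCharsets.UTF_8);
    int prefixLen = "field ".length();

    // decode only the bytes after the prefix as UTF-8
    String value = new String(line, prefixLen, line.length - prefixLen, StandardCharsets.UTF_8);
    System.out.println(value); // prints "title"
  }
}

Using the Charset overload avoids the checked UnsupportedEncodingException that the
string-named "UTF-8" overload in the snippet above has to tolerate.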
  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    BytesRef scratch = new BytesRef();
    TermFreqIterator iter =
        new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
    IntsRef scratchInts = new IntsRef();
    BytesRef previous = null;
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.next()) != null) {
      long cost = iter.weight();

      if (previous == null) {
        previous = new BytesRef();
      } else if (scratch.equals(previous)) {
        // for duplicate suggestions, the best weight is actually added
        continue;
      }
      Util.toIntsRef(scratch, scratchInts);
      builder.add(scratchInts, cost);
      previous.copyBytes(scratch);
    }
    fst = builder.finish();
  }
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
   /*
   PrintStream ps = new PrintStream("out.dot");
   fst.toDot(ps);
   ps.close();
   System.out.println("SAVED out.dot");
   */
   // System.out.println("FST " + fst.sizeInBytes());
 }
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);
        }
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it when the upper bound is not inclusive, then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
 @Override
 public boolean equalsSameType(Object other) {
   MutableValueStr b = (MutableValueStr) other;
   return value.equals(b.value) && exists == b.exists;
 }
  @Override
  public Facet facet() {
    if (current != null) {
      missing += current.counts[0];
      total += current.total - current.counts[0];
      // if we have values for this one, add it
      if (current.values.ordinals().getNumOrds() > 1) {
        aggregators.add(current);
      }
    }

    AggregatorPriorityQueue queue = new AggregatorPriorityQueue(aggregators.size());

    for (ReaderAggregator aggregator : aggregators) {
      if (aggregator.nextPosition()) {
        queue.add(aggregator);
      }
    }

    // YUCK, we repeat the same logic, but once with an optimized priority queue for smaller sizes
    if (size < EntryPriorityQueue.LIMIT) {
      // optimized path: use a priority queue bounded by the requested size
      EntryPriorityQueue ordered = new EntryPriorityQueue(size, comparatorType.comparator());

      while (queue.size() > 0) {
        ReaderAggregator agg = queue.top();
        // we need to makeSafe it, since we end up pushing it... (can we get around this?)
        BytesRef value = agg.values.makeSafe(agg.current);
        int count = 0;
        do {
          count += agg.counts[agg.position];
          if (agg.nextPosition()) {
            agg = queue.updateTop();
          } else {
            // we are done with this reader
            queue.pop();
            agg = queue.top();
          }
        } while (agg != null && value.equals(agg.current));

        if (count > minCount) {
          if (excluded != null && excluded.contains(value)) {
            continue;
          }
          // LUCENE 4 UPGRADE: use Lucene's RegexCapabilities
          if (matcher != null && !matcher.reset(value.utf8ToString()).matches()) {
            continue;
          }
          InternalStringTermsFacet.TermEntry entry =
              new InternalStringTermsFacet.TermEntry(value, count);
          ordered.insertWithOverflow(entry);
        }
      }
      InternalStringTermsFacet.TermEntry[] list =
          new InternalStringTermsFacet.TermEntry[ordered.size()];
      for (int i = ordered.size() - 1; i >= 0; i--) {
        list[i] = (InternalStringTermsFacet.TermEntry) ordered.pop();
      }

      for (ReaderAggregator aggregator : aggregators) {
        CacheRecycler.pushIntArray(aggregator.counts);
      }

      return new InternalStringTermsFacet(
          facetName, comparatorType, size, Arrays.asList(list), missing, total);
    }

    BoundedTreeSet<InternalStringTermsFacet.TermEntry> ordered =
        new BoundedTreeSet<InternalStringTermsFacet.TermEntry>(comparatorType.comparator(), size);

    while (queue.size() > 0) {
      ReaderAggregator agg = queue.top();
      // we need to makeSafe it, since we end up pushing it... (can we work around that?)
      BytesRef value = agg.values.makeSafe(agg.current);
      int count = 0;
      do {
        count += agg.counts[agg.position];
        if (agg.nextPosition()) {
          agg = queue.updateTop();
        } else {
          // we are done with this reader
          queue.pop();
          agg = queue.top();
        }
      } while (agg != null && value.equals(agg.current));

      if (count > minCount) {
        if (excluded != null && excluded.contains(value)) {
          continue;
        }
        // LUCENE 4 UPGRADE: use Lucene's RegexCapabilities
        if (matcher != null && !matcher.reset(value.utf8ToString()).matches()) {
          continue;
        }
        InternalStringTermsFacet.TermEntry entry =
            new InternalStringTermsFacet.TermEntry(value, count);
        ordered.add(entry);
      }
    }

    for (ReaderAggregator aggregator : aggregators) {
      CacheRecycler.pushIntArray(aggregator.counts);
    }

    return new InternalStringTermsFacet(facetName, comparatorType, size, ordered, missing, total);
  }