 // we don't actually write a .fdx-like index; instead we read the
 // stored fields file in its entirety up-front and save the offsets
 // so we can seek to the documents later.
 private void readIndex(int size) throws IOException {
   ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
   offsets = new long[size];
   int upto = 0;
   while (!scratch.get().equals(END)) {
     SimpleTextUtil.readLine(input, scratch);
     if (StringHelper.startsWith(scratch.get(), DOC)) {
       offsets[upto] = input.getFilePointer();
       upto++;
     }
   }
   SimpleTextUtil.checkFooter(input);
   assert upto == offsets.length;
 }
 private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
     throws IOException {
   readLine();
   assert StringHelper.startsWith(scratch.get(), VALUE);
   if (type == TYPE_STRING) {
     byte[] bytes = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
     visitor.stringField(fieldInfo, bytes);
   } else if (type == TYPE_BINARY) {
     byte[] copy = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
     visitor.binaryField(fieldInfo, copy);
   } else if (type == TYPE_INT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString()));
   } else if (type == TYPE_LONG) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString()));
   } else if (type == TYPE_FLOAT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString()));
   } else if (type == TYPE_DOUBLE) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString()));
   }
 }
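For context, here is the visitor side of this API: a minimal, hypothetical StoredFieldVisitor (not part of the snippet above) that collects a single string field and then stops, matching the byte[]-based stringField callback this reader invokes.

import java.nio.charset.StandardCharsets;

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.StoredFieldVisitor;

// Hypothetical visitor: grabs the value of one stored string field, then
// tells the reader to stop visiting the remaining fields of the document.
final class SingleStringFieldVisitor extends StoredFieldVisitor {
  private final String fieldName;
  private String value;

  SingleStringFieldVisitor(String fieldName) {
    this.fieldName = fieldName;
  }

  @Override
  public Status needsField(FieldInfo fieldInfo) {
    if (fieldInfo.name.equals(fieldName)) {
      return Status.YES;
    }
    // once the value has been read there is nothing left to visit
    return value != null ? Status.STOP : Status.NO;
  }

  @Override
  public void stringField(FieldInfo fieldInfo, byte[] bytes) {
    value = new String(bytes, StandardCharsets.UTF_8);
  }

  String getValue() {
    return value;
  }
}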
Example #3
 @Override
 public BytesRef next() throws IOException {
   boolean success = false;
   if (done) {
     return null;
   }
   try {
     ByteArrayDataInput input = new ByteArrayDataInput();
     if (reader.read(scratch)) {
       final BytesRef bytes = scratch.get();
       weight = decode(bytes, input);
       if (hasPayloads) {
         payload = decodePayload(bytes, input);
       }
       if (hasContexts) {
         contexts = decodeContexts(bytes, input);
       }
       success = true;
       return bytes;
     }
     close();
     success = done = true;
     return null;
   } finally {
     if (!success) {
       done = true;
       close();
     }
   }
 }
 @Override
 public BytesRef indexedValueForSearch(Object value) {
   long longValue = NumericUtils.doubleToSortableLong(parseDoubleValue(value));
   BytesRefBuilder bytesRef = new BytesRefBuilder();
   NumericUtils.longToPrefixCoded(longValue, 0, bytesRef); // 0 because of exact match
   return bytesRef.get();
 }
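As a quick illustration of what this encoding does, here is a hedged round-trip sketch against the legacy (pre-6.0) NumericUtils API that this snippet targets; the literal 3.5 is arbitrary.

// Sketch: encode a double the same way indexedValueForSearch does, then decode it back.
BytesRefBuilder bytesRef = new BytesRefBuilder();
long sortable = NumericUtils.doubleToSortableLong(3.5);
NumericUtils.longToPrefixCoded(sortable, 0, bytesRef); // shift 0 = full precision, exact match
long decoded = NumericUtils.prefixCodedToLong(bytesRef.get());
assert NumericUtils.sortableLongToDouble(decoded) == 3.5;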
Example #5
      /* Decodes only the term bytes of the next term.  If caller then asks for
      metadata, i.e. docFreq, totalTermFreq, or pulls a D/&PEnum, we then (lazily)
      decode all metadata up to the current term. */
      private BytesRef _next() throws IOException {
        // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" +
        // state.termBlockOrd + " (vs " + blockTermCount + ")");
        if (state.termBlockOrd == blockTermCount && !nextBlock()) {
          // System.out.println("  eof");
          indexIsCurrent = false;
          return null;
        }

        // TODO: cutover to something better for these ints!  simple64?
        final int suffix = termSuffixesReader.readVInt();
        // System.out.println("  suffix=" + suffix);

        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        state.termBlockOrd++;

        // NOTE: meaningless in the non-ord case
        state.ord++;

        // System.out.println("  return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " +
        // term + " tbOrd=" + state.termBlockOrd);
        return term.get();
      }
 @Override
 public BytesRef indexedValueForSearch(Object value) {
   int intValue = NumericUtils.floatToSortableInt(parseValue(value));
   BytesRefBuilder bytesRef = new BytesRefBuilder();
   NumericUtils.intToPrefixCoded(intValue, 0, bytesRef); // 0 because of exact match
   return bytesRef.get();
 }
 @Override
 public BytesRef indexedValueForSearch(Object value) {
   BytesRefBuilder bytesRef = new BytesRefBuilder();
   LegacyNumericUtils.intToPrefixCoded(
       parseValue(value), 0, bytesRef); // 0 because of exact match
   return bytesRef.get();
 }
 public void testRandomReads() throws IOException {
   int length = randomIntBetween(10, scaledRandomIntBetween(PAGE_SIZE * 2, PAGE_SIZE * 20));
   BytesReference pbr = newBytesReference(length);
   StreamInput streamInput = pbr.streamInput();
   BytesRefBuilder target = new BytesRefBuilder();
   while (target.length() < pbr.length()) {
     switch (randomIntBetween(0, 10)) {
       case 6:
       case 5:
         target.append(new BytesRef(new byte[] {streamInput.readByte()}));
         break;
       case 4:
       case 3:
         BytesRef bytesRef =
             streamInput.readBytesRef(scaledRandomIntBetween(1, pbr.length() - target.length()));
         target.append(bytesRef);
         break;
       default:
         byte[] buffer = new byte[scaledRandomIntBetween(1, pbr.length() - target.length())];
         int offset = scaledRandomIntBetween(0, buffer.length - 1);
         int read = streamInput.read(buffer, offset, buffer.length - offset);
         target.append(new BytesRef(buffer, offset, read));
         break;
     }
   }
   assertEquals(pbr.length(), target.length());
   BytesRef targetBytes = target.get();
   assertArrayEquals(
       pbr.toBytes(),
       Arrays.copyOfRange(targetBytes.bytes, targetBytes.offset, targetBytes.length));
 }
  @Override
  public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
    in.seek(offsets[n]);

    while (true) {
      readLine();
      if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
        break;
      }
      int fieldNumber = parseIntAt(FIELD.length);
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
      readLine();
      assert StringHelper.startsWith(scratch.get(), NAME);
      readLine();
      assert StringHelper.startsWith(scratch.get(), TYPE);

      final BytesRef type;
      if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
        type = TYPE_STRING;
      } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
        type = TYPE_BINARY;
      } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
        type = TYPE_INT;
      } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
        type = TYPE_LONG;
      } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
        type = TYPE_FLOAT;
      } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
        type = TYPE_DOUBLE;
      } else {
        throw new RuntimeException("unknown field type");
      }

      switch (visitor.needsField(fieldInfo)) {
        case YES:
          readField(type, fieldInfo, visitor);
          break;
        case NO:
          readLine();
          assert StringHelper.startsWith(scratch.get(), VALUE);
          break;
        case STOP:
          return;
      }
    }
  }
 @Override
 public BytesRef next() throws IOException {
   if (++curPos < entries.size()) {
     entries.get(spare, curPos);
     return spare.get();
   }
   return null;
 }
Example #11
 /** Unmarshals a string-based field value. */
 protected static Object unmarshalStringSortValue(Object value) {
   if (null == value) {
     return null;
   }
   BytesRefBuilder spare = new BytesRefBuilder();
   String stringVal = (String) value;
   spare.copyChars(stringVal);
   return spare.get();
 }
 @Override
 public BytesRef getBytesRef() {
   assert valueSize == 64 || valueSize == 32;
   if (valueSize == 64) {
     NumericUtils.longToPrefixCoded(value, shift, bytes);
   } else {
     NumericUtils.intToPrefixCoded((int) value, shift, bytes);
   }
   return bytes.get();
 }
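The shift argument above is what drives the legacy numeric trie: each precision step re-encodes the same value with more low bits masked off. A hedged sketch of how one long value fans out into trie terms (the precisionStep of 16 is a hypothetical choice, not taken from this code):

// Sketch: emit one prefix-coded term per precision step for a single long value.
long value = 1234567890L;
int precisionStep = 16; // hypothetical; real fields use their configured step
BytesRefBuilder b = new BytesRefBuilder();
for (int shift = 0; shift < 64; shift += precisionStep) {
  NumericUtils.longToPrefixCoded(value, shift, b);
  // b.get() is the trie term at this precision; shift == 0 is the exact-match term
}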
    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
      if (segmentFacetCounts != null) {
        segmentResults.add(createSegmentResult());
      }

      groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
      facetFieldTermsIndex = DocValues.getSorted(context.reader(), facetField);

      // 1+ to allow for the -1 "not set":
      segmentFacetCounts = new int[facetFieldTermsIndex.getValueCount() + 1];
      segmentTotalCount = 0;

      segmentGroupedFacetHits.clear();
      for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
        int facetOrd =
            groupedFacetHit.facetValue == null
                ? -1
                : facetFieldTermsIndex.lookupTerm(groupedFacetHit.facetValue);
        if (groupedFacetHit.facetValue != null && facetOrd < 0) {
          continue;
        }

        int groupOrd =
            groupedFacetHit.groupValue == null
                ? -1
                : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
        if (groupedFacetHit.groupValue != null && groupOrd < 0) {
          continue;
        }

        int segmentGroupedFacetsIndex =
            groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
        segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
      }

      if (facetPrefix != null) {
        startFacetOrd = facetFieldTermsIndex.lookupTerm(facetPrefix);
        if (startFacetOrd < 0) {
          // Points to the ord one higher than facetPrefix
          startFacetOrd = -startFacetOrd - 1;
        }
        BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
        facetEndPrefix.append(facetPrefix);
        facetEndPrefix.append(UnicodeUtil.BIG_TERM);
        endFacetOrd = facetFieldTermsIndex.lookupTerm(facetEndPrefix.get());
        assert endFacetOrd < 0;
        endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
      } else {
        startFacetOrd = -1;
        endFacetOrd = facetFieldTermsIndex.getValueCount();
      }
    }
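The facetEndPrefix logic above relies on a common bounding trick: UnicodeUtil.BIG_TERM is a run of 0xff bytes that sorts after any valid UTF-8 term, so prefix + BIG_TERM can never be an indexed term and lookupTerm is guaranteed to return a negative insertion point, which decodes to the first ord past every term carrying the prefix. A hedged sketch (sorted is a hypothetical SortedDocValues):

// Sketch: compute the ord range [startOrd, endOrd) covering all terms with prefix "foo".
BytesRefBuilder end = new BytesRefBuilder();
end.copyChars("foo");
int startOrd = sorted.lookupTerm(end.get());
if (startOrd < 0) {
  startOrd = -startOrd - 1; // first ord >= "foo"
}
end.append(UnicodeUtil.BIG_TERM); // now sorts after anything starting with "foo"
int endOrd = -sorted.lookupTerm(end.get()) - 1; // result is always negative: never indexed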
 @Override
 public void seekExact(long ord) throws IOException {
   // TODO: would be better to make this simpler and faster.
    // but we don't want to introduce a bug that corrupts our enum state!
   bytesReader.setPosition(0);
   fst.getFirstArc(firstArc);
   IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
   BytesRefBuilder scratchBytes = new BytesRefBuilder();
   scratchBytes.clear();
   Util.toBytesRef(output, scratchBytes);
   // TODO: we could do this lazily, better to try to push into FSTEnum though?
   in.seekExact(scratchBytes.get());
 }
     // NOTE: while it's tempting to make this public, since the
     // caller's parser likely knows the
     // numInputWords/numOutputWords, sneaky exceptions, much later
     // on, will result if these values are wrong; so we always
     // recompute ourselves to be safe:
    private void add(
        CharsRef input,
        int numInputWords,
        CharsRef output,
        int numOutputWords,
        boolean includeOrig) {
      // validate the arguments first; the output is converted to UTF-8 below
      if (numInputWords <= 0) {
        throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
      }
      if (input.length <= 0) {
        throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
      }
      if (numOutputWords <= 0) {
        throw new IllegalArgumentException(
            "numOutputWords must be > 0 (got " + numOutputWords + ")");
      }
      if (output.length <= 0) {
        throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
      }

      assert !hasHoles(input) : "input has holes: " + input;
      assert !hasHoles(output) : "output has holes: " + output;

      // System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + "
      // output=" + output + " numOutputWords=" + numOutputWords);
      utf8Scratch.copyChars(output.chars, output.offset, output.length);
      // lookup in hash
      int ord = words.add(utf8Scratch.get());
      if (ord < 0) {
        // already exists in our hash
        ord = (-ord) - 1;
        // System.out.println("  output=" + output + " old ord=" + ord);
      } else {
        // System.out.println("  output=" + output + " new ord=" + ord);
      }

      MapEntry e = workingSet.get(input);
      if (e == null) {
        e = new MapEntry();
        workingSet.put(
            CharsRef.deepCopyOf(input), e); // make a copy, since we will keep around in our map
      }

      e.ords.add(ord);
      e.includeOrig |= includeOrig;
      maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
      maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
    }
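For reference, callers normally reach this private method through the public SynonymMap.Builder entry points; a hedged sketch of typical usage (multi-word sides are joined with Builder.join, which inserts the word separator the map expects):

// Sketch: build a small synonym map; true = dedup duplicate rules.
// build() throws IOException, so run this inside a method that declares it.
SynonymMap.Builder builder = new SynonymMap.Builder(true);
CharsRefBuilder scratch = new CharsRefBuilder();
builder.add(new CharsRef("usa"),
    SynonymMap.Builder.join(new String[] {"united", "states"}, scratch),
    true); // includeOrig: keep "usa" itself in the token stream
SynonymMap map = builder.build();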
 public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
   final BytesRefBuilder scratch = new BytesRefBuilder();
   final BytesRefBuilder scratch1 = new BytesRefBuilder();
   final int numValues = bytes.size();
   assert indices.length >= numValues;
   if (numValues <= 1) {
     return numValues;
   }
   sort(scratch, scratch1, bytes, indices);
   int uniqueCount = 1;
   BytesRefBuilder previous = scratch;
   BytesRefBuilder current = scratch1;
   bytes.get(previous, indices[0]);
   for (int i = 1; i < numValues; ++i) {
     bytes.get(current, indices[i]);
     if (!previous.get().equals(current.get())) {
       indices[uniqueCount++] = indices[i];
     }
     BytesRefBuilder tmp = previous;
     previous = current;
     current = tmp;
   }
   return uniqueCount;
 }
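A hedged usage sketch for the helper above; the identity initialization of indices is an assumption about the expected calling convention, since sort(...) only reorders the entries it is given:

// Sketch: three appended values dedup down to two unique ones.
BytesRefArray bytes = new BytesRefArray(Counter.newCounter());
bytes.append(new BytesRef("b"));
bytes.append(new BytesRef("a"));
bytes.append(new BytesRef("b"));
int[] indices = new int[bytes.size()];
for (int i = 0; i < indices.length; i++) {
  indices[i] = i;
}
int unique = sortAndDedup(bytes, indices);
// unique == 2; indices[0..unique) now reference "a" and "b" in sorted order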
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   // if (DEBUG) {
   //   System.out.println("BTTR.seekExact termState seg=" + segment + " target=" +
   // target.utf8ToString() + " " + target + " state=" + otherState);
   // }
   assert clearEOF();
   if (target.compareTo(term.get()) != 0 || !termExists) {
     assert otherState != null && otherState instanceof BlockTermState;
     currentFrame = staticFrame;
     currentFrame.state.copyFrom(otherState);
     term.copyBytes(target);
     currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
     assert currentFrame.metaDataUpto > 0;
     validIndexPrefix = 0;
   } else {
     // if (DEBUG) {
     //   System.out.println("  skip seek: already on target state=" + currentFrame.state);
     // }
   }
 }
Example #18
  public static void displayPointRanges(Scanner in) {
    double[][] point = getPoints(in, 1);
    long hash, hashUpper;
    double lon, lat, lonUpper, latUpper;

    for (int i = 63; i >= 45; i -= GeoPointField.PRECISION_STEP) {
      BytesRefBuilder brb = new BytesRefBuilder();
      NumericUtils.longToPrefixCoded(
          GeoUtils.mortonHash(point[0][LON_INDEX], point[0][LAT_INDEX]), i, brb);
      BytesRef br = brb.get();
      hash = NumericUtils.prefixCodedToLong(br);
      hashUpper = hash | ((1L << i) - 1);
      lon = GeoUtils.mortonUnhashLon(hash);
      lat = GeoUtils.mortonUnhashLat(hash);
      lonUpper = GeoUtils.mortonUnhashLon(hashUpper);
      latUpper = GeoUtils.mortonUnhashLat(hashUpper);
      System.out.println(
          i + ": " + br + " " + hash + " (" + lon + "," + lat + ")" + " : " + "(" + lonUpper + ","
              + latUpper + ")");
    }
  }
  @Override
  public Fields get(int doc) throws IOException {
    SortedMap<String, SimpleTVTerms> fields = new TreeMap<>();
    in.seek(offsets[doc]);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NUMFIELDS);
    int numFields = parseIntAt(NUMFIELDS.length);
    if (numFields == 0) {
      return null; // no vectors for this doc
    }
    for (int i = 0; i < numFields; i++) {
      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELD);
      // skip fieldNumber:
      parseIntAt(FIELD.length);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDNAME);
      String fieldName = readString(FIELDNAME.length, scratch);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPOSITIONS);
      boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDOFFSETS);
      boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPAYLOADS);
      boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDTERMCOUNT);
      int termCount = parseIntAt(FIELDTERMCOUNT.length);

      SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
      fields.put(fieldName, terms);

      BytesRefBuilder term = new BytesRefBuilder();
      for (int j = 0; j < termCount; j++) {
        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMTEXT);
        int termLength = scratch.length() - TERMTEXT.length;
        term.grow(termLength);
        term.setLength(termLength);
        System.arraycopy(scratch.bytes(), TERMTEXT.length, term.bytes(), 0, termLength);

        SimpleTVPostings postings = new SimpleTVPostings();
        terms.terms.put(term.toBytesRef(), postings);

        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMFREQ);
        postings.freq = parseIntAt(TERMFREQ.length);

        if (positions || offsets) {
          if (positions) {
            postings.positions = new int[postings.freq];
            if (payloads) {
              postings.payloads = new BytesRef[postings.freq];
            }
          }

          if (offsets) {
            postings.startOffsets = new int[postings.freq];
            postings.endOffsets = new int[postings.freq];
          }

          for (int k = 0; k < postings.freq; k++) {
            if (positions) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), POSITION);
              postings.positions[k] = parseIntAt(POSITION.length);
              if (payloads) {
                readLine();
                assert StringHelper.startsWith(scratch.get(), PAYLOAD);
                if (scratch.length() - PAYLOAD.length == 0) {
                  postings.payloads[k] = null;
                } else {
                  byte[] payloadBytes = new byte[scratch.length() - PAYLOAD.length];
                  System.arraycopy(
                      scratch.bytes(), PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
                  postings.payloads[k] = new BytesRef(payloadBytes);
                }
              }
            }

            if (offsets) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), STARTOFFSET);
              postings.startOffsets[k] = parseIntAt(STARTOFFSET.length);

              readLine();
              assert StringHelper.startsWith(scratch.get(), ENDOFFSET);
              postings.endOffsets[k] = parseIntAt(ENDOFFSET.length);
            }
          }
        }
      }
    }
    return new SimpleTVFields(fields);
  }
 @Override
 public BytesRef term() {
   assert !eof;
   return term.get();
 }
  /* Decodes only the term bytes of the next term.  If caller then asks for
  metadata, i.e. docFreq, totalTermFreq, or pulls a D/&PEnum, we then (lazily)
  decode all metadata up to the current term. */
  @Override
  public BytesRef next() throws IOException {

    if (in == null) {
      // Fresh TermsEnum; seek to first term:
      final FST.Arc<Pair<BytesRef, Long>> arc;
      if (fr.index != null) {
        arc = fr.index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }
      currentFrame = pushFrame(arc, fr.rootCode, 0);
      currentFrame.loadBlock();
    }

    targetBeforeCurrentLength = currentFrame.ord;

    assert !eof;
    // if (DEBUG) {
    // System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + "
    // termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" +
    // currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
    // printSeekState();
    // }

    if (currentFrame == staticFrame) {
      // If seek was previously called and the term was
      // cached, or seek(TermState) was called, usually
      // caller is just going to pull a D/&PEnum or get
      // docFreq, etc.  But, if they then call next(),
      // this method catches up all internal state so next()
      // works properly:
      // if (DEBUG) System.out.println("  re-seek to pending term=" + term.utf8ToString() + " " +
      // term);
      final boolean result = seekExact(term.get());
      assert result;
    }

    // Pop finished blocks
    while (currentFrame.nextEnt == currentFrame.entCount) {
      if (!currentFrame.isLastInFloor) {
        currentFrame.loadNextFloorBlock();
      } else {
        // if (DEBUG) System.out.println("  pop frame");
        if (currentFrame.ord == 0) {
          // if (DEBUG) System.out.println("  return null");
          assert setEOF();
          term.clear();
          validIndexPrefix = 0;
          currentFrame.rewind();
          termExists = false;
          return null;
        }
        final long lastFP = currentFrame.fpOrig;
        currentFrame = stack[currentFrame.ord - 1];

        if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
          // We popped into a frame that's not loaded
          // yet or not scan'd to the right entry
          currentFrame.scanToFloorFrame(term.get());
          currentFrame.loadBlock();
          currentFrame.scanToSubBlock(lastFP);
        }

        // Note that the seek state (last seek) has been
        // invalidated beyond this depth
        validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
        // if (DEBUG) {
        // System.out.println("  reset validIndexPrefix=" + validIndexPrefix);
        // }
      }
    }

    while (true) {
      if (currentFrame.next()) {
        // Push to new block:
        // if (DEBUG) System.out.println("  push frame");
        currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length());
        // This is a "next" frame -- even if it's
        // floor'd we must pretend it isn't so we don't
        // try to scan to the right floor frame:
        currentFrame.isFloor = false;
        // currentFrame.hasTerms = true;
        currentFrame.loadBlock();
      } else {
        // if (DEBUG) System.out.println("  return term=" + term.utf8ToString() + " " + term + "
        // currentFrame.ord=" + currentFrame.ord);
        return term.get();
      }
    }
  }
Example #22
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; e.g. FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND, so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesReader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }
Example #23
 @Override
 public BytesRef term() {
   return term.get();
 }
  public static NamedList<Integer> getCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String fieldName,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase)
      throws IOException {
    SchemaField schemaField = searcher.getSchema().getField(fieldName);
    FieldType ft = schemaField.getType();
    NamedList<Integer> res = new NamedList<>();

    // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
    final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

    final SortedSetDocValues si; // for term lookups only
    OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
    if (multiValued) {
      si = searcher.getLeafReader().getSortedSetDocValues(fieldName);
      if (si instanceof MultiSortedSetDocValues) {
        ordinalMap = ((MultiSortedSetDocValues) si).mapping;
      }
    } else {
      SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName);
      si = single == null ? null : DocValues.singleton(single);
      if (single instanceof MultiSortedDocValues) {
        ordinalMap = ((MultiSortedDocValues) single).mapping;
      }
    }
    if (si == null) {
      return finalize(res, searcher, schemaField, docs, -1, missing);
    }
    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new UnsupportedOperationException(
          "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
    }

    final BytesRefBuilder prefixRef;
    if (prefix == null) {
      prefixRef = null;
    } else if (prefix.length() == 0) {
      prefix = null;
      prefixRef = null;
    } else {
      prefixRef = new BytesRefBuilder();
      prefixRef.copyChars(prefix);
    }

    int startTermIndex, endTermIndex;
    if (prefix != null) {
      startTermIndex = (int) si.lookupTerm(prefixRef.get());
      if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
      prefixRef.append(UnicodeUtil.BIG_TERM);
      endTermIndex = (int) si.lookupTerm(prefixRef.get());
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = -1;
      endTermIndex = (int) si.getValueCount();
    }

    final int nTerms = endTermIndex - startTermIndex;
    int missingCount = -1;
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    if (nTerms > 0 && docs.size() >= mincount) {

      // count collection array only needs to be as big as the number of terms we are
      // going to collect counts for.
      final int[] counts = new int[nTerms];

      Filter filter = docs.getTopFilter();
      List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
      for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
        LeafReaderContext leaf = leaves.get(subIndex);
        DocIdSet dis =
            filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
        DocIdSetIterator disi = null;
        if (dis != null) {
          disi = dis.iterator();
        }
        if (disi != null) {
          if (multiValued) {
            SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
            if (sub == null) {
              sub = DocValues.emptySortedSet();
            }
            final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
            if (singleton != null) {
              // some codecs may optimize SORTED_SET storage for single-valued fields
              accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
            } else {
              accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
            }
          } else {
            SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
            if (sub == null) {
              sub = DocValues.emptySorted();
            }
            accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        }
      }

      if (startTermIndex == -1) {
        missingCount = counts[0];
      }

      // IDEA: we could also maintain a count of "other"... everything that fell outside
      // of the top 'N'

      int off = offset;
      int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

      if (sort.equals(FacetParams.FACET_SORT_COUNT)
          || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
        int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
        maxsize = Math.min(maxsize, nTerms);
        LongPriorityQueue queue =
            new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

        int min = mincount - 1; // the smallest value in the top 'N' values
        for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
          int c = counts[i];
          if (contains != null) {
            final BytesRef term = si.lookupOrd(startTermIndex + i);
            if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
              continue;
            }
          }
          if (c > min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // smaller term numbers sort higher, so subtract the term number instead
            long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
            boolean displaced = queue.insert(pair);
            if (displaced) min = (int) (queue.top() >>> 32);
          }
        }

        // if we are deep paging, we don't have to order the highest "offset" counts.
        int collectCount = Math.max(0, queue.size() - off);
        assert collectCount <= lim;

        // the start and end indexes of our list "sorted" (starting with the highest value)
        int sortedIdxStart = queue.size() - (collectCount - 1);
        int sortedIdxEnd = queue.size() + 1;
        final long[] sorted = queue.sort(collectCount);

        for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
          long pair = sorted[i];
          int c = (int) (pair >>> 32);
          int tnum = Integer.MAX_VALUE - (int) pair;
          final BytesRef term = si.lookupOrd(startTermIndex + tnum);
          ft.indexedToReadable(term, charsRef);
          res.add(charsRef.toString(), c);
        }

      } else {
        // add results in index order
        int i = (startTermIndex == -1) ? 1 : 0;
        if (mincount <= 0 && contains == null) {
          // if mincount<=0 and we're not examining the values for contains, then
          // we won't discard any terms and we know exactly where to start.
          i += off;
          off = 0;
        }

        for (; i < nTerms; i++) {
          int c = counts[i];
          if (c < mincount) continue;
          BytesRef term = null;
          if (contains != null) {
            term = si.lookupOrd(startTermIndex + i);
            if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
              continue;
            }
          }
          if (--off >= 0) continue;
          if (--lim < 0) break;
          if (term == null) {
            term = si.lookupOrd(startTermIndex + i);
          }
          ft.indexedToReadable(term, charsRef);
          res.add(charsRef.toString(), c);
        }
      }
    }

    return finalize(res, searcher, schemaField, docs, missingCount, missing);
  }
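The count-sorted branch above packs each (count, term index) pair into a single long so that one primitive comparison in the priority queue orders first by count and then, for equal counts, by ascending term index. A hedged sketch of the encode/decode round trip used there:

// Sketch: count in the high 32 bits; subtracting the term index from
// Integer.MAX_VALUE makes smaller indexes sort higher on equal counts.
int c = 42, i = 7;
long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
int count = (int) (pair >>> 32);           // 42
int tnum = Integer.MAX_VALUE - (int) pair; // 7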
  @Override
  public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context)
      throws IOException {
    String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
    final IndexInput in = dir.openInput(dataFile, context);

    BytesRefBuilder scratch = new BytesRefBuilder();

    // first get to TOC:
    DecimalFormat df =
        new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
    long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
    in.seek(pos);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLEPOS);
    long tablePos = -1;
    try {
      tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
    } catch (ParseException e) {
      throw new CorruptIndexException(
          "can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in);
    }

    // seek to TOC and read it
    in.seek(tablePos);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLE);
    int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));

    final String[] fileNames = new String[numEntries];
    final long[] startOffsets = new long[numEntries];
    final long[] endOffsets = new long[numEntries];

    for (int i = 0; i < numEntries; i++) {
      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLENAME);
      fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));

      if (i > 0) {
        // files must be unique and in sorted order
        assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
      }

      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLESTART);
      startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));

      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLEEND);
      endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
    }

    return new Directory() {

      private int getIndex(String name) throws IOException {
        int index = Arrays.binarySearch(fileNames, name);
        if (index < 0) {
          throw new FileNotFoundException(
              "No sub-file found (fileName="
                  + name
                  + " files: "
                  + Arrays.toString(fileNames)
                  + ")");
        }
        return index;
      }

      @Override
      public String[] listAll() throws IOException {
        ensureOpen();
        return fileNames.clone();
      }

      @Override
      public long fileLength(String name) throws IOException {
        ensureOpen();
        int index = getIndex(name);
        return endOffsets[index] - startOffsets[index];
      }

      @Override
      public IndexInput openInput(String name, IOContext context) throws IOException {
        ensureOpen();
        int index = getIndex(name);
        return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]);
      }

      @Override
      public void close() throws IOException {
        in.close();
      }

      // write methods: disabled

      @Override
      public IndexOutput createOutput(String name, IOContext context) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void sync(Collection<String> names) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void deleteFile(String name) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void renameFile(String source, String dest) {
        throw new UnsupportedOperationException();
      }

      @Override
      public Lock makeLock(String name) {
        throw new UnsupportedOperationException();
      }
    };
  }
 @Override
 public BytesRef getBytesRef() {
   return bytes.get();
 }
    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
      if (segmentFacetCounts != null) {
        segmentResults.add(createSegmentResult());
      }

      groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
      facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
      facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
      if (facetFieldNumTerms == 0) {
        facetOrdTermsEnum = null;
      } else {
        facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
      }
      // [facetFieldNumTerms + 1] covers all possible facet values plus docs not
      // containing the facet field
      segmentFacetCounts = new int[facetFieldNumTerms + 1];
      segmentTotalCount = 0;

      segmentGroupedFacetHits.clear();
      for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
        int groupOrd =
            groupedFacetHit.groupValue == null
                ? -1
                : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
        if (groupedFacetHit.groupValue != null && groupOrd < 0) {
          continue;
        }

        int facetOrd;
        if (groupedFacetHit.facetValue != null) {
          if (facetOrdTermsEnum == null
              || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
            continue;
          }
          facetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          facetOrd = facetFieldNumTerms;
        }

        // (facetFieldNumTerms + 1) slots: all possible facet values plus docs not
        // containing the facet field
        int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
        segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
      }

      if (facetPrefix != null) {
        TermsEnum.SeekStatus seekStatus;
        if (facetOrdTermsEnum != null) {
          seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
        } else {
          seekStatus = TermsEnum.SeekStatus.END;
        }

        if (seekStatus != TermsEnum.SeekStatus.END) {
          startFacetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          startFacetOrd = 0;
          endFacetOrd = 0;
          return;
        }

        BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
        facetEndPrefix.append(facetPrefix);
        facetEndPrefix.append(UnicodeUtil.BIG_TERM);
        seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
        if (seekStatus != TermsEnum.SeekStatus.END) {
          endFacetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          endFacetOrd = facetFieldNumTerms; // Don't include null...
        }
      } else {
        startFacetOrd = 0;
        endFacetOrd = facetFieldNumTerms + 1;
      }
    }