Example No. 1
    @Override
    public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
      final long numOrds = globalOrdinals.getValueCount();
      final LongBitSet acceptedGlobalOrdinals = new LongBitSet(numOrds);
      final TermsEnum termEnum = globalOrdinals.termsEnum();

      BytesRef term = termEnum.next();
      while (term != null) {
        if (Math.floorMod(
                StringHelper.murmurhash3_x86_32(term, HASH_PARTITIONING_SEED), incNumPartitions)
            == incZeroBasedPartition) {
          acceptedGlobalOrdinals.set(termEnum.ord());
        }
        term = termEnum.next();
      }
      return acceptedGlobalOrdinals;
    }
Example No. 2
 @Override
 public boolean accept(BytesRef value) {
   return Math.floorMod(
           StringHelper.murmurhash3_x86_32(value, HASH_PARTITIONING_SEED), incNumPartitions)
       == incZeroBasedPartition;
 }
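Note: both hash-partitioning examples above apply the same check: hash the term with murmurhash3 and keep it only if it falls into the requested partition. A minimal standalone sketch of that check follows; the seed, partition count, and class/method names are illustrative and not taken from the original code.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

public class HashPartitionCheck {
  // illustrative seed; the examples above use their own HASH_PARTITIONING_SEED constant
  private static final int SEED = 0xCAFEBABE;

  /** Returns true if the term falls into the given 0-based partition out of numPartitions. */
  static boolean inPartition(BytesRef term, int numPartitions, int partition) {
    // murmurhash3_x86_32 can be negative, so floorMod keeps the result in [0, numPartitions)
    return Math.floorMod(StringHelper.murmurhash3_x86_32(term, SEED), numPartitions) == partition;
  }

  public static void main(String[] args) {
    System.out.println(inPartition(new BytesRef("elasticsearch"), 20, 3));
  }
}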
Example No. 3
    @Override
    public int nextPosition() throws IOException {
      final int pos;
      if (readPositions) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + POS.length,
            scratch.length - POS.length,
            scratchUTF16_2);
        pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      } else {
        pos = -1;
      }

      if (readOffsets) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, START_OFFSET)
            : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + START_OFFSET.length,
            scratch.length - START_OFFSET.length,
            scratchUTF16_2);
        startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + END_OFFSET.length,
            scratch.length - END_OFFSET.length,
            scratchUTF16_2);
        endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      }

      final long fp = in.getFilePointer();
      SimpleTextUtil.readLine(in, scratch);
      if (StringHelper.startsWith(scratch, PAYLOAD)) {
        final int len = scratch.length - PAYLOAD.length;
        if (scratch2.bytes.length < len) {
          scratch2.grow(len);
        }
        System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
        scratch2.length = len;
        payload = scratch2;
      } else {
        payload = null;
        in.seek(fp);
      }
      return pos;
    }
Example No. 4
 @Override
 public int nextDoc() throws IOException {
   if (docID == NO_MORE_DOCS) {
     return docID;
   }
   boolean first = true;
   int termFreq = 0;
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);
     if (StringHelper.startsWith(scratch, DOC)) {
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       termFreq = 0;
       first = false;
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, POS)) {
       // skip termFreq++;
     } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
       // skip
     } else {
       assert StringHelper.startsWith(scratch, TERM)
               || StringHelper.startsWith(scratch, FIELD)
               || StringHelper.startsWith(scratch, END)
           : "scratch=" + scratch.utf8ToString();
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       return docID = NO_MORE_DOCS;
     }
   }
 }
Example No. 5
   private void getPrefixTerms(
      ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
     // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
     // into one terms instance, which is very expensive. Therefore I think it is better to iterate
     // over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
      Terms _terms = leaf.reader().terms(field);
      if (_terms == null) {
        continue;
      }

      TermsEnum termsEnum = _terms.iterator();
      TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
      if (TermsEnum.SeekStatus.END == seekStatus) {
        continue;
      }

      for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
        if (!StringHelper.startsWith(term, prefix.bytes())) {
          break;
        }

        terms.add(new Term(field, BytesRef.deepCopyOf(term)));
        if (terms.size() >= maxExpansions) {
          return;
        }
      }
    }
  }
Example No. 6
    /**
     * The termCompare method in FuzzyTermEnum uses Levenshtein distance to calculate the distance
     * between the given term and the comparing term.
     *
     * <p>If the minSimilarity is &gt;= 1.0, this uses the maxEdits as the comparison. Otherwise,
     * this method uses the following logic to calculate similarity.
     *
     * <pre>
     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
     *   </pre>
     *
     * where distance is the Levenshtein distance for the two words.
     */
    @Override
    protected final AcceptStatus accept(BytesRef term) {
      if (StringHelper.startsWith(term, prefixBytesRef)) {
        utf32.copyUTF8Bytes(term);
        final int distance =
            calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength);

        // Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
        if (distance == Integer.MIN_VALUE) {
          return AcceptStatus.NO;
        }
        // no need to calc similarity, if raw is true and distance > maxEdits
        if (raw == true && distance > maxEdits) {
          return AcceptStatus.NO;
        }
        final float similarity =
            calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length);

        // if raw is true, then distance must also be <= maxEdits by now
        // given the previous if statement
        if (raw == true || (raw == false && similarity > minSimilarity)) {
          boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
          return AcceptStatus.YES;
        } else {
          return AcceptStatus.NO;
        }
      } else {
        return AcceptStatus.END;
      }
    }
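Note: the Javadoc above quotes the similarity formula that is used when minSimilarity is below 1.0. Here is a small standalone sketch of just that formula (a hypothetical helper, not part of FuzzyTermEnum), handy for sanity-checking boost values:

public class FuzzySimilarityDemo {
  /** similarity = 1 - distance / (prefixLength + min(textLen, targetLen)) */
  static float similarity(int distance, int prefixLength, int textLen, int targetLen) {
    return 1.0f - ((float) distance / (float) (prefixLength + Math.min(textLen, targetLen)));
  }

  public static void main(String[] args) {
    // distance 1 between two 6-char terms with no required prefix: 1 - 1/6 ≈ 0.833
    System.out.println(similarity(1, 0, 6, 6));
  }
}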
Example No. 7
 private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
     throws IOException {
   readLine();
   assert StringHelper.startsWith(scratch.get(), VALUE);
   if (type == TYPE_STRING) {
     byte[] bytes = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
     visitor.stringField(fieldInfo, bytes);
   } else if (type == TYPE_BINARY) {
     byte[] copy = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
     visitor.binaryField(fieldInfo, copy);
   } else if (type == TYPE_INT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString()));
   } else if (type == TYPE_LONG) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString()));
   } else if (type == TYPE_FLOAT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString()));
   } else if (type == TYPE_DOUBLE) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString()));
   }
 }
Example No. 8
  /** Test attributes map */
  public void testAttributes() throws Exception {
    Directory dir = newDirectory();
    Codec codec = getCodec();
    byte id[] = StringHelper.randomId();
    Map<String, String> attributes = new HashMap<>();
    attributes.put("key1", "value1");
    attributes.put("key2", "value2");
    SegmentInfo info =
        new SegmentInfo(
            dir,
            getVersions()[0],
            "_123",
            1,
            false,
            codec,
            Collections.emptyMap(),
            id,
            attributes,
            null);
    info.setFiles(Collections.<String>emptySet());
    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
    SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
    assertEquals(attributes, info2.getAttributes());

    // attributes map should be immutable
    expectThrows(
        UnsupportedOperationException.class,
        () -> {
          info2.getAttributes().put("bogus", "bogus");
        });

    dir.close();
  }
Example No. 9
 @Override
 public int nextDoc() throws IOException {
   boolean first = true;
   in.seek(nextDocStart);
   long posStart = 0;
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);
     // System.out.println("NEXT DOC: " + scratch.utf8ToString());
     if (StringHelper.startsWith(scratch, DOC)) {
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         nextDocStart = lineStart;
         in.seek(posStart);
         return docID;
       }
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       tf = 0;
       first = false;
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       posStart = in.getFilePointer();
     } else if (StringHelper.startsWith(scratch, POS)) {
       // skip
     } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
       // skip
     } else {
       assert StringHelper.startsWith(scratch, TERM)
           || StringHelper.startsWith(scratch, FIELD)
           || StringHelper.startsWith(scratch, END);
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         nextDocStart = lineStart;
         in.seek(posStart);
         return docID;
       }
       return docID = NO_MORE_DOCS;
     }
   }
 }
Example No. 10
  @Override
  public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
    in.seek(offsets[n]);

    while (true) {
      readLine();
      if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
        break;
      }
      int fieldNumber = parseIntAt(FIELD.length);
      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
      readLine();
      assert StringHelper.startsWith(scratch.get(), NAME);
      readLine();
      assert StringHelper.startsWith(scratch.get(), TYPE);

      final BytesRef type;
      if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
        type = TYPE_STRING;
      } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
        type = TYPE_BINARY;
      } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
        type = TYPE_INT;
      } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
        type = TYPE_LONG;
      } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
        type = TYPE_FLOAT;
      } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
        type = TYPE_DOUBLE;
      } else {
        throw new RuntimeException("unknown field type");
      }

      switch (visitor.needsField(fieldInfo)) {
        case YES:
          readField(type, fieldInfo, visitor);
          break;
        case NO:
          readLine();
          assert StringHelper.startsWith(scratch.get(), VALUE);
          break;
        case STOP:
          return;
      }
    }
  }
Example No. 11
  /** initialize native resources */
  public static void initializeNatives(
      Path tmpFile, boolean mlockAll, boolean seccomp, boolean ctrlHandler) {
    final ESLogger logger = Loggers.getLogger(Bootstrap.class);

    // check if the user is running as root, and bail
    if (Natives.definitelyRunningAsRoot()) {
      if (Boolean.parseBoolean(System.getProperty("es.insecure.allow.root"))) {
        logger.warn("running as ROOT user. this is a bad idea!");
      } else {
        throw new RuntimeException("don't run elasticsearch as root.");
      }
    }

    // enable secure computing mode
    if (seccomp) {
      Natives.trySeccomp(tmpFile);
    }

    // mlockall if requested
    if (mlockAll) {
      if (Constants.WINDOWS) {
        Natives.tryVirtualLock();
      } else {
        Natives.tryMlockall();
      }
    }

    // listener for windows close event
    if (ctrlHandler) {
      Natives.addConsoleCtrlHandler(
          new ConsoleCtrlHandler() {
            @Override
            public boolean handle(int code) {
              if (CTRL_CLOSE_EVENT == code) {
                logger.info("running graceful exit on windows");
                try {
                  Bootstrap.stop();
                } catch (IOException e) {
                  throw new ElasticsearchException("failed to stop node", e);
                }
                return true;
              }
              return false;
            }
          });
    }

    // force remainder of JNA to be loaded (if available).
    try {
      JNAKernel32Library.getInstance();
    } catch (Throwable ignored) {
      // we've already logged this.
    }

    // init lucene random seed. it will use /dev/urandom where available:
    StringHelper.randomId();
  }
Example No. 12
 private BytesRef setTerm() throws IOException {
   term = termsEnum.term();
   // System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix ==
   // null ? "null" : prefix.utf8ToString()));
   if (prefix != null && !StringHelper.startsWith(term, prefix)) {
     term = null;
   }
   return term;
 }
Example No. 13
  private void writeField() throws IOException {
    // remember where this field is written
    currentField.tvfPointer = tvf.getFilePointer();
    // System.out.println("Field Pointer: " + currentField.tvfPointer);

    final int size = terms.size();
    tvf.writeVInt(size);

    boolean storePositions = currentField.storePositions;
    boolean storeOffsets = currentField.storeOffsets;
    byte bits = 0x0;
    if (storePositions) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    if (storeOffsets) bits |= STORE_OFFSET_WITH_TERMVECTOR;
    tvf.writeByte(bits);

    String lastTermText = "";
    for (int i = 0; i < size; i++) {
      TVTerm term = (TVTerm) terms.elementAt(i);
      int start = StringHelper.stringDifference(lastTermText, term.termText);
      int length = term.termText.length() - start;
      tvf.writeVInt(start); // write shared prefix length
      tvf.writeVInt(length); // write delta length
      tvf.writeChars(term.termText, start, length); // write delta chars
      tvf.writeVInt(term.freq);
      lastTermText = term.termText;

      if (storePositions) {
        if (term.positions == null)
          throw new IllegalStateException("Trying to write positions that are null!");

        // use delta encoding for positions
        int position = 0;
        for (int j = 0; j < term.freq; j++) {
          tvf.writeVInt(term.positions[j] - position);
          position = term.positions[j];
        }
      }

      if (storeOffsets) {
        if (term.offsets == null)
          throw new IllegalStateException("Trying to write offsets that are null!");

        // use delta encoding for offsets
        int position = 0;
        for (int j = 0; j < term.freq; j++) {
          tvf.writeVInt(term.offsets[j].getStartOffset() - position);
          tvf.writeVInt(
              term.offsets[j].getEndOffset()
                  - term.offsets[j].getStartOffset()); // Save the diff between the two.
          position = term.offsets[j].getEndOffset();
        }
      }
    }
  }
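Note: the positions and offsets in the example above are written as deltas against the previous value, which keeps the written vInts small. A minimal sketch of that delta-encoding idea in isolation (hypothetical helpers over plain int arrays, not Lucene's IndexOutput):

import java.util.Arrays;

public class DeltaEncodingDemo {
  /** Encode ascending positions as gaps from the previous position. */
  static int[] encode(int[] positions) {
    int[] deltas = new int[positions.length];
    int prev = 0;
    for (int i = 0; i < positions.length; i++) {
      deltas[i] = positions[i] - prev; // small gaps compress well as vInts
      prev = positions[i];
    }
    return deltas;
  }

  /** Rebuild the original positions from the gaps. */
  static int[] decode(int[] deltas) {
    int[] positions = new int[deltas.length];
    int prev = 0;
    for (int i = 0; i < deltas.length; i++) {
      prev += deltas[i];
      positions[i] = prev;
    }
    return positions;
  }

  public static void main(String[] args) {
    int[] deltas = encode(new int[] {3, 7, 20, 21}); // [3, 4, 13, 1]
    System.out.println(Arrays.toString(decode(deltas))); // [3, 7, 20, 21]
  }
}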
Example No. 14
 @Override
 public boolean isPrefixOf(Cell c) {
   NRCell otherCell = (NRCell) c;
   assert term != otherCell.term;
   // trick to re-use bytesref; provided that we re-instate it
   int myLastLen = term.length;
   term.length = termLenByLevel[getLevel()];
   int otherLastLen = otherCell.term.length;
   otherCell.term.length = termLenByLevel[otherCell.getLevel()];
   boolean answer = StringHelper.startsWith(otherCell.term, term);
   term.length = myLastLen;
   otherCell.term.length = otherLastLen;
   return answer;
 }
Example No. 15
 // we don't actually write a .fdx-like index, instead we read the
 // stored fields file in entirety up-front and save the offsets
 // so we can seek to the documents later.
 private void readIndex(int size) throws IOException {
   ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
   offsets = new long[size];
   int upto = 0;
   while (!scratch.get().equals(END)) {
     SimpleTextUtil.readLine(input, scratch);
     if (StringHelper.startsWith(scratch.get(), DOC)) {
       offsets[upto] = input.getFilePointer();
       upto++;
     }
   }
   SimpleTextUtil.checkFooter(input);
   assert upto == offsets.length;
 }
Example No. 16
 /**
  * Returns true if the term matches the automaton. Also stashes away the term to assist with smart
  * enumeration.
  */
 @Override
 protected AcceptStatus accept(final BytesRef term) {
   if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) {
     if (runAutomaton.run(term.bytes, term.offset, term.length))
       return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
     else
       return (linear && termComp.compare(term, linearUpperBound) < 0)
           ? AcceptStatus.NO
           : AcceptStatus.NO_AND_SEEK;
   } else {
     return (linear && termComp.compare(term, linearUpperBound) < 0)
         ? AcceptStatus.NO
         : AcceptStatus.NO_AND_SEEK;
   }
 }
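Note: the accept method above runs the compiled automaton directly over the term bytes. A minimal sketch of constructing and running a ByteRunAutomaton on its own (Lucene 4.x/5.x-era API, matching the other examples on this page; the regular expression is purely illustrative):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class RunAutomatonDemo {
  public static void main(String[] args) {
    // byte-level run automaton for a simple regular expression
    ByteRunAutomaton runAutomaton = new ByteRunAutomaton(new RegExp("foo.*").toAutomaton());
    BytesRef term = new BytesRef("foobar");
    System.out.println(runAutomaton.run(term.bytes, term.offset, term.length)); // true
  }
}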
Example No. 17
 @Override
 public String next() throws IOException {
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END)) {
       current = null;
       return null;
     }
     if (StringHelper.startsWith(scratch, FIELD)) {
       return current =
           new String(
               scratch.bytes,
               scratch.offset + FIELD.length,
               scratch.length - FIELD.length,
               "UTF-8");
     }
   }
 }
Example No. 18
  /**
   * Test segment infos read that hits an exception on close; make sure we get our exception back,
   * no file handle leaks, etc.
   */
  public void testExceptionOnCloseInput() throws Exception {
    Failure fail =
        new Failure() {
          @Override
          public void eval(MockDirectoryWrapper dir) throws IOException {
            for (StackTraceElement e : Thread.currentThread().getStackTrace()) {
              if (doFail && "close".equals(e.getMethodName())) {
                throw new FakeIOException();
              }
            }
          }
        };

    MockDirectoryWrapper dir = newMockDirectory();
    dir.failOn(fail);
    Codec codec = getCodec();
    byte id[] = StringHelper.randomId();
    SegmentInfo info =
        new SegmentInfo(
            dir,
            getVersions()[0],
            "_123",
            1,
            false,
            codec,
            Collections.<String, String>emptyMap(),
            id,
            new HashMap<>(),
            null);
    info.setFiles(Collections.<String>emptySet());
    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);

    fail.setDoFail();
    expectThrows(
        FakeIOException.class,
        () -> {
          codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
        });
    fail.clearDoFail();

    dir.close();
  }
Example No. 19
 /** Test files map */
 public void testFiles() throws Exception {
   Directory dir = newDirectory();
   Codec codec = getCodec();
   byte id[] = StringHelper.randomId();
   SegmentInfo info =
       new SegmentInfo(
           dir,
           getVersions()[0],
           "_123",
           1,
           false,
           codec,
           Collections.<String, String>emptyMap(),
           id,
           new HashMap<>(),
           null);
   info.setFiles(Collections.<String>emptySet());
   codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
   SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
   assertEquals(info.files(), info2.files());
   dir.close();
 }
Example No. 20
  /** Test sort */
  public void testSort() throws IOException {
    assumeTrue("test requires a codec that can read/write index sort", supportsIndexSort());

    final int iters = atLeast(5);
    for (int i = 0; i < iters; ++i) {
      Sort sort;
      if (i == 0) {
        sort = null;
      } else {
        final int numSortFields = TestUtil.nextInt(random(), 1, 3);
        SortField[] sortFields = new SortField[numSortFields];
        for (int j = 0; j < numSortFields; ++j) {
          sortFields[j] = randomIndexSortField();
        }
        sort = new Sort(sortFields);
      }

      Directory dir = newDirectory();
      Codec codec = getCodec();
      byte id[] = StringHelper.randomId();
      SegmentInfo info =
          new SegmentInfo(
              dir,
              getVersions()[0],
              "_123",
              1,
              false,
              codec,
              Collections.<String, String>emptyMap(),
              id,
              new HashMap<>(),
              sort);
      info.setFiles(Collections.<String>emptySet());
      codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
      SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
      assertEquals(sort, info2.getIndexSort());
      dir.close();
    }
  }
Example No. 21
  @Override
  public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
      throws IOException {
    int prefixLength = prefix.length();
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null) {
      Matcher matcher = pattern.matcher("");
      try {
        TermsEnum termsEnum = terms.iterator(null);

        TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
        BytesRef text;
        if (status == TermsEnum.SeekStatus.FOUND) {
          text = prefixRef;
        } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
          text = termsEnum.term();
        } else {
          text = null;
        }

        while (text != null) {
          if (text != null && StringHelper.startsWith(text, prefixRef)) {
            String textString = text.utf8ToString();
            matcher.reset(textString.substring(prefixLength));
            if (matcher.matches()) {
              mtv.visitMatchingTerm(new Term(fieldName, textString));
            }
          } else {
            break;
          }
          text = termsEnum.next();
        }
      } finally {
        matcher.reset();
      }
    }
  }
Example No. 22
  /** Tests SI writer adds itself to files... */
  public void testAddsSelfToFiles() throws Exception {
    Directory dir = newDirectory();
    Codec codec = getCodec();
    byte id[] = StringHelper.randomId();
    SegmentInfo info =
        new SegmentInfo(
            dir,
            getVersions()[0],
            "_123",
            1,
            false,
            codec,
            Collections.<String, String>emptyMap(),
            id,
            new HashMap<>(),
            null);
    Set<String> originalFiles = Collections.singleton("_123.a");
    info.setFiles(originalFiles);
    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);

    Set<String> modifiedFiles = info.files();
    assertTrue(modifiedFiles.containsAll(originalFiles));
    assertTrue(
        "did you forget to add yourself to files()", modifiedFiles.size() > originalFiles.size());

    SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
    assertEquals(info.files(), info2.files());

    // files set should be immutable
    expectThrows(
        UnsupportedOperationException.class,
        () -> {
          info2.files().add("bogus");
        });

    dir.close();
  }
Example No. 23
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
Example No. 24
  public void testMerge() throws IOException {
    final Codec codec = Codec.getDefault();
    final SegmentInfo si =
        new SegmentInfo(
            mergedDir,
            Version.LATEST,
            mergedSegment,
            -1,
            false,
            codec,
            Collections.emptyMap(),
            StringHelper.randomId(),
            new HashMap<>());

    SegmentMerger merger =
        new SegmentMerger(
            Arrays.<CodecReader>asList(reader1, reader2),
            si,
            InfoStream.getDefault(),
            mergedDir,
            new FieldInfos.FieldNumbers(),
            newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
    MergeState mergeState = merger.merge();
    int docsMerged = mergeState.segmentInfo.maxDoc();
    assertTrue(docsMerged == 2);
    // Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader =
        new SegmentReader(
            new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L),
            newIOContext(random()));
    assertTrue(mergedReader != null);
    assertTrue(mergedReader.numDocs() == 2);
    Document newDoc1 = mergedReader.document(0);
    assertTrue(newDoc1 != null);
    // There are 2 unstored fields on the document
    assertTrue(
        DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - DocHelper.unstored.size());
    Document newDoc2 = mergedReader.document(1);
    assertTrue(newDoc2 != null);
    assertTrue(
        DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - DocHelper.unstored.size());

    PostingsEnum termDocs =
        TestUtil.docs(
            random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
    assertTrue(termDocs != null);
    assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    int tvCount = 0;
    for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
      if (fieldInfo.hasVectors()) {
        tvCount++;
      }
    }

    // System.out.println("stored size: " + stored.size());
    assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

    Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
    assertNotNull(vector);
    assertEquals(3, vector.size());
    TermsEnum termsEnum = vector.iterator();

    int i = 0;
    while (termsEnum.next() != null) {
      String term = termsEnum.term().utf8ToString();
      int freq = (int) termsEnum.totalTermFreq();
      // System.out.println("Term: " + term + " Freq: " + freq);
      assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
      assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
      i++;
    }

    TestSegmentReader.checkNorms(mergedReader);
    mergedReader.close();
  }
Example No. 25
 public JSONWriter(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp) {
   super(writer, req, rsp);
   namedListStyle = StringHelper.intern(req.getParams().get(JSON_NL_STYLE, JSON_NL_FLAT));
   wrapperFunction = req.getParams().get(JSON_WRAPPER_FUNCTION);
 }
Example No. 26
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (params.getBool(TermsParams.TERMS, false)) {
      String lowerStr = params.get(TermsParams.TERMS_LOWER, null);
      String[] fields = params.getParams(TermsParams.TERMS_FIELD);
      if (fields != null && fields.length > 0) {
        NamedList terms = new SimpleOrderedMap();
        rb.rsp.add("terms", terms);
        int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
        if (limit < 0) {
          limit = Integer.MAX_VALUE;
        }
        String upperStr = params.get(TermsParams.TERMS_UPPER);
        boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
        boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
        boolean sort =
            !TermsParams.TERMS_SORT_INDEX.equals(
                params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
        int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); // initialize freqmin
        int freqmax =
            params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); // initialize freqmax
        if (freqmax < 0) {
          freqmax = Integer.MAX_VALUE;
        }
        String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
        String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
        Pattern pattern =
            regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

        boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
        for (int j = 0; j < fields.length; j++) {
          String field = StringHelper.intern(fields[j]);
          FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
          if (ft == null) ft = new StrField();

          // If no lower bound was specified, use the prefix
          String lower = lowerStr == null ? prefix : (raw ? lowerStr : ft.toInternal(lowerStr));
          if (lower == null) lower = "";
          String upper = upperStr == null ? null : (raw ? upperStr : ft.toInternal(upperStr));

          Term lowerTerm = new Term(field, lower);
          Term upperTerm = upper == null ? null : new Term(field, upper);

          TermEnum termEnum =
              rb.req
                  .getSearcher()
                  .getReader()
                  .terms(lowerTerm); // this will be positioned ready to go
          int i = 0;
          BoundedTreeSet<CountPair<String, Integer>> queue =
              (sort ? new BoundedTreeSet<CountPair<String, Integer>>(limit) : null);
          NamedList fieldTerms = new NamedList();
          terms.add(field, fieldTerms);
          Term lowerTestTerm = termEnum.term();

          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerTestTerm != null
              && lowerIncl == false
              && lowerTestTerm.field() == field // intern'd comparison
              && lowerTestTerm.text().equals(lower)) {
            termEnum.next();
          }

          while (i < limit || sort) {

            Term theTerm = termEnum.term();

            // check for a different field, or the end of the index.
            if (theTerm == null || field != theTerm.field()) { // intern'd comparison
              break;
            }

            String indexedText = theTerm.text();

            // stop if the prefix doesn't match
            if (prefix != null && !indexedText.startsWith(prefix)) break;

            if (pattern != null && !pattern.matcher(indexedText).matches()) {
              termEnum.next();
              continue;
            }

            if (upperTerm != null) {
              int upperCmp = theTerm.compareTo(upperTerm);
              // if we are past the upper term, or equal to it (when don't include upper) then stop.
              if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
            }

            // This is a good term in the range.  Check if mincount/maxcount conditions are
            // satisfied.
            int docFreq = termEnum.docFreq();
            if (docFreq >= freqmin && docFreq <= freqmax) {
              // add the term to the list
              String label = raw ? indexedText : ft.indexedToReadable(indexedText);
              if (sort) {
                queue.add(new CountPair<String, Integer>(label, docFreq));
              } else {
                fieldTerms.add(label, docFreq);
                i++;
              }
            }

            termEnum.next();
          }

          termEnum.close();

          if (sort) {
            for (CountPair<String, Integer> item : queue) {
              if (i < limit) {
                fieldTerms.add(item.key, item.val);
                i++;
              } else {
                break;
              }
            }
          }
        }
      } else {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST, "No terms.fl parameter specified");
      }
    }
  }
Example No. 27
  @Override
  public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context)
      throws IOException {
    String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
    final IndexInput in = dir.openInput(dataFile, context);

    BytesRefBuilder scratch = new BytesRefBuilder();

    // first get to TOC:
    DecimalFormat df =
        new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
    long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
    in.seek(pos);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLEPOS);
    long tablePos = -1;
    try {
      tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
    } catch (ParseException e) {
      throw new CorruptIndexException(
          "can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in);
    }

    // seek to TOC and read it
    in.seek(tablePos);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLE);
    int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));

    final String fileNames[] = new String[numEntries];
    final long startOffsets[] = new long[numEntries];
    final long endOffsets[] = new long[numEntries];

    for (int i = 0; i < numEntries; i++) {
      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLENAME);
      fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));

      if (i > 0) {
        // files must be unique and in sorted order
        assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
      }

      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLESTART);
      startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));

      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLEEND);
      endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
    }

    return new Directory() {

      private int getIndex(String name) throws IOException {
        int index = Arrays.binarySearch(fileNames, name);
        if (index < 0) {
          throw new FileNotFoundException(
              "No sub-file found (fileName="
                  + name
                  + " files: "
                  + Arrays.toString(fileNames)
                  + ")");
        }
        return index;
      }

      @Override
      public String[] listAll() throws IOException {
        ensureOpen();
        return fileNames.clone();
      }

      @Override
      public long fileLength(String name) throws IOException {
        ensureOpen();
        int index = getIndex(name);
        return endOffsets[index] - startOffsets[index];
      }

      @Override
      public IndexInput openInput(String name, IOContext context) throws IOException {
        ensureOpen();
        int index = getIndex(name);
        return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]);
      }

      @Override
      public void close() throws IOException {
        in.close();
      }

      // write methods: disabled

      @Override
      public IndexOutput createOutput(String name, IOContext context) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void sync(Collection<String> names) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void deleteFile(String name) {
        throw new UnsupportedOperationException();
      }

      @Override
      public void renameFile(String source, String dest) {
        throw new UnsupportedOperationException();
      }

      @Override
      public Lock makeLock(String name) {
        throw new UnsupportedOperationException();
      }
    };
  }
Example No. 28
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        lowerBytes = new BytesRef();
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);
        }
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
Example No. 29
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
   /*
   PrintStream ps = new PrintStream("out.dot");
   fst.toDot(ps);
   ps.close();
   System.out.println("SAVED out.dot");
   */
   // System.out.println("FST " + fst.sizeInBytes());
 }
Example No. 30
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
     // immediate term numbers, or the index into the byte[] representing the last number
     final int[] index = new int[maxDoc];
     // last term we saw for this document
     final int[] lastTerm = new int[maxDoc];
     // list of term numbers for the doc (delta encoded vInts)
     final byte[][] bytes = new byte[maxDoc][];

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");

          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff) == 1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos << 8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass="******" process docID=" + doc);
            int val = index[doc];
            if ((val & 0xff) == 1) {
              int len = val >>> 8;
              // System.out.println("    ptr pos=" + pos);
              index[doc] = (pos << 8) | 1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException(
                    "Too many values for UnInvertedField faceting on field " + field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /* we don't have to worry about the array getting too large since the "pos" param
                   will overflow first (only 24 bits available)
                if ((newlen << 1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400, "Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                }
                */
                while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }
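Note: the uninvert pass above packs each document's term-number deltas with variable-length int helpers (vIntSize / writeInt, not shown here). The sketch below shows the general variable-byte idea, least-significant 7 bits first with the high bit as a continuation flag; it illustrates the concept and is not necessarily the exact byte layout the helpers above use.

public class VIntDemo {
  /** Writes x into arr starting at pos, 7 data bits per byte; returns the new end position. */
  static int writeVInt(int x, byte[] arr, int pos) {
    while ((x & ~0x7F) != 0) {
      arr[pos++] = (byte) ((x & 0x7F) | 0x80); // high bit set: more bytes follow
      x >>>= 7;
    }
    arr[pos++] = (byte) x; // final byte has the high bit clear
    return pos;
  }

  /** Number of bytes writeVInt would use for x. */
  static int vIntSize(int x) {
    int size = 1;
    while ((x & ~0x7F) != 0) {
      size++;
      x >>>= 7;
    }
    return size;
  }

  public static void main(String[] args) {
    byte[] buf = new byte[5];
    int end = writeVInt(300, buf, 0);
    System.out.println(end + " bytes written, vIntSize reports " + vIntSize(300)); // 2 and 2
  }
}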