Ejemplo n.º 1
0
 /**
  * Writes the include and exclude settings of this object to the given builder.
  *
  * <p>For the include clause the first applicable shape wins: the original string of {@code
  * include}, an array of {@code includeValues} decoded as UTF-8, or an object holding the
  * partition number and partition count when {@code isPartitionBased()} is true. For the exclude
  * clause either the original string of {@code exclude} or an array of {@code excludeValues} is
  * written. Nothing is written for a clause that has none of these set.
  *
  * @param builder the builder to write to
  * @param params serialization parameters (not consulted here)
  * @return the same {@code builder}
  * @throws IOException if the builder fails to write
  */
 @Override
 public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
   // ---- include clause ----
   if (include != null) {
     final String includeText = include.getOriginalString();
     builder.field(INCLUDE_FIELD.getPreferredName(), includeText);
   } else if (includeValues != null) {
     builder.startArray(INCLUDE_FIELD.getPreferredName());
     for (BytesRef included : includeValues) {
       builder.value(included.utf8ToString());
     }
     builder.endArray();
   } else if (isPartitionBased()) {
     builder.startObject(INCLUDE_FIELD.getPreferredName());
     builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
     builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
     builder.endObject();
   }

   // ---- exclude clause ----
   if (exclude != null) {
     final String excludeText = exclude.getOriginalString();
     builder.field(EXCLUDE_FIELD.getPreferredName(), excludeText);
     return builder;
   }
   if (excludeValues == null) {
     return builder;
   }
   builder.startArray(EXCLUDE_FIELD.getPreferredName());
   for (BytesRef excluded : excludeValues) {
     builder.value(excluded.utf8ToString());
   }
   builder.endArray();
   return builder;
 }
 /**
  * Checks that {@code SubstrFunction.substring} yields the expected text for two ranges and that
  * the byte arrays of both results compare equal to the byte array of the source.
  */
 @Test
 public void testNoCopy() throws Exception {
   final BytesRef source = new BytesRef("i do not want to be copied!");

   final BytesRef head = SubstrFunction.substring(source, 0, 10);
   final BytesRef middle = SubstrFunction.substring(source, 5, 14);

   final String headText = head.utf8ToString();
   final String middleText = middle.utf8ToString();
   assertThat(headText, is("i do not w"));
   assertThat(middleText, is("not want "));

   // Both results must expose the same backing bytes as the source.
   assertThat(source.bytes, allOf(is(middle.bytes), is(head.bytes)));
 }
  /**
   * Verifies the token stream of a {@code ContextSuggestField} with two contexts, first with a
   * plain {@code MockAnalyzer} and then with that analyzer wrapped in a {@code
   * CompletionAnalyzer}. Each expected token is {@code context + CONTEXT_SEPARATOR + SEP_LABEL +
   * "input"}, and each token's type is expected to equal the encoded payload text.
   */
  @Test
  public void testTokenStream() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    ContextSuggestField field =
        new ContextSuggestField("field", "input", 1, "context1", "context2");

    // Encode the expected payload: surface form length + bytes, a vInt of 2, and the field type.
    BytesRef surfaceForm = new BytesRef("input");
    ByteArrayOutputStream encodedPayload = new ByteArrayOutputStream();
    try (OutputStreamDataOutput output = new OutputStreamDataOutput(encodedPayload)) {
      output.writeVInt(surfaceForm.length);
      output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
      output.writeVInt(1 + 1);
      output.writeByte(ContextSuggestField.TYPE);
    }
    final String payloadText = new BytesRef(encodedPayload.toByteArray()).utf8ToString();

    // One expected token per context.
    final String[] contexts = {"context1", "context2"};
    String[] expectedOutputs = new String[2];
    CharsRefBuilder builder = new CharsRefBuilder();
    for (int i = 0; i < contexts.length; i++) {
      builder.clear();
      builder.append(contexts[i]);
      builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
      builder.append(((char) CompletionAnalyzer.SEP_LABEL));
      builder.append("input");
      expectedOutputs[i] = builder.toCharsRef().toString();
    }

    // Plain analyzer.
    TokenStream stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            field.tokenStream(analyzer, null));
    assertTokenStreamContents(
        stream,
        expectedOutputs,
        null,
        null,
        new String[] {payloadText, payloadText},
        new int[] {1, 1},
        null,
        null);

    // Same analyzer wrapped in a CompletionAnalyzer.
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
    stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            field.tokenStream(completionAnalyzer, null));
    assertTokenStreamContents(
        stream,
        expectedOutputs,
        null,
        null,
        new String[] {payloadText, payloadText},
        new int[] {1, 1},
        null,
        null);
  }
Ejemplo n.º 4
0
    /**
     * Reads the next position record from the input.
     *
     * <p>Depending on {@code readPositions} and {@code readOffsets}, this consumes a position
     * line and/or a start-offset and end-offset line, then peeks at the following line: if it is
     * a payload line its bytes are copied into {@code scratch2} and exposed via {@code payload};
     * otherwise {@code payload} is cleared and the input is rewound to the start of that line.
     *
     * @return the parsed position, or {@code -1} when positions are not being read
     * @throws IOException if reading from or seeking the input fails
     */
    @Override
    public int nextPosition() throws IOException {
      final int pos;
      if (readPositions) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + POS.length,
            scratch.length - POS.length,
            scratchUTF16_2);
        pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      } else {
        pos = -1;
      }

      if (readOffsets) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, START_OFFSET)
            : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + START_OFFSET.length,
            scratch.length - START_OFFSET.length,
            scratchUTF16_2);
        startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + END_OFFSET.length,
            scratch.length - END_OFFSET.length,
            scratchUTF16_2);
        endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      }

      // Remember where the next line starts so it can be un-read if it is not a payload.
      final long fp = in.getFilePointer();
      SimpleTextUtil.readLine(in, scratch);
      if (StringHelper.startsWith(scratch, PAYLOAD)) {
        final int len = scratch.length - PAYLOAD.length;
        if (scratch2.bytes.length < len) {
          scratch2.grow(len);
        }
        // Start the copy at scratch.offset, as every other read of scratch in this method does;
        // the payload bytes begin right after the PAYLOAD prefix within the referenced slice.
        System.arraycopy(scratch.bytes, scratch.offset + PAYLOAD.length, scratch2.bytes, 0, len);
        scratch2.length = len;
        payload = scratch2;
      } else {
        payload = null;
        in.seek(fp);
      }
      return pos;
    }
    /**
     * Adds a term-frequency map for the configured field to the hit.
     *
     * <p>Does nothing unless the fetch sub-phase context reports that hit execution is needed.
     * Otherwise the hit field named {@code NAMES[0]} is created if absent, term vectors are
     * fetched for the hit, and a map from each term of the configured field to the frequency
     * reported by its postings is appended to that hit field's values. An {@link IOException}
     * raised while reading the terms is logged at info level and otherwise ignored.
     *
     * @param context the search context
     * @param hitContext the context of the hit being processed
     */
    @Override
    public void hitExecute(SearchContext context, HitContext hitContext) {
      if (!context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded()) {
        return;
      }
      final String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField();

      // Make sure the hit has a fields map and an entry for our hit field.
      if (hitContext.hit().fieldsOrNull() == null) {
        hitContext.hit().fields(new HashMap<>());
      }
      final String hitFieldName = NAMES[0];
      SearchHitField hitField = hitContext.hit().fields().get(hitFieldName);
      if (hitField == null) {
        hitField = new InternalSearchHitField(hitFieldName, new ArrayList<>(1));
        hitContext.hit().fields().put(hitFieldName, hitField);
      }

      final TermVectorsRequest request =
          new TermVectorsRequest(
              context.indexShard().shardId().getIndex().getName(),
              hitContext.hit().type(),
              hitContext.hit().id());
      final TermVectorsResponse termVector =
          TermVectorsService.getTermVectors(context.indexShard(), request);

      try {
        final Map<String, Integer> frequencies = new HashMap<>();
        final TermsEnum terms = termVector.getFields().terms(field).iterator();
        for (BytesRef term = terms.next(); term != null; term = terms.next()) {
          frequencies.put(term.utf8ToString(), terms.postings(null, PostingsEnum.ALL).freq());
        }
        hitField.values().add(frequencies);
      } catch (IOException e) {
        ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName())
            .info("Swallowed exception", e);
      }
    }
Ejemplo n.º 6
0
 /**
  * Find terms in the index based on a prefix. Useful for autocomplete.
  *
  * <p>Returns an empty list when the field has no terms or when no term is greater than or equal
  * to the prefix. The term the enumeration lands on after seeking is itself considered a
  * candidate, and terms shorter than the prefix simply do not match.
  *
  * @param index the index
  * @param fieldName the field
  * @param prefix the prefix we're looking for (null or empty string for all terms)
  * @param sensitive match case-sensitively or not?
  * @param maxResults max. number of results to return (or -1 for all)
  * @return the matching terms
  * @throws RuntimeException wrapping any {@link IOException} raised while reading the index
  */
 public static List<String> findTermsByPrefix(
     LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
   boolean allTerms = prefix == null || prefix.length() == 0;
   if (allTerms) {
     prefix = "";
     sensitive = true; // don't do unnecessary work in this case
   }
   try {
     if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase();
     List<String> results = new ArrayList<>();
     org.apache.lucene.index.Terms terms = index.terms(fieldName);
     if (terms == null) {
       // Field does not exist (or has no terms): nothing can match.
       return results;
     }
     TermsEnum termsEnum = terms.iterator();
     BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
     // Position the enum on the first term >= prefix.
     if (termsEnum.seekCeil(brPrefix) == TermsEnum.SeekStatus.END) {
       return results;
     }
     // Start with the term we are positioned on; calling next() first would skip it.
     // NOTE(review): for insensitive matching the seek uses the desensitized prefix, so terms
     // that sort before it in the index (e.g. capitalized variants) are never visited - confirm
     // whether callers rely on a case-folded field.
     BytesRef term = termsEnum.term();
     while (term != null && (maxResults < 0 || results.size() < maxResults)) {
       String termText = term.utf8ToString();
       String optDesensitized = termText;
       if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
       // regionMatches returns false (instead of throwing) when the term is shorter than prefix.
       if (!allTerms && !optDesensitized.regionMatches(true, 0, prefix, 0, prefix.length())) {
         // Doesn't match prefix or different field; no more matches
         break;
       }
       // Match, add term
       results.add(termText);
       term = termsEnum.next();
     }
     return results;
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
Ejemplo n.º 7
0
 /**
  * Appends a human-readable form of an indexed value to {@code out}.
  *
  * <p>Without a field type the value is appended as UTF-8 decoded text. With a field type the
  * value is converted via {@code ft.indexedToReadable}; if that conversion (or the append) throws,
  * {@code EXCEPTION(val=...)} containing the UTF-8 decoded value is appended instead.
  *
  * @param val the indexed value
  * @param ft the field type used for conversion, or {@code null}
  * @param out the destination
  * @param flags not used by this method
  * @throws IOException if appending to {@code out} fails
  */
 static void writeFieldVal(BytesRef val, FieldType ft, Appendable out, int flags)
     throws IOException {
   if (ft == null) {
     out.append(val.utf8ToString());
     return;
   }
   try {
     final CharsRef readableForm = new CharsRef();
     ft.indexedToReadable(val, readableForm);
     out.append(readableForm);
   } catch (Exception e) {
     out.append("EXCEPTION(val=");
     out.append(val.utf8ToString());
     out.append(")");
   }
 }
Ejemplo n.º 8
0
 /**
  * Advances to the next document by scanning the input one line at a time.
  *
  * <p>A line starting with {@code DOC} either begins the first document seen in this call (its
  * ID is parsed into {@code docID}) or marks the end of the document read so far. In the latter
  * case, if that document passes {@code liveDocs} (or {@code liveDocs} is null), the input is
  * rewound to the start of the new {@code DOC} line and the document is returned; otherwise the
  * new document replaces it and scanning continues. A {@code FREQ} line sets the term frequency;
  * position, offset and payload lines are skipped. Any other line ends the enumeration.
  *
  * @return the current document ID, or {@code NO_MORE_DOCS} when no further document is accepted
  * @throws IOException if reading from or seeking the input fails
  */
 @Override
 public int nextDoc() throws IOException {
   if (docID == NO_MORE_DOCS) {
     return docID;
   }
   // True until the first DOC line of this call has been consumed.
   boolean first = true;
   int termFreq = 0;
   while (true) {
     // Remember the line start so the line can be un-read once we know a document is complete.
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);
     if (StringHelper.startsWith(scratch, DOC)) {
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         // The previous document is complete and accepted: rewind so the next call sees this DOC.
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       // Parse the document ID that follows the DOC prefix.
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       termFreq = 0;
       first = false;
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       // Parse the frequency that follows the FREQ prefix.
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, POS)) {
       // skip termFreq++;
     } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
       // skip
     } else {
       // Anything else must be a TERM, FIELD or END line, which ends this document list.
       assert StringHelper.startsWith(scratch, TERM)
               || StringHelper.startsWith(scratch, FIELD)
               || StringHelper.startsWith(scratch, END)
           : "scratch=" + scratch.utf8ToString();
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         // Return the last accepted document; rewind so the terminating line is seen again.
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       return docID = NO_MORE_DOCS;
     }
   }
 }
Ejemplo n.º 9
0
  /**
   * Emits the next term of {@code termsEnum} as a token.
   *
   * @return {@code true} if a term was available and copied into {@code charTerm}, {@code false}
   *     once the enumeration returns {@code null}
   * @throws IOException if advancing the terms enumeration fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    final BytesRef nextTerm = termsEnum.next();
    if (nextTerm != null) {
      charTerm.setEmpty();
      charTerm.append(nextTerm.utf8ToString());
      return true;
    }
    return false;
  }
 // Debugging helper: renders a BytesRef as its UTF-8 text followed by its default string form.
 @SuppressWarnings("unused")
 static String brToString(BytesRef b) {
   String rendered;
   try {
     final String decoded = b.utf8ToString();
     rendered = decoded + " " + b;
   } catch (Throwable t) {
     // If BytesRef isn't actually UTF8, or it's eg a
     // prefix of UTF8 that ends mid-unicode-char, we
     // fallback to hex:
     rendered = b.toString();
   }
   return rendered;
 }
Ejemplo n.º 11
0
  /**
   * Builds a {@code LongFilter} from this object's include/exclude settings.
   *
   * <p>A partition-based configuration yields a {@code PartitionedLongFilter}. Otherwise every
   * include and exclude value is decoded as UTF-8, parsed with {@code format.parseLong(text,
   * false, null)} and registered as an accepted or rejected value of a {@code
   * SetBackedLongFilter}.
   *
   * @param format the format used to parse each value into a long
   * @return the resulting filter
   */
  public LongFilter convertToLongFilter(DocValueFormat format) {

    if (isPartitionBased()) {
      return new PartitionedLongFilter();
    }

    final boolean hasIncludes = includeValues != null;
    final boolean hasExcludes = excludeValues != null;
    final int acceptCount = hasIncludes ? includeValues.size() : 0;
    final int rejectCount = hasExcludes ? excludeValues.size() : 0;

    final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);
    if (hasIncludes) {
      for (BytesRef accepted : includeValues) {
        final String text = accepted.utf8ToString();
        filter.addAccept(format.parseLong(text, false, null));
      }
    }
    if (hasExcludes) {
      for (BytesRef rejected : excludeValues) {
        final String text = rejected.utf8ToString();
        filter.addReject(format.parseLong(text, false, null));
      }
    }
    return filter;
  }
  /**
   * Verifies the token stream of a {@code SuggestField}, first with a plain {@code MockAnalyzer}
   * and then with that analyzer wrapped in a {@code CompletionAnalyzer}. In both cases a single
   * token {@code "input"} is expected whose type equals the encoded payload text.
   */
  @Test
  public void testTokenStream() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    SuggestField suggestField = new SuggestField("field", "input", 1);

    // Encode the expected payload: surface form length + bytes, a vInt of 2, and the field type.
    BytesRef surfaceForm = new BytesRef("input");
    ByteArrayOutputStream encodedPayload = new ByteArrayOutputStream();
    try (OutputStreamDataOutput output = new OutputStreamDataOutput(encodedPayload)) {
      output.writeVInt(surfaceForm.length);
      output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
      output.writeVInt(1 + 1);
      output.writeByte(SuggestField.TYPE);
    }
    final String payloadText = new BytesRef(encodedPayload.toByteArray()).utf8ToString();

    // Plain analyzer.
    TokenStream stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            suggestField.tokenStream(analyzer, null));
    assertTokenStreamContents(
        stream,
        new String[] {"input"},
        null,
        null,
        new String[] {payloadText},
        new int[] {1},
        null,
        null);

    // Same analyzer wrapped in a CompletionAnalyzer.
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
    stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            suggestField.tokenStream(completionAnalyzer, null));
    assertTokenStreamContents(
        stream,
        new String[] {"input"},
        null,
        null,
        new String[] {payloadText},
        new int[] {1},
        null,
        null);
  }
Ejemplo n.º 13
0
 /**
  * Explains the score of a document based on its join value.
  *
  * @param context the leaf reader context
  * @param doc the document to explain
  * @return a match carrying {@code queryNorm} and the document's join value, or a "Not a match"
  *     explanation when no sorted doc values are available or the document has no ordinal
  * @throws IOException if reading the doc values fails
  */
 @Override
 public Explanation explain(LeafReaderContext context, int doc) throws IOException {
   final SortedDocValues joinValues = DocValues.getSorted(context.reader(), joinField);
   if (joinValues == null) {
     return Explanation.noMatch("Not a match");
   }
   final int ord = joinValues.getOrd(doc);
   if (ord == -1) {
     return Explanation.noMatch("Not a match");
   }
   final BytesRef joinValue = joinValues.lookupOrd(ord);
   final String description = "Score based on join value " + joinValue.utf8ToString();
   return Explanation.match(queryNorm, description);
 }
 public AggregatorValueProc(
     LongIntOpenHashMap facets, Set<BytesRef> excluded, SearchScript script) {
   super(facets);
   this.script = script;
   if (excluded == null || excluded.isEmpty()) {
     this.excluded = null;
   } else {
     this.excluded = new LongOpenHashSet(excluded.size());
     for (BytesRef s : excluded) {
       this.excluded.add(Long.parseLong(s.utf8ToString()));
     }
   }
 }
Ejemplo n.º 15
0
  /**
   * Builds a {@code LongFilter} for double-valued fields from the include/exclude settings.
   *
   * <p>A partition-based configuration yields a {@code PartitionedLongFilter}. Otherwise every
   * include and exclude value is decoded as UTF-8, parsed with {@link Double#parseDouble},
   * converted with {@code NumericUtils.doubleToSortableLong} and registered as an accepted or
   * rejected value of a {@code SetBackedLongFilter}.
   *
   * @return the resulting filter
   */
  public LongFilter convertToDoubleFilter() {
    if (isPartitionBased()) {
      return new PartitionedLongFilter();
    }

    final boolean hasIncludes = includeValues != null;
    final boolean hasExcludes = excludeValues != null;
    final int acceptCount = hasIncludes ? includeValues.size() : 0;
    final int rejectCount = hasExcludes ? excludeValues.size() : 0;

    final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);
    if (hasIncludes) {
      for (BytesRef accepted : includeValues) {
        final double parsed = Double.parseDouble(accepted.utf8ToString());
        filter.addAccept(NumericUtils.doubleToSortableLong(parsed));
      }
    }
    if (hasExcludes) {
      for (BytesRef rejected : excludeValues) {
        final double parsed = Double.parseDouble(rejected.utf8ToString());
        filter.addReject(NumericUtils.doubleToSortableLong(parsed));
      }
    }
    return filter;
  }
Ejemplo n.º 16
0
 /**
  * Converts user-formatted values into the form produced by {@code format.parseBytesRef}.
  *
  * @param endUserFormattedValues the values as supplied by the user, may be {@code null}
  * @param format the format used to parse each value
  * @return the input itself when it is {@code null} or the format is {@code
  *     DocValueFormat.RAW}; otherwise a new sorted set of the parsed values
  */
 private static SortedSet<BytesRef> parseForDocValues(
     SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
   if (endUserFormattedValues == null || format == DocValueFormat.RAW) {
     // Nothing to convert: hand back the caller's set (or null) untouched.
     return endUserFormattedValues;
   }
   final SortedSet<BytesRef> parsedValues = new TreeSet<>();
   for (BytesRef userValue : endUserFormattedValues) {
     final String text = userValue.utf8ToString();
     parsedValues.add(format.parseBytesRef(text));
   }
   return parsedValues;
 }
  /**
   * Collects the terms of one shard.
   *
   * <p>Iterates over the fields of the shard's reader, skipping fields whose name starts with an
   * underscore, and gathers the UTF-8 text of every term of the requested field (or of all
   * remaining fields when the request names none). The whole operation runs while holding
   * {@code termlistMutex}, and the acquired searcher is released when the operation finishes,
   * whether it succeeds or fails.
   *
   * @param request the shard-level request (index, shard id and optional field)
   * @return the response carrying the collected terms
   * @throws ElasticSearchException wrapping any {@link IOException} raised while reading terms
   */
  @Override
  protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
      throws ElasticSearchException {
    synchronized (termlistMutex) {
      InternalIndexShard indexShard =
          (InternalIndexShard)
              indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
      // NOTE(review): result unused; kept in case the call is relied on for a side effect.
      indexShard.store().directory();
      Engine.Searcher searcher = indexShard.searcher();
      try {
        Set<String> set = new CompactHashSet();

        Fields fields = MultiFields.getFields(searcher.reader());
        if (fields != null) {
          for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            String field = it.next();
            // startsWith is safe for an empty field name, unlike charAt(0).
            if (field.startsWith("_")) {
              continue;
            }
            if (request.getField() == null || field.equals(request.getField())) {
              Terms terms = fields.terms(field);
              if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                  set.add(text.utf8ToString());
                }
              }
            }
          }
        }
        return new ShardTermlistResponse(request.index(), request.shardId(), set);
      } catch (IOException ex) {
        throw new ElasticSearchException(ex.getMessage(), ex);
      } finally {
        // Always give the searcher back; otherwise its underlying reader reference leaks.
        searcher.release();
      }
    }
  }
 // Debugging helper: renders a BytesRef as its UTF-8 text followed by its default string form,
 // or "null" for a null reference.
 String brToString(BytesRef b) {
   if (b == null) {
     return "null";
   }
   try {
     final String decoded = b.utf8ToString();
     return decoded + " " + b;
   } catch (Throwable t) {
     // If BytesRef isn't actually UTF8, or it's eg a
     // prefix of UTF8 that ends mid-unicode-char, we
     // fallback to hex:
     return b.toString();
   }
 }
 /**
  * Updates a previous suggestion, matching the exact same text as before. Use this to change the
  * weight or payload of an already added suggestion. If you know this text is not already present
  * you can use {@link #add} instead. After adding or updating a batch of new suggestions, you must
  * call {@link #refresh} in the end in order to see the suggestions in {@link #lookup}
  *
  * @param text the suggestion text; also the key used to find the document to replace
  * @param weight the weight to store for the suggestion
  * @param payload the payload to store, or {@code null} for none
  * @throws IOException if the writer fails to update the document
  */
 public void update(BytesRef text, long weight, BytesRef payload) throws IOException {
   final String suggestionText = text.utf8ToString();
   final Document suggestion = new Document();
   final FieldType textFieldType = getTextFieldType();

   // Text fields.
   suggestion.add(new Field(TEXT_FIELD_NAME, suggestionText, textFieldType));
   suggestion.add(new Field("textgrams", suggestionText, textFieldType));
   suggestion.add(new StringField(EXACT_TEXT_FIELD_NAME, suggestionText, Field.Store.NO));

   // Doc values: raw text, weight and (optionally) payload.
   suggestion.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
   suggestion.add(new NumericDocValuesField("weight", weight));
   if (payload != null) {
     suggestion.add(new BinaryDocValuesField("payloads", payload));
   }

   // Replace whatever document currently carries exactly this text.
   final Term exactTextTerm = new Term(EXACT_TEXT_FIELD_NAME, suggestionText);
   writer.updateDocument(exactTextTerm, suggestion);
 }
  /**
   * Create the results based on the search hits. Can be overridden by subclass to add particular
   * behavior (e.g. weight transformation)
   *
   * <p>For each hit the suggestion text is read from the text doc values, the score is taken
   * from the first sort field, and the payload is read from the payload doc values when those
   * exist. With {@code doHighlight} set, the result key is the string form of the highlighted
   * text and the highlight object is attached to the result.
   *
   * @param searcher the searcher whose reader supplies the doc values
   * @param hits the hits to convert
   * @param num not used by this implementation
   * @param charSequence not used by this implementation
   * @param doHighlight whether to highlight the text of each result
   * @param matchedTokens tokens passed on to {@code highlight}
   * @param prefixToken prefix token passed on to {@code highlight}
   * @return one result per hit, in hit order
   * @throws IOException If there are problems reading fields from the underlying Lucene index.
   */
  protected List<LookupResult> createResults(
      IndexSearcher searcher,
      TopFieldDocs hits,
      int num,
      CharSequence charSequence,
      boolean doHighlight,
      Set<String> matchedTokens,
      String prefixToken)
      throws IOException {

    final BinaryDocValues textDV =
        MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);

    // This will just be null if app didn't pass payloads to build():
    // TODO: maybe just stored fields?  they compress...
    final BinaryDocValues payloadsDV =
        MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");

    final List<LookupResult> results = new ArrayList<>();
    final BytesRef textScratch = new BytesRef();
    final int hitCount = hits.scoreDocs.length;
    for (int hit = 0; hit < hitCount; hit++) {
      final FieldDoc fd = (FieldDoc) hits.scoreDocs[hit];
      textDV.get(fd.doc, textScratch);
      final String text = textScratch.utf8ToString();
      final long score = (Long) fd.fields[0];

      // Each result gets its own payload instance; the text scratch is reused across hits.
      BytesRef payload = null;
      if (payloadsDV != null) {
        payload = new BytesRef();
        payloadsDV.get(fd.doc, payload);
      }

      if (doHighlight) {
        final Object highlightKey = highlight(text, matchedTokens, prefixToken);
        results.add(new LookupResult(highlightKey.toString(), highlightKey, score, payload));
      } else {
        results.add(new LookupResult(text, score, payload));
      }
    }

    return results;
  }
  /**
   * Indexes 36 single-character terms (characters from code 97 upward) followed by 128
   * two-character terms of the form {@code "m" + (char) i}, force-merges to one segment, and then
   * checks that the ordinals reported while iterating the terms count up from 0. Finally hands
   * the enumeration and the list of indexed terms to {@code testEnum}.
   */
  public void testNonRootFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    // Every term that gets indexed, in insertion order; passed to testEnum at the end.
    List<String> terms = new ArrayList<>();
    // First batch: one-character terms, added as text fields.
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    // Second batch: 128 terms sharing the prefix "m", added as string fields.
    for (int i = 0; i < 128; i++) {
      Document doc = new Document();
      String term = "m" + (char) i;
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
      }
      doc.add(newStringField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    // Walk all terms and check that ord() matches the iteration index.
    BytesRef term;
    int ord = 0;
    while ((term = te.next()) != null) {
      if (VERBOSE) {
        System.out.println("TEST: " + te.ord() + ": " + term.utf8ToString());
      }
      assertEquals(ord, te.ord());
      ord++;
    }

    testEnum(te, terms);

    r.close();
    w.close();
    dir.close();
  }
 /**
  * Prints the field name followed by one entry per term to {@code out}.
  *
  * <p>Each entry shows the term's default string form and UTF-8 text, then its value decoded as
  * a prefix-coded long in hex (suffix {@code L}) or, if that fails, as a prefix-coded int in hex
  * (suffix {@code i}), or nothing if both decodings fail, and finally the term's score.
  *
  * @param out the stream to print to
  */
 void dump(PrintStream out) {
   out.println(field + ":");
   final BytesRef termBytes = new BytesRef();
   for (int i = 0; i < terms.size(); i++) {
     final int ord = ords[i];
     terms.get(ord, termBytes);
     out.print(termBytes + " " + termBytes.utf8ToString() + " ");
     try {
       out.print(Long.toHexString(LegacyNumericUtils.prefixCodedToLong(termBytes)) + "L");
     } catch (Exception notALong) {
       try {
         out.print(Integer.toHexString(LegacyNumericUtils.prefixCodedToInt(termBytes)) + "i");
       } catch (Exception ignored) {
         // Neither decoding applies; print no numeric form for this term.
       }
     }
     out.println(" score=" + scores[ord]);
     out.println("");
   }
 }
 /**
  * Explains the score of a document based on its join value.
  *
  * <p>The score comes from {@code collector}, looked up by global ordinal when {@code
  * globalOrds} is available and by segment ordinal otherwise.
  *
  * @param context the leaf reader context
  * @param doc the document to explain
  * @return a match carrying the collected score and the document's join value, or a "Not a
  *     match" explanation when no sorted doc values are available or the document has no ordinal
  * @throws IOException if reading the doc values fails
  */
 @Override
 public Explanation explain(LeafReaderContext context, int doc) throws IOException {
   final SortedDocValues joinValues = DocValues.getSorted(context.reader(), joinField);
   if (joinValues == null) {
     return Explanation.noMatch("Not a match");
   }
   final int segmentOrd = joinValues.getOrd(doc);
   if (segmentOrd == -1) {
     return Explanation.noMatch("Not a match");
   }

   final float score;
   if (globalOrds == null) {
     score = collector.score(segmentOrd);
   } else {
     final long globalOrd = globalOrds.getGlobalOrds(context.ord).get(segmentOrd);
     score = collector.score((int) globalOrd);
   }

   final BytesRef joinValue = joinValues.lookupOrd(segmentOrd);
   final String description = "Score based on join value " + joinValue.utf8ToString();
   return Explanation.match(score, description);
 }
Ejemplo n.º 24
0
  // Produces a realistic unicode random string that
  // survives MockAnalyzer unchanged:
  //
  // Keeps generating candidates until one is analyzed into exactly one token whose text equals
  // the candidate. A candidate equal to `other` (when `other` is non-null) is rejected up front.
  private String getRandomTerm(String other) throws IOException {
    Analyzer a = new MockAnalyzer(random());
    while (true) {
      String s = _TestUtil.randomRealisticUnicodeString(random());
      if (other != null && s.equals(other)) {
        continue;
      }
      // Captured so the finally block can hand it to closeWhileHandlingException.
      IOException priorException = null;
      TokenStream ts = a.tokenStream("foo", s);
      try {
        final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
        final BytesRef termBytes = termAtt.getBytesRef();
        ts.reset();

        int count = 0;
        boolean changed = false;

        while (ts.incrementToken()) {
          termAtt.fillBytesRef();
          // Only the first token is compared against the candidate string.
          if (count == 0 && !termBytes.utf8ToString().equals(s)) {
            // The value was changed during analysis.  Keep iterating so the
            // tokenStream is exhausted.
            changed = true;
          }
          count++;
        }

        ts.end();
        // Did we iterate just once and the value was unchanged?
        if (!changed && count == 1) {
          return s;
        }
      } catch (IOException e) {
        priorException = e;
      } finally {
        // Runs on every path, including the successful return above.
        IOUtils.closeWhileHandlingException(priorException, ts);
      }
    }
  }
Ejemplo n.º 25
0
  /**
   * Tests a pre-intersected automaton against the original: for each iteration a random regular
   * expression is compiled, the indexed terms it accepts are collected into a string-union
   * automaton, and the hits of queries built from both automata are required to be equal.
   */
  public void testFiniteVersusInfinite() throws Exception {
    int iteration = 0;
    while (iteration < numIterations) {
      final String regexp = AutomatonTestUtil.randomRegexp(random());
      final Automaton original = new RegExp(regexp, RegExp.NONE).toAutomaton();

      // Collect every indexed term the original automaton accepts.
      final List<BytesRef> accepted = new ArrayList<BytesRef>();
      for (BytesRef candidate : terms) {
        final boolean matches = BasicOperations.run(original, candidate.utf8ToString());
        if (matches) {
          accepted.add(candidate);
        }
      }

      final Automaton union = BasicAutomata.makeStringUnion(accepted);
      // System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + "
      // states, sigma=" + alternate.getStartPoints().length);
      // AutomatonTestUtil.minimizeSimple(alternate);
      // System.out.println("minmize done");
      final AutomatonQuery originalQuery = new AutomatonQuery(new Term("field", ""), original);
      final AutomatonQuery unionQuery = new AutomatonQuery(new Term("field", ""), union);
      CheckHits.checkEqual(
          originalQuery,
          searcher.search(originalQuery, 25).scoreDocs,
          searcher.search(unionQuery, 25).scoreDocs);

      iteration++;
    }
  }
Ejemplo n.º 26
0
  /**
   * Seeks to every term accepted by some automata.
   *
   * <p>For each iteration a random regular expression is compiled to an automaton, the known
   * terms are visited in shuffled order, and every term the automaton accepts must be found by
   * either {@code seekExact} or {@code seekCeil} (chosen at random) on the field's terms enum.
   */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
      // Copy before shuffling so the shared terms collection keeps its order.
      ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (BasicOperations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term, random().nextBoolean()));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
            assertEquals(term, te.term());
          }
        }
      }
    }
  }
  /**
   * Indexes 128 single-character terms (characters 0 through 127), force-merges to one segment,
   * and checks that seeking by term and by ordinal agree: "a" is at ordinal 97, ordinal 98 is
   * "b", and "z" is at ordinal 122.
   */
  public void testFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    // One document per character code, each with a single-character string field.
    for (int i = 0; i < 128; i++) {
      Document doc = new Document();
      String term = "" + (char) i;
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
      }
      doc.add(newStringField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    // In verbose mode, dump every term with its ordinal before the seek checks.
    if (VERBOSE) {
      BytesRef term;
      while ((term = te.next()) != null) {
        System.out.println("  " + te.ord() + ": " + term.utf8ToString());
      }
    }

    // Seek by term, then check the ordinal.
    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(97, te.ord());

    // Seek by ordinal, then check the term.
    te.seekExact(98);
    assertEquals(new BytesRef("b"), te.term());

    assertTrue(te.seekExact(new BytesRef("z")));
    assertEquals(122, te.ord());

    r.close();
    w.close();
    dir.close();
  }
Ejemplo n.º 28
0
  /**
   * Visits every term of the field that starts with the prefix and whose remainder (the text
   * after the prefix) fully matches {@code pattern}.
   *
   * <p>The terms enumeration is positioned at the first term at or after the prefix and walked
   * forward until a term no longer starts with the prefix. The matcher is reset when done.
   *
   * @param reader the index reader
   * @param fieldName the field whose terms are examined
   * @param mtv the visitor called for each matching term
   * @throws IOException if reading terms fails
   */
  @Override
  public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
      throws IOException {
    final int prefixLength = prefix.length();
    final Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms == null) {
      return;
    }

    final Matcher matcher = pattern.matcher("");
    try {
      final TermsEnum termsEnum = terms.iterator(null);

      // Pick the first candidate according to where the seek landed.
      BytesRef candidate;
      switch (termsEnum.seekCeil(prefixRef)) {
        case FOUND:
          candidate = prefixRef;
          break;
        case NOT_FOUND:
          candidate = termsEnum.term();
          break;
        default:
          candidate = null;
          break;
      }

      // Stop at the end of the enumeration or at the first term outside the prefix range.
      while (candidate != null && StringHelper.startsWith(candidate, prefixRef)) {
        final String termText = candidate.utf8ToString();
        matcher.reset(termText.substring(prefixLength));
        if (matcher.matches()) {
          mtv.visitMatchingTerm(new Term(fieldName, termText));
        }
        candidate = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
Ejemplo n.º 29
0
  /**
   * Randomized test: for several iterations, indexes documents that each carry one random term
   * in "field" and their number in "id", randomly commits and deletes along the way, and then
   * checks that the docs enumeration of randomly picked terms yields exactly the non-deleted
   * documents recorded for that term, in order.
   */
  public void testRandom() throws Exception {

    int num = atLeast(2);
    for (int iter = 0; iter < num; iter++) {
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter);
      }

      Directory dir = newDirectory();

      IndexWriter w =
          new IndexWriter(
              dir,
              newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                  .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
      _TestUtil.keepFullyDeletedSegments(w);

      // Bookkeeping of what was indexed: term -> document numbers, deleted document numbers,
      // and the term used for each new (non-reused) document.
      Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
      Set<Integer> deleted = new HashSet<Integer>();
      List<BytesRef> terms = new ArrayList<BytesRef>();

      int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
      // A single Document instance is reused; only the field values change per iteration.
      Document doc = new Document();
      Field f = newStringField("field", "", Field.Store.NO);
      doc.add(f);
      Field id = newStringField("id", "", Field.Store.NO);
      doc.add(id);

      boolean onlyUniqueTerms = random().nextBoolean();
      if (VERBOSE) {
        System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
      }
      Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
      for (int i = 0; i < numDocs; i++) {

        if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
          // re-use existing term
          BytesRef term = terms.get(random().nextInt(terms.size()));
          docs.get(term).add(i);
          f.setStringValue(term.utf8ToString());
        } else {
          String s = _TestUtil.randomUnicodeString(random(), 10);
          BytesRef term = new BytesRef(s);
          if (!docs.containsKey(term)) {
            docs.put(term, new ArrayList<Integer>());
          }
          docs.get(term).add(i);
          terms.add(term);
          uniqueTerms.add(term);
          f.setStringValue(s);
        }
        id.setStringValue("" + i);
        w.addDocument(doc);
        // Occasionally commit.
        if (random().nextInt(4) == 1) {
          w.commit();
        }
        // Occasionally delete one of the earlier documents by its id.
        if (i > 0 && random().nextInt(20) == 1) {
          int delID = random().nextInt(i);
          deleted.add(delID);
          w.deleteDocuments(new Term("id", "" + delID));
          if (VERBOSE) {
            System.out.println("TEST: delete " + delID);
          }
        }
      }

      if (VERBOSE) {
        List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
        Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
        System.out.println("TEST: terms in UTF16 order:");
        for (BytesRef b : termsList) {
          System.out.println("  " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
          for (int docID : docs.get(b)) {
            if (deleted.contains(docID)) {
              System.out.println("    " + docID + " (deleted)");
            } else {
              System.out.println("    " + docID);
            }
          }
        }
      }

      IndexReader reader = w.getReader();
      w.close();
      if (VERBOSE) {
        System.out.println("TEST: reader=" + reader);
      }

      // Every document we deleted must be reported as not live.
      Bits liveDocs = MultiFields.getLiveDocs(reader);
      for (int delDoc : deleted) {
        assertFalse(liveDocs.get(delDoc));
      }

      // Pick 100 random terms and compare their docs enumeration with the bookkeeping.
      for (int i = 0; i < 100; i++) {
        BytesRef term = terms.get(random().nextInt(terms.size()));
        if (VERBOSE) {
          System.out.println(
              "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
        }

        DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
        assertNotNull(docsEnum);

        for (int docID : docs.get(term)) {
          if (!deleted.contains(docID)) {
            assertEquals(docID, docsEnum.nextDoc());
          }
        }
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
      }

      reader.close();
      dir.close();
    }
  }
Ejemplo n.º 30
0
 /**
  * Returns the value as a Java object.
  *
  * @return the UTF-8 decoded text of {@code value} when {@code exists} is true, otherwise
  *     {@code null}
  */
 @Override
 public Object toObject() {
   if (!exists) {
     return null;
   }
   return value.utf8ToString();
 }