/**
 * Advances to the next live document by scanning the plain-text postings lines.
 *
 * <p>Each call reads lines until it has consumed a full doc record (DOC, optional FREQ, then
 * POS/offset/payload lines). When the line that terminates the record is seen (the next DOC,
 * TERM, FIELD, or END line), the reader is rewound to that line so the following call re-reads
 * it, and the doc id is returned — unless the doc is deleted per {@code liveDocs}, in which
 * case scanning continues.
 *
 * @return the next live doc id, or NO_MORE_DOCS when the term's postings are exhausted
 */
@Override
public int nextDoc() throws IOException {
  if (docID == NO_MORE_DOCS) {
    return docID;
  }
  boolean first = true;
  int termFreq = 0;
  while (true) {
    // remember where this line starts so we can rewind once we overshoot the record
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    if (StringHelper.startsWith(scratch, DOC)) {
      // a new DOC line terminates the previous doc's record
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart); // rewind so the next call re-reads this DOC line
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      termFreq = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip termFreq++;
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      // end of this term's postings: next TERM/FIELD or end-of-file marker
      assert StringHelper.startsWith(scratch, TERM)
              || StringHelper.startsWith(scratch, FIELD)
              || StringHelper.startsWith(scratch, END)
          : "scratch=" + scratch.utf8ToString();
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart);
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
@Override public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; Arc<Long> arc = new Arc<Long>(); // match the prefix portion exactly Long prefixOutput = null; try { prefixOutput = lookupPrefix(scratch, arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (prefixOutput == null) { return Collections.<LookupResult>emptyList(); } List<LookupResult> results = new ArrayList<LookupResult>(num); CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } } // complete top-N MinResult<Long> completions[] = null; try { completions = Util.shortestPaths(fst, arc, weightComparator, num); } catch (IOException bogus) { throw new RuntimeException(bogus); } BytesRef suffix = new BytesRef(8); for (MinResult<Long> completion : completions) { scratch.length = prefixLength; // append suffix Util.toBytesRef(completion.input, suffix); scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; }
/**
 * Advances to the next live document (positions-aware variant).
 *
 * <p>Scans line-by-line starting at {@code nextDocStart}. When the line after a doc's record is
 * reached, {@code nextDocStart} is saved so the next call resumes there, and the reader is
 * seeked back to {@code posStart} so {@code nextPosition()} can read this doc's POS lines.
 *
 * @return the next live doc id, or NO_MORE_DOCS when the term's postings are exhausted
 */
@Override
public int nextDoc() throws IOException {
  boolean first = true;
  in.seek(nextDocStart);
  long posStart = 0;
  while (true) {
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    // System.out.println("NEXT DOC: " + scratch.utf8ToString());
    if (StringHelper.startsWith(scratch, DOC)) {
      // a new DOC line terminates the previous doc's record
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart; // resume the scan here on the next call
        in.seek(posStart); // rewind so nextPosition() reads this doc's POS lines
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      tf = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      // this doc's position data starts right after its FREQ line
      posStart = in.getFilePointer();
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      // end of this term's postings: next TERM/FIELD or end-of-file marker
      assert StringHelper.startsWith(scratch, TERM)
          || StringHelper.startsWith(scratch, FIELD)
          || StringHelper.startsWith(scratch, END);
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart;
        in.seek(posStart);
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
/**
 * Reads the next position record for the current doc: a POS line (when positions are stored),
 * then START_OFFSET/END_OFFSET lines (when offsets are stored), then an optional PAYLOAD line.
 *
 * @return the position, or -1 when this enum was opened without positions
 */
@Override
public int nextPosition() throws IOException {
  final int pos;
  if (readPositions) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + POS.length,
        scratch.length - POS.length,
        scratchUTF16_2);
    pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  } else {
    pos = -1;
  }
  if (readOffsets) {
    // offsets are written as a START_OFFSET line immediately followed by an END_OFFSET line
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + START_OFFSET.length,
        scratch.length - START_OFFSET.length,
        scratchUTF16_2);
    startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + END_OFFSET.length,
        scratch.length - END_OFFSET.length,
        scratchUTF16_2);
    endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  }
  // peek the next line: a PAYLOAD line is optional, so rewind if it isn't one
  final long fp = in.getFilePointer();
  SimpleTextUtil.readLine(in, scratch);
  if (StringHelper.startsWith(scratch, PAYLOAD)) {
    final int len = scratch.length - PAYLOAD.length;
    if (scratch2.bytes.length < len) {
      scratch2.grow(len);
    }
    // NOTE(review): copies from index PAYLOAD.length without adding scratch.offset —
    // assumes scratch.offset == 0 after readLine; confirm against SimpleTextUtil.readLine
    System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
    scratch2.length = len;
    payload = scratch2;
  } else {
    payload = null;
    in.seek(fp);
  }
  return pos;
}
private void assertAutomaton(Automaton automaton) throws Exception { CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton); int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { final String string; if (random.nextBoolean()) { // likely not accepted string = _TestUtil.randomUnicodeString(random); } else { // will be accepted int[] codepoints = ras.getRandomAcceptedString(random); try { string = UnicodeUtil.newString(codepoints, 0, codepoints.length); } catch (Exception e) { System.out.println(codepoints.length + " codepoints:"); for (int j = 0; j < codepoints.length; j++) { System.out.println(" " + Integer.toHexString(codepoints[j])); } throw e; } } byte bytes[] = string.getBytes("UTF-8"); assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length)); } }
private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError { TInfo tinfo = new TInfo(); tinfo.indexedField = tinfo.field = fp.parseArg(); tinfo.val = fp.parseArg(); tinfo.indexedBytes = new BytesRef(); FieldType ft = fp.getReq().getSchema().getFieldTypeNoEx(tinfo.field); if (ft == null) ft = new StrField(); if (ft instanceof TextField) { // need to do analysis on the term String indexedVal = tinfo.val; Query q = ft.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(tinfo.field), tinfo.val); if (q instanceof TermQuery) { Term term = ((TermQuery) q).getTerm(); tinfo.indexedField = term.field(); indexedVal = term.text(); } UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes); } else { ft.readableToIndexed(tinfo.val, tinfo.indexedBytes); } return tinfo; }
/** Returns the document's term value as a String, or null when the doc has no value (ord 0). */
@Override
public String strVal(int doc) {
  final int ord = termsIndex.getOrd(doc);
  if (ord != 0) {
    termsIndex.lookup(ord, spare);
    UnicodeUtil.UTF8toUTF16(spare, spareChars);
    return spareChars.toString();
  }
  return null;
}
@Override public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) { // TODO: this could be more efficient, but the sortable types should be deprecated instead UnicodeUtil.UTF8toUTF16(input, charsRef); final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray(); charsRef.copyChars(indexedToReadable, 0, indexedToReadable.length); return charsRef; }
/** Decodes this content as a UTF-8 string; returns the empty string when there are no bytes. */
@Override
public String toUtf8() {
  if (length() == 0) {
    return "";
  }
  byte[] bytes = toBytes();
  final CharsRef ref = new CharsRef(length);
  // NOTE(review): 'offset' is applied to the array returned by toBytes(); this is only correct
  // if toBytes() hands back the backing array un-compacted — confirm toBytes()'s contract
  UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
  return ref.toString();
}
/** Returns random string, including full unicode range. */ public static String randomRegexp(Random r) { while (true) { String regexp = randomRegexpString(r); // we will also generate some undefined unicode queries if (!UnicodeUtil.validUTF16String(regexp)) continue; try { new RegExp(regexp, RegExp.NONE); return regexp; } catch (Exception e) { } } }
/**
 * Constructor for enumeration of all terms whose fuzzy similarity to the target term exceeds
 * the configured minimum, sharing the configured common prefix.
 *
 * <p>NOTE(review): the previous javadoc referenced <code>reader</code>/<code>term</code>
 * parameters, but this is a no-arg constructor — the prefix length and term text are read from
 * the enclosing instance's fields (<code>termText</code>, <code>termLength</code>,
 * <code>realPrefixLength</code>) instead.
 *
 * <p>After calling the constructor the enumeration is already pointing to the first valid term
 * if such a term exists.
 *
 * @throws IOException If there is a low-level I/O error.
 */
public LinearFuzzyTermsEnum() throws IOException {
  super(terms.iterator());
  // code points of the target term after the shared prefix
  this.text = new int[termLength - realPrefixLength];
  System.arraycopy(termText, realPrefixLength, text, 0, text.length);
  final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
  prefixBytesRef = new BytesRef(prefix);
  // distance-computation rows, one slot per suffix code point plus one
  this.d = new int[this.text.length + 1];
  this.p = new int[this.text.length + 1];
  // only enumerate terms at or after the prefix
  setInitialSeekTerm(prefixBytesRef);
}
/**
 * Looks up the top {@code num} completions for {@code key}, choosing between the two FSTs
 * depending on whether the caller wants higher weights returned first.
 */
@Override
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
  // pick the completion FST ordered the way the caller asked for
  final List<Completion> completions =
      higherWeightsFirst
          ? higherWeightsCompletion.lookup(key, num)
          : normalCompletion.lookup(key, num);

  final ArrayList<LookupResult> out = new ArrayList<LookupResult>(completions.size());
  final CharsRef utf16 = new CharsRef();
  for (Completion completion : completions) {
    // convert the stored UTF-8 key back into a java String
    utf16.grow(completion.utf8.length);
    UnicodeUtil.UTF8toUTF16(completion.utf8, utf16);
    out.add(new LookupResult(utf16.toString(), completion.bucket));
  }
  return out;
}
/**
 * Builds a shared index of random unicode terms, one term per document, indexed untokenized
 * (KEYWORD tokenizer) into a single field whose name is randomly "field" or "".
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  fieldName = random().nextBoolean() ? "field" : ""; // sometimes use an empty string as field name
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          dir,
          newIndexWriterConfig(
                  TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
              .setMaxBufferedDocs(_TestUtil.nextInt(random(), 50, 1000)));
  Document doc = new Document();
  Field field = newStringField(fieldName, "", Field.Store.NO);
  doc.add(field);
  List<String> terms = new ArrayList<String>();
  int num = atLeast(200);
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    // reuse the same Document/Field instance, swapping just the value
    field.setStringValue(s);
    terms.add(s);
    writer.addDocument(doc);
  }
  if (VERBOSE) {
    // utf16 order
    Collections.sort(terms);
    System.out.println("UTF16 order:");
    for (String s : terms) {
      System.out.println(" " + UnicodeUtil.toHexString(s));
    }
  }
  reader = writer.getReader();
  searcher1 = newSearcher(reader);
  searcher2 = newSearcher(reader);
  writer.close();
}
/**
 * Builds a shared index of random unicode terms, one term per document, indexed NOT_ANALYZED
 * into a single "field" field (pre-4.0 Field API variant).
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random,
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false))
              .setMaxBufferedDocs(_TestUtil.nextInt(random, 50, 1000)));
  Document doc = new Document();
  Field field = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
  doc.add(field);
  List<String> terms = new ArrayList<String>();
  int num = 2000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random);
    // reuse the same Document/Field instance, swapping just the value
    field.setValue(s);
    terms.add(s);
    writer.addDocument(doc);
  }
  if (VERBOSE) {
    // utf16 order
    Collections.sort(terms);
    System.out.println("UTF16 order:");
    for (String s : terms) {
      System.out.println(" " + UnicodeUtil.toHexString(s));
    }
  }
  reader = writer.getReader();
  searcher = new IndexSearcher(reader);
  writer.close();
}
@Override public void build(InputIterator tfit) throws IOException { if (tfit.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } root = new TernaryTreeNode(); // buffer first if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) { // make sure it's sorted and the comparator uses UTF16 sort order tfit = new SortedInputIterator(tfit, BytesRef.getUTF8SortedAsUTF16Comparator()); } ArrayList<String> tokens = new ArrayList<String>(); ArrayList<Number> vals = new ArrayList<Number>(); BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.next()) != null) { charsSpare.grow(spare.length); UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); tokens.add(charsSpare.toString()); vals.add(Long.valueOf(tfit.weight())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); }
static {
  // MAX_TERM holds the UTF-8 encoding of U+10FFFF, the largest Unicode code point.
  MAX_TERM = new BytesRef();
  final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
  UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
}
/** Accepts a term iff its UTF-16 form is matched by the character-level automaton. */
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
  // decode the raw term bytes into the reusable UTF-16 scratch, then run the automaton
  UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
  if (runAutomaton.run(utf16.result, 0, utf16.length)) {
    return AcceptStatus.YES;
  }
  return AcceptStatus.NO;
}
/**
 * Scans the plain-text terms data once and builds the in-memory FST index mapping each term to
 * (docs-start file pointer, (docFreq, totalTermFreq)); also accumulates the field statistics
 * (sumDocFreq, sumTotalTermFreq, termCount, docCount).
 */
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
  final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
  final PairOutputs<Long, Long> outputsInner =
      new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
      new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
  b =
      new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
          FST.INPUT_TYPE.BYTE1, outputs);
  // clone so this scan doesn't disturb the shared reader's position
  IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRef lastTerm = new BytesRef(10);
  long lastDocsStart = -1; // -1 means no pending term to flush
  int docFreq = 0;
  long totalTermFreq = 0;
  OpenBitSet visitedDocs = new OpenBitSet();
  final IntsRef scratchIntsRef = new IntsRef();
  while (true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
      // end of this field: flush the pending term before stopping
      if (lastDocsStart != -1) {
        b.add(
            Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch, DOC)) {
      docFreq++;
      sumDocFreq++;
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, TERM)) {
      // a new TERM line: flush the previous term's entry into the FST
      if (lastDocsStart != -1) {
        b.add(
            Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      // the new term's postings start right after its TERM line
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length - TERM.length;
      if (len > lastTerm.length) {
        lastTerm.grow(len);
      }
      // NOTE(review): copies from index TERM.length without adding scratch.offset —
      // assumes scratch.offset == 0 after readLine; confirm against SimpleTextUtil.readLine
      System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
      lastTerm.length = len;
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = (int) visitedDocs.cardinality();
  fst = b.finish();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  // System.out.println("FST " + fst.sizeInBytes());
}
/** Decodes the indexed bytes into {@code charsRef} and appends the first char of Z_ARRAY. */
@Override
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
  UnicodeUtil.UTF8toUTF16(input, charsRef);
  // NOTE(review): appends one char from Z_ARRAY — presumably restoring a trailing char
  // removed at index time; confirm against the matching readableToIndexed implementation
  charsRef.append(Z_ARRAY, 0, 1);
  return charsRef;
}
/**
 * Runs the completion suggester: decodes the suggest text, performs a lookup per index segment,
 * de-duplicates options by key (keeping the highest score), sorts by score, and returns the top
 * {@code size} options in a single suggestion entry.
 *
 * @throws ElasticsearchException if the target field is not a completion suggest field
 */
@Override
protected Suggest.Suggestion<
        ? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>>
    innerExecute(
        String name,
        CompletionSuggestionContext suggestionContext,
        IndexReader indexReader,
        CharsRef spare)
        throws IOException {
  if (suggestionContext.mapper() == null
      || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) {
    throw new ElasticsearchException(
        "Field [" + suggestionContext.getField() + "] is not a completion suggest field");
  }

  CompletionSuggestion completionSuggestion =
      new CompletionSuggestion(name, suggestionContext.getSize());
  // decode the UTF-8 suggest text into the shared char scratch; it doubles as the lookup key
  UnicodeUtil.UTF8toUTF16(suggestionContext.getText(), spare);

  CompletionSuggestion.Entry completionSuggestEntry =
      new CompletionSuggestion.Entry(new StringText(spare.toString()), 0, spare.length());
  completionSuggestion.addTerm(completionSuggestEntry);

  String fieldName = suggestionContext.getField();
  // de-duplicate options across segments by key, keeping the best-scoring occurrence
  Map<String, CompletionSuggestion.Entry.Option> results =
      Maps.newHashMapWithExpectedSize(indexReader.leaves().size() * suggestionContext.getSize());
  for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
    AtomicReader atomicReader = atomicReaderContext.reader();
    Terms terms = atomicReader.fields().terms(fieldName);
    if (terms instanceof Completion090PostingsFormat.CompletionTerms) {
      final Completion090PostingsFormat.CompletionTerms lookupTerms =
          (Completion090PostingsFormat.CompletionTerms) terms;
      final Lookup lookup = lookupTerms.getLookup(suggestionContext.mapper(), suggestionContext);
      if (lookup == null) {
        // we don't have a lookup for this segment.. this might be possible if a merge dropped
        // all docs from the segment that had a value in this segment.
        continue;
      }
      List<Lookup.LookupResult> lookupResults =
          lookup.lookup(spare, false, suggestionContext.getSize());
      for (Lookup.LookupResult res : lookupResults) {
        final String key = res.key.toString();
        final float score = res.value;
        final Option value = results.get(key);
        if (value == null) {
          final Option option =
              new CompletionSuggestion.Entry.Option(
                  new StringText(key),
                  score,
                  res.payload == null ? null : new BytesArray(res.payload));
          results.put(key, option);
        } else if (value.getScore() < score) {
          // same key seen in another segment with a higher score: keep the better one
          value.setScore(score);
          value.setPayload(res.payload == null ? null : new BytesArray(res.payload));
        }
      }
    }
  }

  final List<CompletionSuggestion.Entry.Option> options =
      new ArrayList<CompletionSuggestion.Entry.Option>(results.values());
  // order by score before truncating to the requested size
  CollectionUtil.introSort(options, scoreComparator);

  int optionCount = Math.min(suggestionContext.getSize(), options.size());
  for (int i = 0; i < optionCount; i++) {
    completionSuggestEntry.addOption(options.get(i));
  }

  return completionSuggestion;
}
/** Returns true iff the byte automaton accepts the UTF-8 encoding of the given code point. */
private boolean matches(ByteRunAutomaton a, int code) {
  // encode the single code point into the shared scratch ref, then run the byte automaton
  final char[] utf16 = Character.toChars(code);
  UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.length, b);
  return a.run(b.bytes, 0, b.length);
}
/**
 * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will
 * match terms.
 *
 * <p>Compound queries (boolean, dismax, span variants, wrappers) are recursed into; leaf query
 * types that target {@code field} are converted to a {@link CharacterRunAutomaton} whose
 * {@code toString} reports the originating query for debugging.
 */
static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
  List<CharacterRunAutomaton> list = new ArrayList<>();
  if (query instanceof BooleanQuery) {
    // recurse into every non-prohibited clause
    for (BooleanClause clause : (BooleanQuery) query) {
      if (!clause.isProhibited()) {
        list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
      }
    }
  } else if (query instanceof ConstantScoreQuery) {
    list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field)));
  } else if (query instanceof DisjunctionMaxQuery) {
    for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanOrQuery) {
    for (Query sub : ((SpanOrQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNearQuery) {
    for (Query sub : ((SpanNearQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNotQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
  } else if (query instanceof SpanPositionCheckQuery) {
    list.addAll(
        Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
  } else if (query instanceof SpanMultiTermQueryWrapper) {
    list.addAll(
        Arrays.asList(
            extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
  } else if (query instanceof AutomatonQuery) {
    final AutomatonQuery aq = (AutomatonQuery) query;
    if (aq.getField().equals(field)) {
      // already automaton-backed; just adapt it
      list.add(
          new CharacterRunAutomaton(aq.getAutomaton()) {
            @Override
            public String toString() {
              return aq.toString();
            }
          });
    }
  } else if (query instanceof PrefixQuery) {
    final PrefixQuery pq = (PrefixQuery) query;
    Term prefix = pq.getPrefix();
    if (prefix.field().equals(field)) {
      // prefix query == the prefix string followed by any string
      list.add(
          new CharacterRunAutomaton(
              Operations.concatenate(
                  Automata.makeString(prefix.text()), Automata.makeAnyString())) {
            @Override
            public String toString() {
              return pq.toString();
            }
          });
    }
  } else if (query instanceof FuzzyQuery) {
    final FuzzyQuery fq = (FuzzyQuery) query;
    if (fq.getField().equals(field)) {
      // expand the term into code points; edits apply to the suffix past the exact prefix
      String utf16 = fq.getTerm().text();
      int termText[] = new int[utf16.codePointCount(0, utf16.length())];
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termText[j++] = cp = utf16.codePointAt(i);
      }
      int termLength = termText.length;
      int prefixLength = Math.min(fq.getPrefixLength(), termLength);
      String suffix =
          UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
      LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
      String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
      Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
      list.add(
          new CharacterRunAutomaton(automaton) {
            @Override
            public String toString() {
              return fq.toString();
            }
          });
    }
  } else if (query instanceof TermRangeQuery) {
    final TermRangeQuery tq = (TermRangeQuery) query;
    if (tq.getField().equals(field)) {
      final CharsRef lowerBound;
      if (tq.getLowerTerm() == null) {
        lowerBound = null;
      } else {
        lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
      }
      final CharsRef upperBound;
      if (tq.getUpperTerm() == null) {
        upperBound = null;
      } else {
        upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
      }
      final boolean includeLower = tq.includesLower();
      final boolean includeUpper = tq.includesUpper();
      final CharsRef scratch = new CharsRef();
      final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
      // this is *not* an automaton, but it's very simple
      list.add(
          new CharacterRunAutomaton(Automata.makeEmpty()) {
            @Override
            public boolean run(char[] s, int offset, int length) {
              // point the scratch ref at the candidate term (no copying)
              scratch.chars = s;
              scratch.offset = offset;
              scratch.length = length;
              if (lowerBound != null) {
                int cmp = comparator.compare(scratch, lowerBound);
                if (cmp < 0 || (!includeLower && cmp == 0)) {
                  return false;
                }
              }
              if (upperBound != null) {
                int cmp = comparator.compare(scratch, upperBound);
                if (cmp > 0 || (!includeUpper && cmp == 0)) {
                  return false;
                }
              }
              return true;
            }

            @Override
            public String toString() {
              return tq.toString();
            }
          });
    }
  }
  return list.toArray(new CharacterRunAutomaton[list.size()]);
}
/**
 * Indexes random unicode terms across random docs (with random commits and deletes), then
 * verifies that seeking a DocsEnum for each term returns exactly the live docs recorded for it.
 */
public void testRandom() throws Exception {
  int num = atLeast(2);
  for (int iter = 0; iter < num; iter++) {
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
    }
    Directory dir = newDirectory();
    IndexWriter w =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
    // keep segments alive even when fully deleted, so liveDocs checks still see them
    _TestUtil.keepFullyDeletedSegments(w);

    Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
    Set<Integer> deleted = new HashSet<Integer>();
    List<BytesRef> terms = new ArrayList<BytesRef>();

    int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
    Document doc = new Document();
    Field f = newStringField("field", "", Field.Store.NO);
    doc.add(f);
    Field id = newStringField("id", "", Field.Store.NO);
    doc.add(id);

    boolean onlyUniqueTerms = random().nextBoolean();
    if (VERBOSE) {
      System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
    }
    Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    for (int i = 0; i < numDocs; i++) {
      if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
        // re-use existing term
        BytesRef term = terms.get(random().nextInt(terms.size()));
        docs.get(term).add(i);
        f.setStringValue(term.utf8ToString());
      } else {
        String s = _TestUtil.randomUnicodeString(random(), 10);
        BytesRef term = new BytesRef(s);
        if (!docs.containsKey(term)) {
          docs.put(term, new ArrayList<Integer>());
        }
        docs.get(term).add(i);
        terms.add(term);
        uniqueTerms.add(term);
        f.setStringValue(s);
      }
      id.setStringValue("" + i);
      w.addDocument(doc);
      if (random().nextInt(4) == 1) {
        w.commit();
      }
      // occasionally delete an earlier doc by id and record it as gone
      if (i > 0 && random().nextInt(20) == 1) {
        int delID = random().nextInt(i);
        deleted.add(delID);
        w.deleteDocuments(new Term("id", "" + delID));
        if (VERBOSE) {
          System.out.println("TEST: delete " + delID);
        }
      }
    }

    if (VERBOSE) {
      List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
      Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
      System.out.println("TEST: terms in UTF16 order:");
      for (BytesRef b : termsList) {
        System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
        for (int docID : docs.get(b)) {
          if (deleted.contains(docID)) {
            System.out.println(" " + docID + " (deleted)");
          } else {
            System.out.println(" " + docID);
          }
        }
      }
    }

    IndexReader reader = w.getReader();
    w.close();
    if (VERBOSE) {
      System.out.println("TEST: reader=" + reader);
    }

    // every deleted doc must be marked dead in liveDocs
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int delDoc : deleted) {
      assertFalse(liveDocs.get(delDoc));
    }

    for (int i = 0; i < 100; i++) {
      BytesRef term = terms.get(random().nextInt(terms.size()));
      if (VERBOSE) {
        System.out.println(
            "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
      }

      DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
      assertNotNull(docsEnum);

      // the enum must return exactly the live docs recorded for this term, in order
      for (int docID : docs.get(term)) {
        if (!deleted.contains(docID)) {
          assertEquals(docID, docsEnum.nextDoc());
        }
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    dir.close();
  }
}
/**
 * Encodes {@code charSequence} as UTF-8, using {@code spare} as scratch, and returns a freshly
 * allocated exact-size byte array with the encoded bytes.
 */
public static byte[] toUTF8Bytes(CharSequence charSequence, BytesRef spare) {
  // encode into the scratch ref first, then copy out an exact-size array
  UnicodeUtil.UTF16toUTF8(charSequence, 0, charSequence.length(), spare);
  final byte[] out = new byte[spare.length];
  System.arraycopy(spare.bytes, spare.offset, out, 0, out.length);
  return out;
}