Ejemplo n.º 1
0
  /**
   * Returns the next token to emit, splitting mixed letter/digit tokens into homogeneous runs.
   *
   * <p>Tokens queued by a previous split are served first. Otherwise the next token is pulled from
   * {@code input}; if its type is {@code Word.TYPE_LETTER_OR_DIGIT} or
   * {@code Word.TYPE_DIGIT_OR_LETTER} (compared case-insensitively) it is cut wherever the
   * character class changes, every run is queued through {@code addToken}, and the first queued
   * run is returned. Tokens of any other type, and empty tokens, are returned unchanged.
   *
   * @param reusableToken token instance handed to {@code TokenUtils.nextToken}; must not be null
   * @return the next token, or {@code null} when {@code TokenUtils.nextToken} yields no more tokens
   * @throws IOException declared for the read from the underlying input
   */
  private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;

    // Serve the runs left over from a previous split first.
    Token nextToken = tokenQueue.poll();
    if (nextToken != null) {
      return nextToken;
    }

    // Per the original author's note, TokenUtils.nextToken already advances the input stream,
    // so input.incrementToken() must not be called here as well.
    nextToken = TokenUtils.nextToken(input, reusableToken);
    if (nextToken == null) {
      return null;
    }

    final boolean mixedLetterDigit =
        Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
            || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type());
    final int length = nextToken.length();
    if (!mixedLetterDigit || length == 0) {
      // Nothing to split. An empty token is passed through as-is: splitting it would queue
      // nothing and return null, which the caller interprets as end of stream.
      return nextToken;
    }

    final char[] buffer = nextToken.buffer();
    int runStart = 0;
    // Normalize the first character exactly like the following ones; otherwise a leading
    // upper-case letter would look like a class change at index 0 and queue an empty token.
    byte runType = normalizedCharType(buffer[0]);
    for (int i = 1; i < length; i++) {
      final byte type = normalizedCharType(buffer[i]);
      if (type != runType) { // character class changed: close the current run
        addToken(nextToken, runStart, i - runStart, runType);
        runStart = i;
        runType = type;
      }
    }
    // Close the final run (always non-empty because length > 0).
    addToken(nextToken, runStart, length - runStart, runType);

    return tokenQueue.poll();
  }

  /**
   * Returns the {@link Character#getType(char)} category of {@code c}, folding every category up
   * to and including {@link Character#MODIFIER_LETTER} into {@link Character#LOWERCASE_LETTER} so
   * that letters of different case belong to the same run.
   */
  private static byte normalizedCharType(char c) {
    final byte type = (byte) Character.getType(c);
    return type <= Character.MODIFIER_LETTER ? Character.LOWERCASE_LETTER : type;
  }
Ejemplo n.º 2
0
 /**
  * Advances this stream by one token.
  *
  * <p>Clears the attributes, fetches the next token via {@code nextToken(reusableToken)} and, if
  * there is one, copies its term text, offsets and type into the term, offset and type
  * attributes.
  *
  * @return {@code true} if a token was produced, {@code false} once no token is left
  * @throws IOException if fetching the next token fails
  */
 public boolean incrementToken() throws IOException {
   clearAttributes();

   final Token current = nextToken(reusableToken);
   if (current == null) {
     // NOTE(review): Lucene's TokenStream workflow normally leaves calling end() to the
     // consumer; confirm that invoking it here as well is intentional.
     end();
     return false;
   }

   termAtt.copyBuffer(current.buffer(), 0, current.length());
   offsetAtt.setOffset(current.startOffset(), current.endOffset());
   typeAtt.setType(current.type());
   return true;
 }
Ejemplo n.º 3
0
  /**
   * Interleaves two token lists into one, rewriting position increments so that every token keeps
   * the absolute position it had in its own list.
   *
   * <p>Absolute positions are the running sums of each list's position increments. Tokens are
   * emitted in order of absolute position; when both lists have a token at the same position the
   * token from {@code lst1} is emitted first. The result contains fresh {@code Token} copies that
   * carry only the offsets, type, term text and the recomputed position increment.
   *
   * <p>Example ('/' denotes tokens sharing a position, all increments 1): [a b] merged with [c d]
   * yields a, c, d, b with increments 1, 0, 1, 0, i.e. [a/c d/b]. Example (x,n means x has
   * posInc=n): [a,5 b,2] merged with [c d,4 e,4] yields c,1 d,4 a,0 b,2 e,2, i.e. [c d/a b e].
   *
   * @param lst1 first token list, may be {@code null}
   * @param lst2 second token list, may be {@code null}
   * @return a new list; if either argument is {@code null} it simply holds the elements of the
   *     other one (the same instances, not copies), or nothing if both are {@code null}
   */
  public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    final ArrayList<Token> merged = new ArrayList<Token>();

    // Nothing to interleave when one side is missing: pass the other side through.
    if (lst1 == null || lst2 == null) {
      if (lst2 != null) {
        merged.addAll(lst2);
      }
      if (lst1 != null) {
        merged.addAll(lst1);
      }
      return merged;
    }

    final Iterator<Token> first = lst1.iterator();
    final Iterator<Token> second = lst2.iterator();
    Token head1 = first.hasNext() ? first.next() : null;
    Token head2 = second.hasNext() ? second.next() : null;
    // Absolute positions of the two pending tokens, and of the last token emitted.
    int abs1 = head1 == null ? 0 : head1.getPositionIncrement();
    int abs2 = head2 == null ? 0 : head2.getPositionIncrement();
    int lastEmitted = 0;

    while (head1 != null || head2 != null) {
      // Drain list 1 while its pending token is not behind list 2's (ties go to list 1).
      while (head1 != null && (head2 == null || abs1 <= abs2)) {
        merged.add(copyWithIncrement(head1, abs1 - lastEmitted));
        lastEmitted = abs1;
        head1 = first.hasNext() ? first.next() : null;
        if (head1 != null) {
          abs1 += head1.getPositionIncrement();
        }
      }
      // Then drain list 2 under the mirrored condition.
      while (head2 != null && (head1 == null || abs2 <= abs1)) {
        merged.add(copyWithIncrement(head2, abs2 - lastEmitted));
        lastEmitted = abs2;
        head2 = second.hasNext() ? second.next() : null;
        if (head2 != null) {
          abs2 += head2.getPositionIncrement();
        }
      }
    }
    return merged;
  }

  /** Copies offsets, type and term text of {@code source} into a new token with the given posInc. */
  private static Token copyWithIncrement(Token source, int positionIncrement) {
    final Token copy = new Token(source.startOffset(), source.endOffset(), source.type());
    copy.copyBuffer(source.buffer(), 0, source.length());
    copy.setPositionIncrement(positionIncrement);
    return copy;
  }
Ejemplo n.º 4
0
  /**
   * Queues a sub-token cut out of {@code oriToken}.
   *
   * <p>The new token takes {@code termBufferLength} characters of the original term buffer
   * starting at {@code termBufferOffset}; its offsets are the original start offset shifted by the
   * same amounts. It is typed {@code Word.TYPE_DIGIT} when {@code type} is
   * {@link Character#DECIMAL_DIGIT_NUMBER} and {@code Word.TYPE_LETTER} otherwise, then appended to
   * {@code tokenQueue}.
   *
   * @param oriToken token being split
   * @param termBufferOffset index of the run's first character in the original term buffer
   * @param termBufferLength number of characters in the run
   * @param type character category of the run, as returned by {@link Character#getType(char)}
   */
  private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    final int runStartOffset = oriToken.startOffset() + termBufferOffset;
    final int runEndOffset = runStartOffset + termBufferLength;

    final Token piece =
        new Token(
            oriToken.buffer(), termBufferOffset, termBufferLength, runStartOffset, runEndOffset);
    piece.setType(type == Character.DECIMAL_DIGIT_NUMBER ? Word.TYPE_DIGIT : Word.TYPE_LETTER);

    tokenQueue.offer(piece);
  }
Ejemplo n.º 5
0
  /**
   * Looks up suggestions for every token in {@code options.tokens}.
   *
   * <p>For each token, up to {@code options.count * 10} candidates are requested from
   * {@code lookup}. If a {@code suggestionAnalyzer} is configured, each candidate is passed through
   * {@code getAnalyzerResult}; candidates that map to the same correction are merged and their
   * values summed, and null/empty corrections are discarded. Collection stops once
   * {@code options.count} distinct suggestions have been gathered. In
   * {@code SUGGEST_MORE_POPULAR} mode the suggestions are reported by descending summed value
   * (ties broken by the suggestion text); in every other mode they keep the order obtained after
   * {@code Collections.sort} of the lookup results.
   *
   * @param options tokens to look up plus count and suggest mode
   * @return the collected suggestions, or {@code EMPTY_RESULT} if no lookup has been built yet
   * @throws IOException declared by the overridden method
   */
  @Override
  public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    LOG.debug("getSuggestions: " + options.tokens);
    if (lookup == null) {
      LOG.info("Lookup is null - invoke spellchecker.build first");
      return EMPTY_RESULT;
    }
    SpellingResult res = new SpellingResult();
    CharsRef scratch = new CharsRef();

    // Neither value depends on the current token, so compute them once.
    final boolean morePopularMode = options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR;
    final boolean onlyMorePopular =
        morePopularMode
            && !(lookup instanceof WFSTCompletionLookup)
            && !(lookup instanceof AnalyzingSuggester);

    for (Token currentToken : options.tokens) {
      // Point the scratch reference at the token's term buffer (no copy).
      scratch.chars = currentToken.buffer();
      scratch.offset = 0;
      scratch.length = currentToken.length();

      // get more than the requested suggestions as a lot get collapsed by the corrections
      List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count * 10);
      if (suggestions == null || suggestions.isEmpty()) {
        continue;
      }

      if (!morePopularMode) {
        Collections.sort(suggestions);
      }

      // Insertion-ordered so that the non-popular mode can reuse the order established above.
      final LinkedHashMap<String, Integer> lhm = new LinkedHashMap<String, Integer>();
      for (LookupResult lr : suggestions) {
        String suggestion = lr.key.toString();
        if (this.suggestionAnalyzer != null) {
          String correction = getAnalyzerResult(suggestion);
          // multiple could map to the same, so don't repeat suggestions
          if (!isStringNullOrEmpty(correction)) {
            if (lhm.containsKey(correction)) {
              lhm.put(correction, lhm.get(correction) + (int) lr.value);
            } else {
              lhm.put(correction, (int) lr.value);
            }
          }
        } else {
          lhm.put(suggestion, (int) lr.value);
        }

        if (lhm.size() >= options.count) {
          break;
        }
      }

      // sort by new doc frequency
      Map<String, Integer> orderedMap = null;
      if (!morePopularMode) {
        // retain the sort order from above
        orderedMap = lhm;
      } else {
        orderedMap =
            new TreeMap<String, Integer>(
                new Comparator<String>() {
                  @Override
                  public int compare(String s1, String s2) {
                    int byFrequency = lhm.get(s2).compareTo(lhm.get(s1));
                    // A TreeMap treats keys that compare as 0 as the same key, so without a
                    // tie-break suggestions with equal frequency would overwrite each other.
                    return byFrequency != 0 ? byFrequency : s1.compareTo(s2);
                  }
                });
        orderedMap.putAll(lhm);
      }

      for (Map.Entry<String, Integer> entry : orderedMap.entrySet()) {
        res.add(currentToken, entry.getKey(), entry.getValue());
      }
    }
    return res;
  }