Exemplo n.º 1
0
  private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;

    // 先使用上次留下来的。
    Token nextToken = tokenQueue.poll();
    if (nextToken != null) {
      return nextToken;
    }

    /*//在 TokenUtils.nextToken 已经调用了 inc
    if(!input.incrementToken()) {
    	return null;
    }*/

    /*TermAttribute termAtt = (TermAttribute)input.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute)input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute)input.getAttribute(TypeAttribute.class);

    nextToken = reusableToken.reinit(termAtt.termBuffer(), 0, termAtt.termLength(), offsetAtt.startOffset(), offsetAtt.endOffset(), typeAtt.type());*/

    nextToken = TokenUtils.nextToken(input, reusableToken);

    if (nextToken != null
        && (Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
            || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
      final char[] buffer = nextToken.buffer();
      final int length = nextToken.length();
      byte lastType = (byte) Character.getType(buffer[0]); // 与上次的字符是否同类
      int termBufferOffset = 0;
      int termBufferLength = 0;
      for (int i = 0; i < length; i++) {
        byte type = (byte) Character.getType(buffer[i]);
        if (type <= Character.MODIFIER_LETTER) {
          type = Character.LOWERCASE_LETTER;
        }
        if (type != lastType) { // 与上一次的不同
          addToken(nextToken, termBufferOffset, termBufferLength, lastType);

          termBufferOffset += termBufferLength;
          termBufferLength = 0;

          lastType = type;
        }

        termBufferLength++;
      }
      if (termBufferLength > 0) { // 最后一次
        addToken(nextToken, termBufferOffset, termBufferLength, lastType);
      }
      nextToken = tokenQueue.poll();
    }

    return nextToken;
  }
Exemplo n.º 2
0
 public boolean incrementToken() throws IOException {
   clearAttributes();
   Token token = nextToken(reusableToken);
   if (token != null) {
     termAtt.copyBuffer(token.buffer(), 0, token.length());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     typeAtt.setType(token.type());
     return true;
   } else {
     end();
     return false;
   }
 }
Exemplo n.º 3
0
  /**
   * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
   * the tokens end up at the same position.
   *
   * <p>Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same
   * position) Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a
   * has posInc=n)
   */
  public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    ArrayList<Token> result = new ArrayList<Token>();
    if (lst1 == null || lst2 == null) {
      if (lst2 != null) result.addAll(lst2);
      if (lst1 != null) result.addAll(lst1);
      return result;
    }

    int pos = 0;
    Iterator<Token> iter1 = lst1.iterator();
    Iterator<Token> iter2 = lst2.iterator();
    Token tok1 = iter1.hasNext() ? iter1.next() : null;
    Token tok2 = iter2.hasNext() ? iter2.next() : null;
    int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
    int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
    while (tok1 != null || tok2 != null) {
      while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
        Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
        tok.copyBuffer(tok1.buffer(), 0, tok1.length());
        tok.setPositionIncrement(pos1 - pos);
        result.add(tok);
        pos = pos1;
        tok1 = iter1.hasNext() ? iter1.next() : null;
        pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
      }
      while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
        Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
        tok.copyBuffer(tok2.buffer(), 0, tok2.length());
        tok.setPositionIncrement(pos2 - pos);
        result.add(tok);
        pos = pos2;
        tok2 = iter2.hasNext() ? iter2.next() : null;
        pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
      }
    }
    return result;
  }
Exemplo n.º 4
0
 @Override
 public boolean incrementToken() throws IOException {
   if (index >= tokens.length) return false;
   else {
     clearAttributes();
     Token token = tokens[index++];
     termAtt.setEmpty().append(token);
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     posIncAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     typeAtt.setType(token.type());
     payloadAtt.setPayload(token.getPayload());
     return true;
   }
 }
 public Token next() throws IOException {
   Token candidate;
   while ((candidate = baseTokeniser.next()) != null) {
     try {
       Integer integer = Integer.valueOf(candidate.termText());
       String valueString = NumericEncoder.encode(integer.intValue());
       Token integerToken =
           new Token(
               valueString, candidate.startOffset(), candidate.startOffset(), candidate.type());
       return integerToken;
     } catch (NumberFormatException e) {
       // just ignore and try the next one
     }
   }
   return null;
 }
 public final Token next() throws IOException {
   Token localToken = this.input.next();
   if (localToken == null) return null;
   String str1 = localToken.termText();
   String str2 = localToken.type();
   if ((str2 == APOSTROPHE_TYPE) && ((str1.endsWith("'s")) || (str1.endsWith("'S"))))
     return new Token(
         str1.substring(0, str1.length() - 2),
         localToken.startOffset(),
         localToken.endOffset(),
         str2);
   if (str2 == ACRONYM_TYPE) {
     StringBuffer localStringBuffer = new StringBuffer();
     for (int i = 0; i < str1.length(); i++) {
       char c = str1.charAt(i);
       if (c != '.') localStringBuffer.append(c);
     }
     return new Token(
         localStringBuffer.toString(), localToken.startOffset(), localToken.endOffset(), str2);
   }
   return localToken;
 }
  /**
   * Returns the next token in the stream, or null at EOS.
   *
   * <p>Removes <tt>'s</tt> from the end of words.
   *
   * <p>Removes dots from acronyms.
   *
   * <p>Splits host names ...
   */
  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
    if (hostTokens == null) {
      org.apache.lucene.analysis.Token t = input.next();

      if (t == null) return null;

      String text = t.termText();
      String type = t.type();

      if (type == APOSTROPHE_TYPE
          && // remove 's
          (text.endsWith("'s") || text.endsWith("'S"))) {
        return new org.apache.lucene.analysis.Token(
            text.substring(0, text.length() - 2), t.startOffset(), t.endOffset(), type);

      } else if (type == ACRONYM_TYPE) { // remove dots
        StringBuffer trimmed = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
          char c = text.charAt(i);
          if (c != '.') trimmed.append(c);
        }
        return new org.apache.lucene.analysis.Token(
            trimmed.toString(), t.startOffset(), t.endOffset(), type);

      } else if (type == HOST_TYPE) {
        // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
        // There must be at least two tokens ....
        hostTokens = new LinkedList<org.apache.lucene.analysis.Token>();
        StringTokenizer tokeniser = new StringTokenizer(text, ".");
        int start = t.startOffset();
        int end;
        while (tokeniser.hasMoreTokens()) {
          String token = tokeniser.nextToken();
          end = start + token.length();
          hostTokens.offer(new org.apache.lucene.analysis.Token(token, start, end, ALPHANUM_TYPE));
          start = end + 1;
        }
        // check if we have an acronym ..... yes a.b.c ends up here ...

        if (text.length() == hostTokens.size() * 2 - 1) {
          hostTokens = null;
          // acronym
          StringBuffer trimmed = new StringBuffer();
          for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c != '.') trimmed.append(c);
          }
          return new org.apache.lucene.analysis.Token(
              trimmed.toString(), t.startOffset(), t.endOffset(), ALPHANUM_TYPE);
        } else {
          return hostTokens.remove();
        }
      } else {
        return t;
      }
    } else {
      org.apache.lucene.analysis.Token token = hostTokens.remove();
      if (hostTokens.isEmpty()) {
        hostTokens = null;
      }
      return token;
    }
  }