public Token next() throws IOException {
   Token candidate;
   while ((candidate = baseTokeniser.next()) != null) {
     try {
        // Emit only tokens whose text parses as an integer, re-encoded via NumericEncoder.
        Integer integer = Integer.valueOf(candidate.termText());
        String valueString = NumericEncoder.encode(integer.intValue());
        Token integerToken =
            new Token(
                valueString, candidate.startOffset(), candidate.endOffset(), candidate.type());
        return integerToken;
      } catch (NumberFormatException e) {
        // not an integer: skip it and try the next token
     }
   }
   return null;
 }
  public final Token next() throws IOException {
    Token token = this.input.next();
    if (token == null) return null;
    String text = token.termText();
    String type = token.type();
    // Strip the trailing 's (possessive) from words tagged with the apostrophe type.
    // The == comparison relies on the tokenizer reusing the same interned type String constants.
    if ((type == APOSTROPHE_TYPE) && (text.endsWith("'s") || text.endsWith("'S")))
      return new Token(
          text.substring(0, text.length() - 2),
          token.startOffset(),
          token.endOffset(),
          type);
    // Remove the dots from acronyms, e.g. "I.B.M." -> "IBM".
    if (type == ACRONYM_TYPE) {
      StringBuffer trimmed = new StringBuffer();
      for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c != '.') trimmed.append(c);
      }
      return new Token(trimmed.toString(), token.startOffset(), token.endOffset(), type);
    }
    return token;
  }
Example No. 3
 public boolean incrementToken() throws IOException {
   clearAttributes();
   Token token = nextToken(reusableToken);
   if (token != null) {
     termAtt.copyBuffer(token.buffer(), 0, token.length());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     typeAtt.setType(token.type());
     return true;
   } else {
     end();
     return false;
   }
 }
 /**
  * For languages with canonical form
  *
  * @return canonical token (or null if none)
  */
  public Token canonizeToken(Token t) {
    if (!hasCanonicalFilter) return null;
    if (lang.equals("sr")) {
      String nt = new SerbianFilter(null).convert(t.termText());
      // Compare the converted text with the original term text (comparing the Token object
      // itself to a String would never match) and emit an alias only when the text changed.
      if (!t.termText().equals(nt)) {
        Token tt = new Token(nt, t.startOffset(), t.endOffset());
        tt.setPositionIncrement(0);
        tt.setType("alias");
        return tt;
      }
    }
    return null;
  }
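A minimal, self-contained sketch (not from the original source) of the alias-token pattern that canonizeToken produces: a converted form stacked at the same position as the original via positionIncrement 0. The Cyrillic/Latin pair and the class name below are purely illustrative, SerbianFilter itself is not used, and an older Lucene that still ships org.apache.lucene.analysis.Token is assumed.

import org.apache.lucene.analysis.Token;

public class AliasTokenDemo {
  public static void main(String[] args) {
    // Hypothetical original token at offsets [0,5].
    Token original = new Token("народ", 0, 5);
    // The canonical form, emitted the way canonizeToken does: same offsets, posInc 0, type "alias".
    Token alias = new Token("narod", original.startOffset(), original.endOffset());
    alias.setPositionIncrement(0); // stacked on the same position as the original term
    alias.setType("alias");
    System.out.println(original + " -> " + alias + " (posInc=" + alias.getPositionIncrement() + ")");
  }
}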
Example No. 5
   /**
    * Merge two lists of tokens, producing a single list with manipulated positionIncrements so
    * that the tokens end up at the same position.
    *
    * <p>Example: [a b] merged with [c d] produces [a/c b/d] ('/' denotes tokens in the same
    * position).
    *
    * <p>Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has
    * posInc=n). A small usage sketch follows the method.
    */
  public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    ArrayList<Token> result = new ArrayList<Token>();
    if (lst1 == null || lst2 == null) {
      if (lst2 != null) result.addAll(lst2);
      if (lst1 != null) result.addAll(lst1);
      return result;
    }

    int pos = 0;
    Iterator<Token> iter1 = lst1.iterator();
    Iterator<Token> iter2 = lst2.iterator();
    Token tok1 = iter1.hasNext() ? iter1.next() : null;
    Token tok2 = iter2.hasNext() ? iter2.next() : null;
    int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
    int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
    while (tok1 != null || tok2 != null) {
      while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
        Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
        tok.copyBuffer(tok1.buffer(), 0, tok1.length());
        tok.setPositionIncrement(pos1 - pos);
        result.add(tok);
        pos = pos1;
        tok1 = iter1.hasNext() ? iter1.next() : null;
        pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
      }
      while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
        Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
        tok.copyBuffer(tok2.buffer(), 0, tok2.length());
        tok.setPositionIncrement(pos2 - pos);
        result.add(tok);
        pos = pos2;
        tok2 = iter2.hasNext() ? iter2.next() : null;
        pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
      }
    }
    return result;
  }
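A minimal usage sketch for mergeTokens (not part of the original snippet): the enclosing class name TokenUtils is an assumption, as is an older Lucene that still ships org.apache.lucene.analysis.Token. It reproduces the first javadoc example.

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.Token;

public class MergeTokensDemo {
  public static void main(String[] args) {
    // Two streams of single-position tokens, [a b] and [c d], each with the default posInc of 1.
    Token a = new Token("a", 0, 1);
    Token b = new Token("b", 2, 3);
    Token c = new Token("c", 0, 1);
    Token d = new Token("d", 2, 3);

    List<Token> merged = TokenUtils.mergeTokens(Arrays.asList(a, b), Arrays.asList(c, d));
    for (Token t : merged) {
      System.out.println(t + " posInc=" + t.getPositionIncrement());
    }
    // Expected: a and c share the first position and b and d the second ([a/c b/d]),
    // so the second token emitted at each position carries posInc=0.
  }
}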
Example No. 6
 @Override
 public boolean incrementToken() throws IOException {
   if (index >= tokens.length) return false;
   else {
     clearAttributes();
     Token token = tokens[index++];
     termAtt.setEmpty().append(token);
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     posIncAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     typeAtt.setType(token.type());
     payloadAtt.setPayload(token.getPayload());
     return true;
   }
 }
Example No. 7
  private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    Token token =
        new Token(
            oriToken.buffer(),
            termBufferOffset,
            termBufferLength,
            oriToken.startOffset() + termBufferOffset,
            oriToken.startOffset() + termBufferOffset + termBufferLength);

    if (type == Character.DECIMAL_DIGIT_NUMBER) {
      token.setType(Word.TYPE_DIGIT);
    } else {
      token.setType(Word.TYPE_LETTER);
    }

    tokenQueue.offer(token);
  }
Example No. 8
 @Override
 public boolean incrementToken() {
   if (upto < tokens.length) {
     final Token token = tokens[upto++];
     // TODO: can we just capture/restoreState so
     // we get all attrs...?
     clearAttributes();
     termAtt.setEmpty();
     termAtt.append(token.toString());
     posIncrAtt.setPositionIncrement(token.getPositionIncrement());
     posLengthAtt.setPositionLength(token.getPositionLength());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
     payloadAtt.setPayload(token.getPayload());
     return true;
   } else {
     return false;
   }
 }
Example No. 9
 @Override
 public boolean incrementToken() throws IOException {
   if (tokens == null) {
     fillTokens();
   }
   // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
   if (upto == tokens.size()) {
     // System.out.println("  END @ " + tokens.size());
     return false;
   }
   final Token t = tokens.get(upto++);
   // System.out.println("  return token=" + t);
   clearAttributes();
   termAtt.append(t.toString());
   offsetAtt.setOffset(t.startOffset(), t.endOffset());
   posIncrAtt.setPositionIncrement(t.getPositionIncrement());
   posLengthAtt.setPositionLength(t.getPositionLength());
   return true;
 }
Example No. 10
    public Token next(Token reusableToken) throws IOException {
      Token token = reusableToken;

      if (inPhrase) {
        inPhrase = false;
        token.setTermBuffer("phrase2");
        token.setStartOffset(savedStart);
        token.setEndOffset(savedEnd);
        return reusableToken;
      } else
        while ((token = this.input.next(reusableToken)) != null) {
          if (token.term().equals("phrase")) {
            inPhrase = true;
            savedStart = token.startOffset();
            savedEnd = token.endOffset();
            token.setTermBuffer("phrase1");
            token.setStartOffset(savedStart);
            token.setEndOffset(savedEnd);
            return token;
          } else if (!token.term().equals("stop")) return token;
        }

      return null;
    }
  /**
   * Returns the next token in the stream, or null at EOS.
   *
   * <p>Removes <tt>'s</tt> from the end of words.
   *
   * <p>Removes dots from acronyms.
   *
   * <p>Splits host names ...
   */
  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
    if (hostTokens == null) {
      org.apache.lucene.analysis.Token t = input.next();

      if (t == null) return null;

      String text = t.termText();
      String type = t.type();

      if (type == APOSTROPHE_TYPE
          && // remove 's
          (text.endsWith("'s") || text.endsWith("'S"))) {
        return new org.apache.lucene.analysis.Token(
            text.substring(0, text.length() - 2), t.startOffset(), t.endOffset(), type);

      } else if (type == ACRONYM_TYPE) { // remove dots
        StringBuffer trimmed = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
          char c = text.charAt(i);
          if (c != '.') trimmed.append(c);
        }
        return new org.apache.lucene.analysis.Token(
            trimmed.toString(), t.startOffset(), t.endOffset(), type);

      } else if (type == HOST_TYPE) {
        // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
        // There must be at least two tokens ....
        hostTokens = new LinkedList<org.apache.lucene.analysis.Token>();
        StringTokenizer tokeniser = new StringTokenizer(text, ".");
        int start = t.startOffset();
        int end;
        while (tokeniser.hasMoreTokens()) {
          String token = tokeniser.nextToken();
          end = start + token.length();
          hostTokens.offer(new org.apache.lucene.analysis.Token(token, start, end, ALPHANUM_TYPE));
          start = end + 1;
        }
        // Check whether the whole text is actually an acronym (e.g. "a.b.c" ends up here).

        if (text.length() == hostTokens.size() * 2 - 1) {
          hostTokens = null;
          // acronym
          StringBuffer trimmed = new StringBuffer();
          for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c != '.') trimmed.append(c);
          }
          return new org.apache.lucene.analysis.Token(
              trimmed.toString(), t.startOffset(), t.endOffset(), ALPHANUM_TYPE);
        } else {
          return hostTokens.remove();
        }
      } else {
        return t;
      }
    } else {
      org.apache.lucene.analysis.Token token = hostTokens.remove();
      if (hostTokens.isEmpty()) {
        hostTokens = null;
      }
      return token;
    }
  }
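A minimal, self-contained sketch (not from the original source) of just the host-splitting step above, showing how offsets are assigned to the sub-tokens. The class name and sample input are illustrative, the literal "<ALPHANUM>" stands in for ALPHANUM_TYPE, and an older Lucene that still ships org.apache.lucene.analysis.Token is assumed.

import java.util.LinkedList;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.Token;

public class HostSplitDemo {
  public static void main(String[] args) {
    String text = "www.example.com";
    int start = 10; // start offset of the whole <HOST> token within the field
    LinkedList<Token> parts = new LinkedList<Token>();
    StringTokenizer tokeniser = new StringTokenizer(text, ".");
    while (tokeniser.hasMoreTokens()) {
      String part = tokeniser.nextToken();
      int end = start + part.length();
      parts.offer(new Token(part, start, end, "<ALPHANUM>"));
      start = end + 1; // skip the '.' separator
    }
    // Expected offsets: www [10,13], example [14,21], com [22,25]
    for (Token t : parts) {
      System.out.println(t + " [" + t.startOffset() + "," + t.endOffset() + "]");
    }
  }
}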
 private void applyToken(Token token) {
   termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
   posAtt.setPositionIncrement(token.getPositionIncrement());
   offsetAtt.setOffset(token.startOffset(), token.endOffset());
 }