Example #1
0
  /**
   * Passes tokens through from the wrapped stream, emitting two tokens for a URI token that
   * {@code isMailtoScheme()} accepts: first the text after the leading 7 characters, then (on
   * the next call, when {@code isMailto} is set) the full content of {@code termBuffer} with a
   * position increment of 0.
   *
   * <p>NOTE(review): {@code isMailto} is never set to true in this method; presumably
   * {@code isMailtoScheme()} or {@code updateBuffer()} does that - confirm.
   *
   * @return true if a token was produced, false when the wrapped stream is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (isMailto) {
      // Pending second token: the whole buffered text, stacked on the previous position.
      termAtt.setEmpty();
      isMailto = false;
      posIncrAtt.setPositionIncrement(0);
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    final String tokenType = typeAtt.type();
    final boolean isUri = tokenType.equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
    if (isUri && this.isMailtoScheme()) {
      // Presumably the length of the "mailto:" prefix - TODO confirm.
      final int schemeLength = 7;
      this.updateBuffer();
      termBuffer.put(termAtt.buffer(), 0, termAtt.length());
      // First token of the pair: only the part after the scheme.
      posIncrAtt.setPositionIncrement(1);
      termAtt.copyBuffer(termBuffer.array(), schemeLength, termBuffer.position() - schemeLength);
    }
    return true;
  }
  /**
   * Returns each incoming token unchanged and then, on the following calls, one extra token per
   * buffer that {@code permuteTerms} produced for it.
   *
   * <p>Queued permutations are drained from {@code terms} before the next input token is
   * pulled; each is emitted with a position increment of 1.
   *
   * @return true if a token was produced, false when the input and the queue are both exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!terms.isEmpty()) {
      final char[] buffer = terms.poll();
      termAttribute.setEmpty();
      termAttribute.copyBuffer(buffer, 0, buffer.length);
      posIncAttr.setPositionIncrement(1);
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    // Queue the permutations of the current term. The whole backing array of each buffer is
    // queued, exactly as before.
    // NOTE(review): array() ignores the buffer's position/limit - confirm permuteTerms returns
    // exactly-sized buffers.
    for (CharBuffer permutation : permuteTerms(termAttribute.buffer(), 0, termAttribute.length())) {
      terms.add(permutation.array());
    }

    // we return true and leave the original token unchanged
    return true;
  }
Example #3
0
  /**
   * Recursively appends characters read from {@code input} to {@code termAtt} until
   * {@code ngramSize} characters have been collected.
   *
   * <p>A space restarts collection through the no-argument {@code getNextToken()}. Reaching the
   * end of the input empties the term and reports failure.
   *
   * @param pos number of characters collected so far; must not be negative
   * @return true once {@code pos} equals {@code ngramSize}, false if the input ended first
   * @throws IOException if reading from the input fails
   */
  private boolean getNextToken(final int pos) throws IOException {
    assert pos >= 0;

    if (pos == ngramSize) {
      return true;
    }

    final int ich = input.read();
    if (ich == -1) {
      termAtt.setEmpty();
      return false;
    }

    final char ch = (char) ich;
    if (ch == ' ') {
      return getNextToken();
    }

    termAtt.append(ch);
    return getNextToken(pos + 1);
  }
  /**
   * Emits each input token unchanged, followed by the sub-tokens that {@code decompose()} queued
   * for it in {@code tokens}, each at position increment 0.
   *
   * @return true if a token was produced, false when the input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (!tokens.isEmpty()) {
      assert current != null;
      final CompoundToken part = tokens.removeFirst();
      // Start from the captured original so every other attribute stays as it was.
      restoreState(current);
      termAtt.setEmpty().append(part.txt);
      offsetAtt.setOffset(part.startOffset, part.endOffset);
      posIncAtt.setPositionIncrement(0);
      return true;
    }

    // Not strictly required, kept as a safety net.
    current = null;
    if (!input.incrementToken()) {
      return false;
    }

    // Terms shorter than minWordSize are never decomposed.
    final boolean longEnough = termAtt.length() >= this.minWordSize;
    if (longEnough) {
      decompose();
      // Capturing state is only worth it when sub-tokens will actually be emitted.
      if (!tokens.isEmpty()) {
        current = captureState();
      }
    }
    // The original token is always returned first.
    return true;
  }
 /**
  * Emits one token per character read from {@code input}.
  *
  * @return true if a character was read, false when {@code read()} returns a negative value
  * @throws IOException if reading from the input fails
  */
 @Override
 public final boolean incrementToken() throws IOException {
   final int next = input.read();
   if (next >= 0) {
     clearAttributes();
     termAtt.setEmpty().append((char) next);
     return true;
   }
   return false;
 }
 /**
  * Emits the slice of {@code snippet} selected by {@code getNextPartialSnippet()}, that is
  * {@code lenTerm} characters starting at {@code startTerm}, with corrected offsets derived
  * from {@code startOffset}.
  *
  * @return true if a token was produced, false when no further partial snippet is available
  * @throws IOException if fetching the next partial snippet fails
  */
 @Override
 public boolean incrementToken() throws IOException {
   final boolean hasSnippet = getNextPartialSnippet();
   if (hasSnippet) {
     clearAttributes();
     final int termEnd = startTerm + lenTerm;
     termAtt.setEmpty().append(snippet, startTerm, termEnd);
     final int from = correctOffset(startOffset);
     final int to = correctOffset(startOffset + lenTerm);
     offsetAtt.setOffset(from, to);
   }
   return hasSnippet;
 }
  /**
   * Produces the next token from {@code str} using {@code matcher}.
   *
   * <p>When {@code group >= 0}, each non-empty occurrence of that capture group becomes a token.
   * Otherwise the text between matches becomes the tokens, in the manner of
   * {@code String.split()}, including any non-empty tail after the last match. Once the input
   * is used up, {@code index} is set to {@code Integer.MAX_VALUE} so later calls return false
   * immediately.
   *
   * @return true if a token was produced, false when no further token exists
   * @throws IOException declared by the overridden method; nothing here reads from a stream
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (index >= str.length()) {
      return false;
    }
    clearAttributes();
    if (group >= 0) {

      // match a specific group
      while (matcher.find()) {
        final String match = matcher.group(group);
        // group(int) returns null when the pattern matched but this group took no part in it
        // (e.g. the other branch of an alternation). Treat that like an empty match instead
        // of failing with a NullPointerException.
        if (match == null || match.length() == 0) {
          continue;
        }
        termAtt.setEmpty().append(match);
        index = matcher.start(group);
        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
        return true;
      }

      index = Integer.MAX_VALUE; // mark exhausted
      return false;

    } else {

      // String.split() functionality
      while (matcher.find()) {
        if (matcher.start() - index > 0) {
          // found a non-zero-length token
          termAtt.setEmpty().append(str, index, matcher.start());
          offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
          index = matcher.end();
          return true;
        }

        // Adjacent or leading delimiter: skip past it without emitting an empty token.
        index = matcher.end();
      }

      if (str.length() - index == 0) {
        index = Integer.MAX_VALUE; // mark exhausted
        return false;
      }

      // Whatever follows the last delimiter is the final token.
      termAtt.setEmpty().append(str, index, str.length());
      offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
      index = Integer.MAX_VALUE; // mark exhausted
      return true;
    }
  }
  /**
   * Consumes {@code in} and returns its terms joined by single spaces.
   *
   * <p>Before {@code reset()} and after every token, the attributes are cleared and the term is
   * overwritten with a bogus value, so a stream that relies on state left over from the previous
   * token shows up in the output. The stream is closed even when consuming it fails.
   *
   * <p>TODO: rewrite tests not to use string comparison.
   *
   * @param in the stream to consume; it is reset, read to exhaustion and closed
   * @return the space-separated terms, or an empty string if the stream produced none
   * @throws IOException if the stream fails while being reset, read or closed
   */
  private static String tsToString(TokenStream in) throws IOException {
    final StringBuilder out = new StringBuilder();
    try {
      final CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
      // extra safety to enforce, that the state is not preserved and also
      // assign bogus values
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      in.reset();
      while (in.incrementToken()) {
        if (out.length() > 0) {
          out.append(' ');
        }
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
      }
    } finally {
      // Previously the stream leaked whenever reset() or incrementToken() threw.
      in.close();
    }
    return out.toString();
  }
 /**
  * Emits one token per entry of {@code prefixes}, each being the prefix followed by
  * {@code separator}, and afterwards delegates to the wrapped stream.
  *
  * <p>The first prefix gets position increment 1, the remaining ones 0.
  *
  * @return true for every prefix token; once the prefixes are used up, whatever the wrapped
  *     stream returns
  * @throws IOException if the wrapped stream fails
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (currentPrefix == null) {
     // First call: start walking the prefixes.
     currentPrefix = prefixes.iterator();
     termAttr.setEmpty();
     posAttr.setPositionIncrement(1);
     assert (currentPrefix.hasNext()) : "one or more prefixes needed";
   } else if (currentPrefix.hasNext()) {
     // Further prefixes are stacked on the same position as the first.
     posAttr.setPositionIncrement(0);
   } else {
     return input.incrementToken();
   }

   termAttr.setEmpty();
   termAttr.append(currentPrefix.next());
   termAttr.append(separator);
   return true;
 }
  /**
   * Emits one token per term returned by {@code termsEnum.next()}, decoded with
   * {@code utf8ToString()}.
   *
   * @return true if a term was emitted, false when {@code termsEnum.next()} returns null
   * @throws IOException if advancing the terms enumeration fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    final BytesRef next = termsEnum.next();
    if (next != null) {
      charTerm.setEmpty();
      charTerm.append(next.utf8ToString());
      return true;
    }
    return false;
  }
 /**
  * Rewrites the input stream: a token "phrase" becomes the two tokens "phrase1" and "phrase2"
  * (both carrying the offsets of the original), tokens equal to "stop" are dropped, and
  * everything else passes through.
  *
  * @return true if a token was produced, false when the wrapped stream is exhausted
  * @throws IOException if the wrapped stream fails
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (inPhrase) {
     // Second half of an expanded "phrase" token.
     inPhrase = false;
     clearAttributes();
     termAtt.setEmpty().append("phrase2");
     offsetAtt.setOffset(savedStart, savedEnd);
     return true;
   }

   while (input.incrementToken()) {
     final String text = termAtt.toString();
     if (text.equals("phrase")) {
       inPhrase = true;
       savedStart = offsetAtt.startOffset();
       savedEnd = offsetAtt.endOffset();
       termAtt.setEmpty().append("phrase1");
       offsetAtt.setOffset(savedStart, savedEnd);
       return true;
     }
     if (!text.equals("stop")) {
       return true;
     }
     // "stop" tokens are skipped; keep pulling.
   }
   return false;
 }
  /**
   * Removes the head of {@code morphQueue} and copies its term, offsets and token into the
   * attributes.
   *
   * @param isFirst true for the first token taken from the queue; in that case the current
   *     position increment is left as is, and the state is captured when more entries remain
   */
  private void setAttributesFromQueue(boolean isFirst) {
    final KoreanToken head = morphQueue.removeFirst();
    if (isFirst && !morphQueue.isEmpty()) {
      // More entries are waiting (e.g. after decompounding), so capture the state for them.
      // The term is emptied first so the captured state carries no term text to copy later.
      termAtt.setEmpty();
      currentState = captureState();
    }

    termAtt.setEmpty().append(head.getTerm());
    final int start = head.getOffset();
    offsetAtt.setOffset(start, start + head.getLength());
    morphAtt.setToken(head);

    // Only follow-up tokens take their increment from the queue entry.
    if (!isFirst) {
      posIncrAtt.setPositionIncrement(head.getPosInc());
    }

    // TODO: How to handle PositionLengthAttribute correctly?
  }
 /**
  * Emits a fixed sequence driven by {@code upto}: "a" and "b" with position increment 1, then
  * "c" and "d" with position increment 0, and stops once {@code upto} reaches 4.
  *
  * @return true while {@code upto} is not 4, false afterwards
  */
 @Override
 public boolean incrementToken() {
   clearAttributes();
   if (upto == 4) {
     return false;
   }

   final boolean advancesPosition = upto == 0 || upto == 1;
   final String text = upto == 0 ? "a" : upto == 1 ? "b" : upto == 2 ? "c" : "d";

   posIncr.setPositionIncrement(advancesPosition ? 1 : 0);
   term.setEmpty().append(text);
   upto++;
   return true;
 }
  /**
   * Emits the next token from {@code tokenIteractor}, refilling {@code buffer} from
   * {@code input} and running {@code knife.dissect} again whenever the iterator is missing or
   * exhausted.
   *
   * @return true if a token was produced; false when the leftover characters plus the number of
   *     characters read is negative
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    // The tokens in tokenIteractor are used up, so keep requesting data from the reader
    while (tokenIteractor == null || !tokenIteractor.hasNext()) {
      // System.out.println(dissected);
      int read = 0;
      int remainning = -1; // number of characters still left in buffer before reading from the reader again; a negative value means no read is needed right now
      if (dissected >= beef.length()) {
        remainning = 0;
      } else if (dissected < 0) {
        remainning = bufferLength + dissected;
      }
      if (remainning >= 0) {
        if (remainning > 0) {
          // move the leftover tail to the front of the buffer
          System.arraycopy(buffer, -dissected, buffer, 0, remainning);
        }
        read = input.read(buffer, remainning, bufferLength - remainning);
        // NOTE(review): read is added unchecked, so a -1 end-of-input result lowers inputLength
        inputLength += read;
        int charCount = remainning + read;
        if (charCount < 0) {
          // the reader is exhausted (the original comment refers to returning null from an
          // older next() interface)
          return false;
        }
        // NOTE(review): with remainning > 0 and read == -1, charCount is remainning - 1 here,
        // so the write below overwrites the last leftover character - confirm this is intended
        if (charCount < bufferLength) {
          buffer[charCount++] = 0;
        }
        // build the "beef" and let the knife "dissect" it
        beef.set(0, charCount);
        offset += Math.abs(dissected);
        // offset -= remainning;
        dissected = 0;
      }
      dissected = knife.dissect(this, beef, dissected);
      // offset += read;// !!!
      tokenIteractor = tokenCollector.iterator();
    }

    if (tokenIteractor.hasNext()) {
      // return the next Token from tokenIteractor
      Token token = tokenIteractor.next();
      termAtt.setEmpty();
      termAtt.append(token.charSequence());
      offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
      // NOTE(review): the position increment is set to the token's END OFFSET, which looks
      // unintended - confirm against the indexing expectations
      positionIncrementAttribute.setPositionIncrement(token.endOffset());
      return true;
    }
    // Only reached if hasNext() turned false after the loop above saw it true.
    return tokenIteractor.hasNext();
  }
Example #15
0
 /**
  * Replays the next entry of {@code tokens}, copying its term, offsets, position increment,
  * flags, type and payload into the attributes.
  *
  * @return true if an entry was replayed, false once {@code index} has reached the end
  * @throws IOException declared by the overridden method; nothing here reads from a stream
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (index < tokens.length) {
     clearAttributes();
     final Token next = tokens[index++];
     termAtt.setEmpty().append(next);
     offsetAtt.setOffset(next.startOffset(), next.endOffset());
     posIncAtt.setPositionIncrement(next.getPositionIncrement());
     flagsAtt.setFlags(next.getFlags());
     typeAtt.setType(next.type());
     payloadAtt.setPayload(next.getPayload());
     return true;
   }
   return false;
 }
  /**
   * Advances the wrapped stream and replaces the term with its stem, unless the token is marked
   * as a keyword.
   *
   * @return true if a token is available, false at the end of the stream
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      return true;
    }

    final String original = termAtt.toString();
    final String stemmed = stemmer.stem(original);
    // Leave the term alone when stemming gave nothing new.
    if (stemmed != null && !stemmed.equals(original)) {
      termAtt.setEmpty().append(stemmed);
    }
    return true;
  }
 /**
  * Builds the next gram by joining the terms of consecutive {@code inputWindow} entries with
  * {@code tokenSeparator}, and emits it unless every entry used is a filler.
  *
  * <p>The gram text accumulates in {@code gramBuilder} across calls: when the window was not
  * shifted, the text built for the previous gram size is extended instead of rebuilt.
  *
  * @return true if a gram was emitted on this call, false otherwise
  * @throws IOException presumably from {@code shiftInputWindow()} reading the input - confirm
  */
 @Override
 public boolean incrementToken() throws IOException {
   boolean tokenAvailable = false;
   int builtGramSize = 0;
   // Shift the window and start a fresh gram, or continue from the previously built gram.
   if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
     shiftInputWindow();
     gramBuilder.setLength(0);
   } else {
     builtGramSize = gramSize.getPreviousValue();
   }
   if (inputWindow.size() >= gramSize.getValue()) {
     boolean isAllFiller = true;
     InputWindowToken nextToken = null;
     Iterator<InputWindowToken> iter = inputWindow.iterator();
     for (int gramNum = 1; iter.hasNext() && builtGramSize < gramSize.getValue(); ++gramNum) {
       nextToken = iter.next();
       // Append only the window entries that gramBuilder does not contain yet.
       if (builtGramSize < gramNum) {
         if (builtGramSize > 0) {
           gramBuilder.append(tokenSeparator);
         }
         gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
         ++builtGramSize;
       }
       // While only fillers have been seen, a gram that ends on this entry is passed over by
       // advancing gramSize.
       if (isAllFiller && nextToken.isFiller) {
         if (gramNum == gramSize.getValue()) {
           gramSize.advance();
         }
       } else {
         isAllFiller = false;
       }
     }
     if (!isAllFiller && builtGramSize == gramSize.getValue()) {
       // Start from the attributes of the first window entry, then overwrite what differs.
       inputWindow.getFirst().attSource.copyTo(this);
       // Only the first gram emitted at a position (isOutputHere still false) gets increment 1.
       posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
       termAtt.setEmpty().append(gramBuilder);
       if (gramSize.getValue() > 1) {
         typeAtt.setType(tokenType);
         noShingleOutput = false;
       }
       // The gram runs from the first entry's start offset to the last used entry's end offset.
       offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
       posLenAtt.setPositionLength(builtGramSize);
       isOutputHere = true;
       gramSize.advance();
       tokenAvailable = true;
     }
   }
   return tokenAvailable;
 }
  /**
   * Advances the wrapped stream and replaces the term with its lemma when the lemmatizer
   * supplies one, the token is not a keyword and the lemma differs from the term.
   *
   * @return true if a token is available, false when the wrapped stream is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    // The lemmatizer is consulted for every token, keyword or not.
    final Optional<CharSequence> lemma = lemmatizer.lemmatize(termAtt);
    final boolean shouldReplace =
        lemma.isPresent()
            && !keywordAttr.isKeyword()
            && !equalCharSequences(lemma.get(), termAtt);
    if (shouldReplace) {
      termAtt.setEmpty().append(lemma.get());
    }

    return true;
  }
  /**
   * Reads characters from {@code input} into {@code buffer} until a sentence boundary and emits
   * the collected text as one token of type "sentence".
   *
   * <p>A boundary is the end of the input, a character contained in {@code PUNCTION} (which is
   * kept in the token), or two consecutive characters contained in {@code SPACES}. Characters
   * from {@code SPACES} at the very start are skipped. {@code tokenStart} and {@code tokenEnd}
   * count the characters consumed and provide the offsets.
   *
   * @return true if any text was collected, false otherwise
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    buffer.setLength(0);
    int ci;
    char ch, pch;
    boolean atBegin = true;
    // The new token starts where the previous one ended.
    tokenStart = tokenEnd;
    ci = input.read();
    // ci is -1 at end of input; the loop tests ci itself before ch is used.
    ch = (char) ci;

    while (true) {
      if (ci == -1) {
        break;
      } else if (PUNCTION.indexOf(ch) != -1) {
        // End of a sentence
        buffer.append(ch);
        tokenEnd++;
        break;
      } else if (atBegin && SPACES.indexOf(ch) != -1) {
        // Skip leading whitespace, moving both offsets past it.
        tokenStart++;
        tokenEnd++;
        ci = input.read();
        ch = (char) ci;
      } else {
        buffer.append(ch);
        atBegin = false;
        tokenEnd++;
        pch = ch;
        ci = input.read();
        ch = (char) ci;
        // Two spaces, such as CR, LF
        if (SPACES.indexOf(ch) != -1 && SPACES.indexOf(pch) != -1) {
          // buffer.append(ch);
          // The second whitespace character is consumed and counted, but not stored.
          tokenEnd++;
          break;
        }
      }
    }
    if (buffer.length() == 0) return false;
    else {
      termAtt.setEmpty().append(buffer);
      offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
      typeAtt.setType("sentence");
      return true;
    }
  }
    /**
     * Passes tokens through and, after every token whose term is "国", injects an extra token
     * "國" at position increment 0.
     *
     * @return true if a token was produced, false when the wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
      if (!addSynonym) {
        final boolean hasToken = input.incrementToken();
        if (hasToken) {
          // Remember whether the next call has to inject the synonym.
          addSynonym = termAtt.toString().equals("国");
        }
        return hasToken;
      }

      // inject our synonym
      clearAttributes();
      termAtt.setEmpty().append("國");
      posIncAtt.setPositionIncrement(0);
      addSynonym = false;
      return true;
    }
Example #21
0
 /**
  * Replays the next entry of {@code tokens}, copying its text, position increment, position
  * length, offsets and payload into the attributes.
  *
  * @return true if an entry was replayed, false once {@code upto} has reached the end
  */
 @Override
 public boolean incrementToken() {
   if (upto >= tokens.length) {
     return false;
   }

   final Token next = tokens[upto++];
   // TODO: can we just capture/restoreState so
   // we get all attrs...?
   clearAttributes();
   termAtt.setEmpty();
   termAtt.append(next.toString());
   posIncrAtt.setPositionIncrement(next.getPositionIncrement());
   posLengthAtt.setPositionLength(next.getPositionLength());
   offsetAtt.setOffset(next.startOffset(), next.endOffset());
   payloadAtt.setPayload(next.getPayload());
   return true;
 }
  /**
   * Publishes {@code token} as the current term and records it in {@code lastEmitted}.
   *
   * <p>When {@code replaceWhitespaceWith} is set, the token first goes through
   * {@code replaceWhiteSpace}. If an offset attribute exists and its end offset is at least the
   * token length, the start offset is moved to {@code end - length}. If a position increment
   * attribute exists, {@code positionIncr} is incremented and applied.
   *
   * @param token the characters to emit; logged as passed in, before any whitespace replacement
   */
  private void emit(char[] token) {
    Log.debug("emit: " + new String(token));
    final char[] emitted = (replaceWhitespaceWith != null) ? replaceWhiteSpace(token) : token;

    final CharTermAttribute termAttr = getTermAttribute();
    termAttr.setEmpty();
    termAttr.append(new StringBuilder().append(emitted));

    final OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null) {
      final int end = offAttr.endOffset();
      if (end >= emitted.length) {
        // Anchor the token so that it ends at the current end offset.
        offAttr.setOffset(end - emitted.length, end);
      }
    }

    final PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
      positionIncr++;
      pia.setPositionIncrement(positionIncr);
    }

    lastEmitted = emitted;
  }
Example #23
0
 /**
  * Starts collecting a fresh token: empties {@code termAtt} and delegates to
  * {@code getNextToken(int)} at position 0.
  *
  * @return whatever {@code getNextToken(0)} returns
  * @throws IOException if reading from the input fails
  */
 private boolean getNextToken() throws IOException {
   final int firstPosition = 0;
   termAtt.setEmpty();
   final boolean found = getNextToken(firstPosition);
   return found;
 }
 /**
  * Creates a stream whose term attribute holds {@code text}.
  *
  * @param text the text placed into the term attribute
  * @param encoder the encoder stored in the {@code encoder} field
  * @throws IOException declared by the signature; nothing in this body performs I/O
  */
 public DataTokenStream(String text, IntEncoder encoder) throws IOException {
   term.setEmpty();
   term.append(text);
   this.encoder = encoder;
 }
  /**
   * Reads the whole input once and emits a single token built from it.
   *
   * <p>Characters below 128 are copied as they are. For any other character the first entry
   * returned by {@code PinyinHelper.toHanyuPinyinStringArray} is appended (followed by
   * {@code padding_char} when that is non-empty) and its first letter is collected separately;
   * characters for which the helper returns null are dropped. {@code first_letter} then decides
   * how the two parts are combined: "prefix", "append", "none" or "only". Any other value
   * leaves the term empty.
   *
   * @return true on the first call, false on every later call
   * @throws IOException if reading from the input fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    if (done) {
      return false;
    }
    done = true;

    // Pull the entire input into the term buffer, growing it whenever it fills up.
    int upto = 0;
    char[] buffer = termAtt.buffer();
    int length;
    while ((length = input.read(buffer, upto, buffer.length - upto)) != -1) {
      upto += length;
      if (upto == buffer.length) {
        buffer = termAtt.resizeBuffer(1 + buffer.length);
      }
    }
    termAtt.setLength(upto);
    final String str = termAtt.toString();
    termAtt.setEmpty();

    final StringBuilder pinyin = new StringBuilder();
    final StringBuilder initials = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      final char c = str.charAt(i);
      if (c < 128) {
        pinyin.append(c);
        continue;
      }
      try {
        final String[] candidates = PinyinHelper.toHanyuPinyinStringArray(c, format);
        if (candidates == null) {
          continue;
        }
        // get first result by default
        // TODO more than one pinyin
        final String chosen = candidates[0];
        pinyin.append(chosen);
        if (this.padding_char.length() > 0) {
          pinyin.append(this.padding_char);
        }
        initials.append(chosen.charAt(0));
      } catch (BadHanyuPinyinOutputFormatCombination badFormat) {
        badFormat.printStackTrace();
      }
    }

    // let's join them
    if (first_letter.equals("prefix")) {
      termAtt.append(initials.toString());
      if (this.padding_char.length() > 0) {
        termAtt.append(this.padding_char); // TODO splitter
      }
      termAtt.append(pinyin.toString());
    } else if (first_letter.equals("append")) {
      termAtt.append(pinyin.toString());
      // Do not double the padding when the pinyin part already ends with it.
      if (this.padding_char.length() > 0 && !pinyin.toString().endsWith(this.padding_char)) {
        termAtt.append(this.padding_char);
      }
      termAtt.append(initials.toString());
    } else if (first_letter.equals("none")) {
      termAtt.append(pinyin.toString());
    } else if (first_letter.equals("only")) {
      termAtt.append(initials.toString());
    }

    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }