/**
 * Returns the next token, or null at end of stream.
 *
 * <p>Tokens typed as mixed letter/digit are split into maximal runs of a single
 * character category (e.g. "abc123" becomes "abc" and "123"); the fragments are
 * produced via {@code addToken} and, judging by the polls below, presumably land
 * in {@code tokenQueue} — TODO confirm against addToken's implementation.
 * Fragments queued by a previous call are served before reading new input.
 */
private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;
    // First serve any fragments left over from the previous call's split.
    Token nextToken = tokenQueue.poll();
    if (nextToken != null) {
        return nextToken;
    }
    /*// inc has already been called inside TokenUtils.nextToken
    if(!input.incrementToken()) {
        return null;
    }*/
    /*TermAttribute termAtt = (TermAttribute)input.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute)input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute)input.getAttribute(TypeAttribute.class);
    nextToken = reusableToken.reinit(termAtt.termBuffer(), 0, termAtt.termLength(), offsetAtt.startOffset(), offsetAtt.endOffset(), typeAtt.type());*/
    nextToken = TokenUtils.nextToken(input, reusableToken);
    // Only mixed letter/digit tokens need splitting; everything else passes through.
    if (nextToken != null
            && (Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
                || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
        final char[] buffer = nextToken.buffer();
        final int length = nextToken.length();
        byte lastType = (byte) Character.getType(buffer[0]); // category of the previous character, used to detect run boundaries
        int termBufferOffset = 0; // start of the current run within buffer
        int termBufferLength = 0; // length of the current run
        for (int i = 0; i < length; i++) {
            byte type = (byte) Character.getType(buffer[i]);
            if (type <= Character.MODIFIER_LETTER) {
                // Collapse all letter categories (UPPERCASE..MODIFIER_LETTER) into one,
                // so a case change does not start a new run.
                type = Character.LOWERCASE_LETTER;
            }
            if (type != lastType) { // category differs from the previous character: emit the finished run
                addToken(nextToken, termBufferOffset, termBufferLength, lastType);
                termBufferOffset += termBufferLength;
                termBufferLength = 0;
                lastType = type;
            }
            termBufferLength++;
        }
        if (termBufferLength > 0) { // flush the final run
            addToken(nextToken, termBufferOffset, termBufferLength, lastType);
        }
        nextToken = tokenQueue.poll();
    }
    return nextToken;
}
/**
 * Advances this stream to the next token.
 *
 * <p>Copies the term text, offsets, and type of the token produced by
 * {@code nextToken} onto this stream's attributes.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 *     (in which case {@code end()} has been invoked)
 * @throws IOException if reading from the wrapped input fails
 */
public boolean incrementToken() throws IOException {
    clearAttributes();
    final Token next = nextToken(reusableToken);
    if (next == null) {
        // Exhausted: finalize the stream before reporting EOS.
        end();
        return false;
    }
    termAtt.copyBuffer(next.buffer(), 0, next.length());
    offsetAtt.setOffset(next.startOffset(), next.endOffset());
    typeAtt.setType(next.type());
    return true;
}
/**
 * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
 * the tokens end up at the same position.
 *
 * <p>Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same
 * position) Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a
 * has posInc=n)
 */
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    ArrayList<Token> result = new ArrayList<Token>();
    // If either list is absent there is nothing to interleave — copy the other as-is.
    if (lst1 == null || lst2 == null) {
        if (lst2 != null) result.addAll(lst2);
        if (lst1 != null) result.addAll(lst1);
        return result;
    }
    int pos = 0; // absolute position of the most recently emitted token
    Iterator<Token> iter1 = lst1.iterator();
    Iterator<Token> iter2 = lst2.iterator();
    Token tok1 = iter1.hasNext() ? iter1.next() : null;
    Token tok2 = iter2.hasNext() ? iter2.next() : null;
    // pos1/pos2 hold the ABSOLUTE position of tok1/tok2, i.e. the running sum of
    // position increments within each source list.
    int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
    int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
    while (tok1 != null || tok2 != null) {
        // Emit tokens from lst1 while they sit at or before the pending lst2 token
        // (or unconditionally once lst2 is exhausted).
        while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
            // Clone the token so the source lists are never mutated.
            Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
            tok.copyBuffer(tok1.buffer(), 0, tok1.length());
            // Re-express the absolute position as an increment relative to the last emitted token;
            // equal absolute positions yield increment 0, stacking tokens at the same position.
            tok.setPositionIncrement(pos1 - pos);
            result.add(tok);
            pos = pos1;
            tok1 = iter1.hasNext() ? iter1.next() : null;
            pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
        }
        // Symmetrically emit tokens from lst2 while they sit at or before the pending lst1 token.
        while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
            Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
            tok.copyBuffer(tok2.buffer(), 0, tok2.length());
            tok.setPositionIncrement(pos2 - pos);
            result.add(tok);
            pos = pos2;
            tok2 = iter2.hasNext() ? iter2.next() : null;
            pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
        }
    }
    return result;
}
/**
 * Emits the next buffered token, copying its full state (term, offsets,
 * position increment, flags, type, payload) onto this stream's attributes.
 *
 * @return {@code true} while buffered tokens remain, {@code false} afterwards
 * @throws IOException declared for TokenStream compatibility
 */
@Override
public boolean incrementToken() throws IOException {
    // End of stream once every buffered token has been served.
    if (index >= tokens.length) {
        return false;
    }
    clearAttributes();
    final Token current = tokens[index++];
    // Mirror the buffered token's state onto the stream attributes.
    termAtt.setEmpty().append(current);
    offsetAtt.setOffset(current.startOffset(), current.endOffset());
    posIncAtt.setPositionIncrement(current.getPositionIncrement());
    flagsAtt.setFlags(current.getFlags());
    typeAtt.setType(current.type());
    payloadAtt.setPayload(current.getPayload());
    return true;
}
public Token next() throws IOException { Token candidate; while ((candidate = baseTokeniser.next()) != null) { try { Integer integer = Integer.valueOf(candidate.termText()); String valueString = NumericEncoder.encode(integer.intValue()); Token integerToken = new Token( valueString, candidate.startOffset(), candidate.startOffset(), candidate.type()); return integerToken; } catch (NumberFormatException e) { // just ignore and try the next one } } return null; }
/**
 * Returns the next token with standard normalisations applied: strips a
 * trailing {@code 's}/{@code 'S} from APOSTROPHE-typed tokens and removes
 * the dots from ACRONYM-typed tokens; all other tokens pass through
 * unchanged. Returns null at end of stream.
 *
 * @return the normalised next token, or {@code null} at end of stream
 * @throws IOException if the wrapped input fails
 */
public final Token next() throws IOException {
    Token token = this.input.next();
    if (token == null) {
        return null;
    }
    String text = token.termText();
    String type = token.type();
    // Fix: compare type strings with equals() — the original '==' only worked
    // while the tokenizer interned its type constants, which is fragile.
    if (APOSTROPHE_TYPE.equals(type) && (text.endsWith("'s") || text.endsWith("'S"))) {
        // Drop the two-character possessive suffix; offsets keep the original span.
        return new Token(
            text.substring(0, text.length() - 2), token.startOffset(), token.endOffset(), type);
    }
    if (ACRONYM_TYPE.equals(type)) {
        // Strip the dots: "a.b.c." -> "abc". StringBuilder replaces the
        // needlessly synchronized StringBuffer.
        StringBuilder trimmed = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c != '.') {
                trimmed.append(c);
            }
        }
        return new Token(trimmed.toString(), token.startOffset(), token.endOffset(), type);
    }
    return token;
}
/**
 * Returns the next token in the stream, or null at EOS.
 *
 * <p>Removes <tt>'s</tt> from the end of words.
 *
 * <p>Removes dots from acronyms.
 *
 * <p>Splits host names into one token per dot-separated part, emitted one per
 * call via the {@code hostTokens} queue; a dotted string whose parts are all
 * single characters is treated as an acronym instead.
 */
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
    // hostTokens != null means a previous call split a host name and queued
    // its parts; serve those before reading more input.
    if (hostTokens == null) {
        org.apache.lucene.analysis.Token t = input.next();
        if (t == null) return null;
        String text = t.termText();
        String type = t.type();
        // NOTE(review): '==' on the type strings relies on the tokenizer
        // interning its type constants — verify against the tokenizer source.
        if (type == APOSTROPHE_TYPE && // remove 's
            (text.endsWith("'s") || text.endsWith("'S"))) {
            // Drop the two-character possessive suffix; offsets keep the original span.
            return new org.apache.lucene.analysis.Token(
                text.substring(0, text.length() - 2), t.startOffset(), t.endOffset(), type);
        } else if (type == ACRONYM_TYPE) { // remove dots
            StringBuffer trimmed = new StringBuffer();
            for (int i = 0; i < text.length(); i++) {
                char c = text.charAt(i);
                if (c != '.') trimmed.append(c);
            }
            return new org.apache.lucene.analysis.Token(
                trimmed.toString(), t.startOffset(), t.endOffset(), type);
        } else if (type == HOST_TYPE) {
            // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
            // There must be at least two tokens ....
            // Split "www.example.com" into one ALPHANUM token per dot-separated
            // part, preserving character offsets into the original text.
            hostTokens = new LinkedList<org.apache.lucene.analysis.Token>();
            StringTokenizer tokeniser = new StringTokenizer(text, ".");
            int start = t.startOffset();
            int end;
            while (tokeniser.hasMoreTokens()) {
                String token = tokeniser.nextToken();
                end = start + token.length();
                hostTokens.offer(new org.apache.lucene.analysis.Token(token, start, end, ALPHANUM_TYPE));
                start = end + 1; // +1 skips the '.' separator
            }
            // check if we have an acronym ..... yes a.b.c ends up here ...
            // length == parts*2-1 holds exactly when every part is one character
            // (n chars + n-1 dots), i.e. the "host" is really an acronym like a.b.c.
            if (text.length() == hostTokens.size() * 2 - 1) {
                hostTokens = null;
                // acronym
                StringBuffer trimmed = new StringBuffer();
                for (int i = 0; i < text.length(); i++) {
                    char c = text.charAt(i);
                    if (c != '.') trimmed.append(c);
                }
                return new org.apache.lucene.analysis.Token(
                    trimmed.toString(), t.startOffset(), t.endOffset(), ALPHANUM_TYPE);
            } else {
                return hostTokens.remove();
            }
        } else {
            return t;
        }
    } else {
        // Serve the queued host parts one per call; release the queue when empty
        // so the next call reads fresh input.
        org.apache.lucene.analysis.Token token = hostTokens.remove();
        if (hostTokens.isEmpty()) {
            hostTokens = null;
        }
        return token;
    }
}