public Token next() throws IOException { Token candidate; while ((candidate = baseTokeniser.next()) != null) { try { Integer integer = Integer.valueOf(candidate.termText()); String valueString = NumericEncoder.encode(integer.intValue()); Token integerToken = new Token( valueString, candidate.startOffset(), candidate.startOffset(), candidate.type()); return integerToken; } catch (NumberFormatException e) { // just ignore and try the next one } } return null; }
/**
 * Standard-filter pass: strips a trailing {@code 's}/{@code 'S} from apostrophe
 * tokens and removes the dots from acronym tokens; every other token is
 * returned unchanged.
 *
 * @return the (possibly rewritten) next token, or {@code null} at end of stream
 * @throws IOException if the wrapped stream fails
 */
public final Token next() throws IOException {
  Token token = this.input.next();
  if (token == null) {
    return null;
  }
  String text = token.termText();
  String type = token.type();
  // NOTE: type strings are interned constants in this analyzer, so the
  // identity comparison (==) is the established idiom here.
  if (type == APOSTROPHE_TYPE && (text.endsWith("'s") || text.endsWith("'S"))) {
    // Drop the two-character possessive suffix, keeping the original offsets.
    return new Token(
        text.substring(0, text.length() - 2), token.startOffset(), token.endOffset(), type);
  }
  if (type == ACRONYM_TYPE) {
    // Rebuild the term without any '.' characters.
    StringBuffer stripped = new StringBuffer();
    for (int i = 0; i < text.length(); i++) {
      char ch = text.charAt(i);
      if (ch != '.') {
        stripped.append(ch);
      }
    }
    return new Token(stripped.toString(), token.startOffset(), token.endOffset(), type);
  }
  return token;
}
/**
 * Pulls the next token via {@code nextToken(reusableToken)} and mirrors its
 * term buffer, offsets and type into this stream's attributes.
 *
 * @return {@code true} if a token was produced; {@code false} at end of stream
 *     (after calling {@code end()})
 * @throws IOException if token production fails
 */
public boolean incrementToken() throws IOException {
  clearAttributes();
  final Token token = nextToken(reusableToken);
  if (token == null) {
    // Exhausted: finalize the stream state.
    end();
    return false;
  }
  termAtt.copyBuffer(token.buffer(), 0, token.length());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
  typeAtt.setType(token.type());
  return true;
}
/**
 * For languages with canonical form.
 *
 * @param t the token to canonicalize
 * @return canonical token at the same position (posInc 0, type "alias"), or
 *     {@code null} if there is no canonical filter or the form is unchanged
 */
public Token canonizeToken(Token t) {
  if (!hasCanonicalFilter) return null;
  if (lang.equals("sr")) {
    String nt = new SerbianFilter(null).convert(t.termText());
    // BUG FIX: the original wrote !t.equals(nt), comparing a Token against a
    // String — that is never equal, so an alias token was emitted even when
    // conversion changed nothing. Compare the term texts instead.
    if (!t.termText().equals(nt)) {
      Token tt = new Token(nt, t.startOffset(), t.endOffset());
      tt.setPositionIncrement(0); // stacked at the same position as the original
      tt.setType("alias");
      return tt;
    }
  }
  return null;
}
/**
 * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
 * the tokens end up at the same position.
 *
 * <p>Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same
 * position) Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a
 * has posInc=n)
 *
 * @param lst1 first token list (may be null)
 * @param lst2 second token list (may be null)
 * @return a new list containing copies of the tokens from both inputs, ordered by absolute
 *     position, with position increments rewritten relative to the previously emitted token
 */
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
  ArrayList<Token> result = new ArrayList<Token>();
  // If either list is null, the other (if present) is returned as-is — no merging needed.
  if (lst1 == null || lst2 == null) {
    if (lst2 != null) result.addAll(lst2);
    if (lst1 != null) result.addAll(lst1);
    return result;
  }
  // pos  = absolute position of the last token emitted into result.
  // pos1 = absolute position of tok1 (cumulative sum of lst1's increments).
  // pos2 = absolute position of tok2 (cumulative sum of lst2's increments).
  int pos = 0;
  Iterator<Token> iter1 = lst1.iterator();
  Iterator<Token> iter2 = lst2.iterator();
  Token tok1 = iter1.hasNext() ? iter1.next() : null;
  Token tok2 = iter2.hasNext() ? iter2.next() : null;
  int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
  int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
  while (tok1 != null || tok2 != null) {
    // Drain lst1 while its current token is at or before lst2's current
    // position (ties favor lst1), or lst2 is exhausted.
    while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
      // Copy the token; its increment becomes the delta from the last emitted position.
      Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
      tok.copyBuffer(tok1.buffer(), 0, tok1.length());
      tok.setPositionIncrement(pos1 - pos);
      result.add(tok);
      pos = pos1;
      tok1 = iter1.hasNext() ? iter1.next() : null;
      // Advance lst1's absolute position by the next token's increment.
      pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
    }
    // Symmetric drain of lst2 (ties favor lst2 here, but a tie was already
    // consumed from lst1 above, so pos1 has moved past pos2 by then).
    while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
      Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
      tok.copyBuffer(tok2.buffer(), 0, tok2.length());
      tok.setPositionIncrement(pos2 - pos);
      result.add(tok);
      pos = pos2;
      tok2 = iter2.hasNext() ? iter2.next() : null;
      pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
    }
  }
  return result;
}
/**
 * Emits the next pre-built token from the {@code tokens} array, copying all of
 * its attributes (term, offsets, position increment, flags, type, payload).
 *
 * @return {@code true} until the array is exhausted, then {@code false}
 */
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length) {
    return false;
  }
  clearAttributes();
  final Token tok = tokens[index++];
  termAtt.setEmpty().append(tok);
  offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
  posIncAtt.setPositionIncrement(tok.getPositionIncrement());
  flagsAtt.setFlags(tok.getFlags());
  typeAtt.setType(tok.type());
  payloadAtt.setPayload(tok.getPayload());
  return true;
}
/**
 * Queues a sub-token sliced out of {@code oriToken}'s term buffer, with offsets
 * shifted so they stay relative to the original token's start.
 *
 * @param oriToken source token whose buffer is sliced
 * @param termBufferOffset start of the slice within the source buffer
 * @param termBufferLength length of the slice
 * @param type character type; decimal digits map to {@code Word.TYPE_DIGIT},
 *     everything else to {@code Word.TYPE_LETTER}
 */
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
  // Absolute start offset of the slice in the original input.
  final int sliceStart = oriToken.startOffset() + termBufferOffset;
  Token token =
      new Token(
          oriToken.buffer(),
          termBufferOffset,
          termBufferLength,
          sliceStart,
          sliceStart + termBufferLength);
  token.setType(
      type == Character.DECIMAL_DIGIT_NUMBER ? Word.TYPE_DIGIT : Word.TYPE_LETTER);
  tokenQueue.offer(token);
}
@Override public boolean incrementToken() { if (upto < tokens.length) { final Token token = tokens[upto++]; // TODO: can we just capture/restoreState so // we get all attrs...? clearAttributes(); termAtt.setEmpty(); termAtt.append(token.toString()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); posLengthAtt.setPositionLength(token.getPositionLength()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); payloadAtt.setPayload(token.getPayload()); return true; } else { return false; } }
@Override public boolean incrementToken() throws IOException { if (tokens == null) { fillTokens(); } // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size()); if (upto == tokens.size()) { // System.out.println(" END @ " + tokens.size()); return false; } final Token t = tokens.get(upto++); // System.out.println(" return token=" + t); clearAttributes(); termAtt.append(t.toString()); offsetAtt.setOffset(t.startOffset(), t.endOffset()); posIncrAtt.setPositionIncrement(t.getPositionIncrement()); posLengthAtt.setPositionLength(t.getPositionLength()); return true; }
/**
 * Test filter: rewrites any token "phrase" into two tokens "phrase1"/"phrase2"
 * at the original offsets (emitted across two calls), and drops tokens "stop".
 * All other tokens pass through unchanged.
 *
 * @param reusableToken token instance reused for output
 * @return the next token, or {@code null} at end of stream
 * @throws IOException if the wrapped stream fails
 */
public Token next(Token reusableToken) throws IOException {
  Token token = reusableToken;
  if (inPhrase) {
    // Second half of a previously seen "phrase": emit at the saved offsets.
    inPhrase = false;
    token.setTermBuffer("phrase2");
    token.setStartOffset(savedStart);
    token.setEndOffset(savedEnd);
    return reusableToken;
  }
  while ((token = this.input.next(reusableToken)) != null) {
    if (token.term().equals("phrase")) {
      // Remember the offsets so "phrase2" can reuse them on the next call.
      inPhrase = true;
      savedStart = token.startOffset();
      savedEnd = token.endOffset();
      token.setTermBuffer("phrase1");
      token.setStartOffset(savedStart);
      token.setEndOffset(savedEnd);
      return token;
    }
    if (!token.term().equals("stop")) {
      return token;
    }
    // "stop" tokens are silently discarded; keep pulling.
  }
  return null;
}
/**
 * Returns the next token in the stream, or null at EOS.
 *
 * <p>Removes <tt>'s</tt> from the end of words.
 *
 * <p>Removes dots from acronyms.
 *
 * <p>Splits host names into their dot-separated parts, queued in
 * {@code hostTokens} and emitted one per call; a host whose parts are all
 * single characters is treated as an acronym instead (dots removed, single
 * token).
 */
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
  // hostTokens == null means no host parts are pending: pull a fresh token.
  if (hostTokens == null) {
    org.apache.lucene.analysis.Token t = input.next();
    if (t == null) return null;
    String text = t.termText();
    String type = t.type();
    // NOTE: type constants are compared by identity (==) — the established
    // idiom for interned token-type strings in this analyzer.
    if (type == APOSTROPHE_TYPE && // remove 's
        (text.endsWith("'s") || text.endsWith("'S"))) {
      return new org.apache.lucene.analysis.Token(
          text.substring(0, text.length() - 2), t.startOffset(), t.endOffset(), type);
    } else if (type == ACRONYM_TYPE) { // remove dots
      StringBuffer trimmed = new StringBuffer();
      for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c != '.') trimmed.append(c);
      }
      return new org.apache.lucene.analysis.Token(
          trimmed.toString(), t.startOffset(), t.endOffset(), type);
    } else if (type == HOST_TYPE) {
      // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
      // There must be at least two tokens ....
      hostTokens = new LinkedList<org.apache.lucene.analysis.Token>();
      StringTokenizer tokeniser = new StringTokenizer(text, ".");
      // Each part's offsets are computed by walking the original text;
      // the +1 after each part accounts for the '.' separator.
      int start = t.startOffset();
      int end;
      while (tokeniser.hasMoreTokens()) {
        String token = tokeniser.nextToken();
        end = start + token.length();
        hostTokens.offer(new org.apache.lucene.analysis.Token(token, start, end, ALPHANUM_TYPE));
        start = end + 1;
      }
      // check if we have an acronym ..... yes a.b.c ends up here ...
      // Length n parts of 1 char each + (n-1) dots == 2n-1 characters, so this
      // equality holds exactly when every part is a single character.
      if (text.length() == hostTokens.size() * 2 - 1) {
        hostTokens = null;
        // acronym: emit one token with the dots stripped instead of the parts.
        StringBuffer trimmed = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
          char c = text.charAt(i);
          if (c != '.') trimmed.append(c);
        }
        return new org.apache.lucene.analysis.Token(
            trimmed.toString(), t.startOffset(), t.endOffset(), ALPHANUM_TYPE);
      } else {
        // Emit the first host part now; the rest drain on subsequent calls.
        return hostTokens.remove();
      }
    } else {
      return t;
    }
  } else {
    // Drain the pending host parts; clear the queue once empty so the next
    // call pulls from the underlying stream again.
    org.apache.lucene.analysis.Token token = hostTokens.remove();
    if (hostTokens.isEmpty()) {
      hostTokens = null;
    }
    return token;
  }
}
/**
 * Mirrors the given token's term buffer, position increment and offsets into
 * this stream's attributes.
 *
 * @param token source token to copy from
 */
private void applyToken(Token token) {
  final char[] buffer = token.termBuffer();
  termAtt.setTermBuffer(buffer, 0, token.termLength());
  posAtt.setPositionIncrement(token.getPositionIncrement());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
}