private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
  final Token t = new Token(term, startOffset, endOffset);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}
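/*
 * Minimal usage sketch for the helper above, assuming Lucene's test-framework
 * CannedTokenStream is available on the classpath; the method name, terms,
 * increments and offsets are illustrative assumptions, not taken from the
 * original tests.
 */
private static TokenStream cannedGraphExample() {
  // "wifi" spans two positions and is stacked over the "wi" + "fi" path.
  return new CannedTokenStream(
      token("wifi", 1, 2, 0, 4),
      token("wi", 0, 1, 0, 2),
      token("fi", 1, 1, 2, 4));
}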
@Test
public void testCountPositions() throws IOException {
  // We're looking to make sure that we:
  Token t1 = new Token(); // Don't count tokens without an increment
  t1.setPositionIncrement(0);
  Token t2 = new Token();
  t2.setPositionIncrement(1); // Count normal tokens with one increment
  Token t3 = new Token();
  t3.setPositionIncrement(2); // Count funny tokens with more than one increment
  int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
  Token[] tokens = new Token[] {t1, t2, t3};
  Collections.shuffle(Arrays.asList(tokens), getRandom());
  TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
  assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
}
private void fillTokens() throws IOException {
  final StringBuilder sb = new StringBuilder();
  final char[] buffer = new char[256];
  while (true) {
    final int count = input.read(buffer);
    if (count == -1) {
      break;
    }
    sb.append(buffer, 0, count);
    // System.out.println("got count=" + count);
  }
  // System.out.println("fillTokens: " + sb);
  inputLength = sb.length();

  final String[] parts = sb.toString().split(" ");

  tokens = new ArrayList<Token>();
  int pos = 0;
  int maxPos = -1;
  int offset = 0;
  // System.out.println("again");
  for (String part : parts) {
    final String[] overlapped = part.split("/");
    boolean firstAtPos = true;
    int minPosLength = Integer.MAX_VALUE;
    for (String part2 : overlapped) {
      final int colonIndex = part2.indexOf(':');
      final String token;
      final int posLength;
      if (colonIndex != -1) {
        token = part2.substring(0, colonIndex);
        posLength = Integer.parseInt(part2.substring(1 + colonIndex));
      } else {
        token = part2;
        posLength = 1;
      }
      maxPos = Math.max(maxPos, pos + posLength);
      minPosLength = Math.min(minPosLength, posLength);
      final Token t = new Token(token, offset, offset + 2 * posLength - 1);
      t.setPositionLength(posLength);
      t.setPositionIncrement(firstAtPos ? 1 : 0);
      firstAtPos = false;
      // System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
      tokens.add(t);
    }
    pos += minPosLength;
    offset = 2 * pos;
  }
  assert maxPos <= pos : "input string mal-formed: posLength>1 tokens hang over the end";
}
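/*
 * Input format accepted by fillTokens above, as read from its parsing logic
 * (the example string is an illustrative assumption): tokens are separated by
 * single spaces, '/' stacks tokens at the same position, and ':n' gives a
 * token a positionLength of n.
 *
 *   "a b/c:2 d"  ->  a at position 0; b and c stacked at position 1, with c
 *                    spanning 2 positions; d at position 2. Character offsets
 *                    are synthesized as twice the position.
 */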
/**
 * For languages with canonical form
 *
 * @return canonical token (or null if none)
 */
public Token canonizeToken(Token t) {
  if (!hasCanonicalFilter) return null;
  if (lang.equals("sr")) {
    String nt = new SerbianFilter(null).convert(t.termText());
    if (!t.termText().equals(nt)) {
      Token tt = new Token(nt, t.startOffset(), t.endOffset());
      tt.setPositionIncrement(0);
      tt.setType("alias");
      return tt;
    }
  }
  return null;
}
/**
 * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
 * the tokens end up at the same position.
 *
 * <p>Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same
 * position). Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a
 * has posInc=n)
 */
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
  ArrayList<Token> result = new ArrayList<Token>();
  if (lst1 == null || lst2 == null) {
    if (lst2 != null) result.addAll(lst2);
    if (lst1 != null) result.addAll(lst1);
    return result;
  }

  int pos = 0;
  Iterator<Token> iter1 = lst1.iterator();
  Iterator<Token> iter2 = lst2.iterator();
  Token tok1 = iter1.hasNext() ? iter1.next() : null;
  Token tok2 = iter2.hasNext() ? iter2.next() : null;
  int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
  int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
  while (tok1 != null || tok2 != null) {
    while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
      Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
      tok.copyBuffer(tok1.buffer(), 0, tok1.length());
      tok.setPositionIncrement(pos1 - pos);
      result.add(tok);
      pos = pos1;
      tok1 = iter1.hasNext() ? iter1.next() : null;
      pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
    }
    while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
      Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
      tok.copyBuffer(tok2.buffer(), 0, tok2.length());
      tok.setPositionIncrement(pos2 - pos);
      result.add(tok);
      pos = pos2;
      tok2 = iter2.hasNext() ? iter2.next() : null;
      pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
    }
  }
  return result;
}
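/*
 * Usage sketch mirroring the first javadoc example above; it assumes a
 * token(term, posInc, posLength) helper like the one at the end of this
 * section, and the method name is illustrative. Merging [a b] with [c d]
 * stacks a/c at one position and b/d at the next; the returned list order is
 * a(+1), c(+0), d(+1), b(+0).
 */
private static List<Token> mergeExample() {
  List<Token> left = Arrays.asList(token("a", 1, 1), token("b", 1, 1));
  List<Token> right = Arrays.asList(token("c", 1, 1), token("d", 1, 1));
  return mergeTokens(left, right);
}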
private void splitIntoTokens() {
  String term = termAtt.term();
  String[] termParts = splitTerm(term);
  if (termParts.length > 1) {
    int termPos = offsetAtt.startOffset();
    for (int i = 0; i < termParts.length; i++) {
      String termPart = termParts[i];
      int termPartPos = termPos + term.indexOf(termPart);
      int termPartEndPos = termPartPos + termPart.length();
      Token newToken = new Token(termPart, termPartPos, termPartEndPos);
      newToken.setPositionIncrement(0); // in the same position
      tokens.add(newToken);
    }
  }
}
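/*
 * Hypothetical splitTerm for illustration only; the real delimiter logic is
 * not shown in the original. Splitting on non-alphanumeric characters would,
 * for example, turn "wi-fi" into ["wi", "fi"], which splitIntoTokens then
 * re-emits as zero-increment tokens stacked at the original position.
 */
private String[] splitTerm(String term) {
  return term.split("[^\\p{Alnum}]+");
}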
private static Token token(String term, int posInc, int posLength) {
  final Token t = new Token(term, 0, 0);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}