private static String[] groupTokens(Analyzer analyzer, String input) throws IOException { if (Resources.debug) { Resources.LOGGER.debug("TokenParser:" + input); Resources.LOGGER.debug("Analyzer:" + analyzer.getClass()); } TokenStream tokenStream = analyzer.tokenStream("input", input); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; List<TermInfo> infos = new ArrayList<TermInfo>(); while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if (increment > 0) { position = position + increment; if (Resources.debug) { Resources.LOGGER.debug(position + ":"); } } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); TermInfo info = new TermInfo(); info.setStart(startOffset); info.setEnd(endOffset); infos.add(info); if (Resources.debug) { Resources.LOGGER.debug( "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type()); } } tokenStream.end(); tokenStream.close(); Stack<TermInfo> tiStack = groupTokenInfos(infos); List<String> terms = new ArrayList<String>(); while (!tiStack.isEmpty()) { TermInfo termInfo = tiStack.pop(); if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) { String term = input.substring(termInfo.getStart(), termInfo.getEnd()); terms.add(term); } } return terms.toArray(new String[] {}); }
private static Stack<TermInfo> groupTokenInfos(List<TermInfo> infos) { List<TermInfo> tis = new ArrayList<TermInfo>(); for (TermInfo info : infos) { Integer len = info.getEnd() - info.getStart(); if (len == 1) { tis.add(info); } } Stack<TermInfo> tiStack = new Stack<TermInfo>(); for (Integer index = 0; index < tis.size(); index++) { TermInfo info = tis.get(index); TermInfo plus = info; for (Integer next = index + 1; next < tis.size(); next++) { TermInfo nextInfo = tis.get(next); if (plus == null) { plus = nextInfo; continue; } plus = plus.plus(nextInfo); if (plus != null) { if (tiStack.isEmpty()) { tiStack.push(plus); } else if (!tiStack.contains(plus)) { tiStack.push(plus); } } } } return tiStack; }