private static String[] groupTokens(Analyzer analyzer, String input) throws IOException { if (Resources.debug) { Resources.LOGGER.debug("TokenParser:" + input); Resources.LOGGER.debug("Analyzer:" + analyzer.getClass()); } TokenStream tokenStream = analyzer.tokenStream("input", input); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; List<TermInfo> infos = new ArrayList<TermInfo>(); while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if (increment > 0) { position = position + increment; if (Resources.debug) { Resources.LOGGER.debug(position + ":"); } } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); TermInfo info = new TermInfo(); info.setStart(startOffset); info.setEnd(endOffset); infos.add(info); if (Resources.debug) { Resources.LOGGER.debug( "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type()); } } tokenStream.end(); tokenStream.close(); Stack<TermInfo> tiStack = groupTokenInfos(infos); List<String> terms = new ArrayList<String>(); while (!tiStack.isEmpty()) { TermInfo termInfo = tiStack.pop(); if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) { String term = input.substring(termInfo.getStart(), termInfo.getEnd()); terms.add(term); } } return terms.toArray(new String[] {}); }
private static String[] mmsegTokens(Analyzer analyzer, String input) throws IOException { if (Resources.debug) { Resources.LOGGER.debug("TokenParser:" + input); Resources.LOGGER.debug("Analyzer:" + analyzer.getClass()); } TokenStream tokenStream = analyzer.tokenStream("input", input); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; List<String> tokens = new ArrayList<String>(); while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if (increment > 0) { position = position + increment; if (Resources.debug) { Resources.LOGGER.debug(position + ":"); } } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); tokens.add(term); if (Resources.debug) { Resources.LOGGER.debug( "[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type()); } } tokenStream.end(); tokenStream.close(); return tokens.toArray(new String[] {}); }