private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
  final Token t = new Token(term, startOffset, endOffset);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}
public void testSigSnameFragment() throws Exception {
  Map<String, String> mapFqn = new HashMap<String, String>();
  mapFqn.put("extractSig", "1");
  mapFqn.put("shortNamesOnly", "1");
  fqnFactory.init(mapFqn);
  // note .. there is no whitespace here
  is = new WhitespaceTokenizer(
      new StringReader(
          "org.wonderly.ham.speech.DB23Announcements.connected(_UNRESOLVED_.Parameters<?java.util.List<java.lang.String>>,java.lang.String)"));
  FqnFilter fqnf = fqnFactory.create(is);
  RemoveSigOrderFilterFactory rsoff = new RemoveSigOrderFilterFactory();
  DelimiterFilter rsof = rsoff.create(fqnf);
  FragmentFilterFactory fff = new FragmentFilterFactory();
  FragmentFilter ff = fff.create(rsof);
  final Token reusableToken = new Token();
  Token nextToken = ff.next(reusableToken);
  assertEquals("Parameters", nextToken.term());
  nextToken = ff.next(reusableToken);
  assertEquals("String", nextToken.term());
  assertNull(ff.next(reusableToken));
}
public UserQuery(String uid, String q) throws IOException {
  setUid(uid);
  setQuery(q);
  ArrayList<String> qTerms = new ArrayList<String>();
  StandardAnalyzer analyzer = new StandardAnalyzer();
  TokenStream stream = analyzer.tokenStream("query", new StringReader(q));
  boolean hasTokens = true;
  while (hasTokens) {
    Token t = stream.next();
    if (t == null) {
      hasTokens = false;
    } else {
      qTerms.add(new String(t.termBuffer(), 0, t.termLength()));
    }
  }
  queryTerms = qTerms.toArray(new String[qTerms.size()]);
}
@Override
public Token clone() {
  final Token t = (Token) super.clone();
  if (payload != null) {
    t.payload = payload.clone();
  }
  return t;
}
private Token nextToken(Token reusableToken) throws IOException {
  assert reusableToken != null;
  // First drain any tokens left over from the previous call.
  Token nextToken = tokenQueue.poll();
  if (nextToken != null) {
    return nextToken;
  }
  // TokenUtils.nextToken already calls input.incrementToken() and copies the
  // term/offset/type attributes into reusableToken, so none of that is repeated here.
  nextToken = TokenUtils.nextToken(input, reusableToken);
  if (nextToken != null
      && (Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
          || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
    final char[] buffer = nextToken.buffer();
    final int length = nextToken.length();
    byte lastType = (byte) Character.getType(buffer[0]); // type of the previous character
    int termBufferOffset = 0;
    int termBufferLength = 0;
    for (int i = 0; i < length; i++) {
      byte type = (byte) Character.getType(buffer[i]);
      if (type <= Character.MODIFIER_LETTER) {
        type = Character.LOWERCASE_LETTER;
      }
      if (type != lastType) {
        // character type changed: emit the run accumulated so far
        addToken(nextToken, termBufferOffset, termBufferLength, lastType);
        termBufferOffset += termBufferLength;
        termBufferLength = 0;
        lastType = type;
      }
      termBufferLength++;
    }
    if (termBufferLength > 0) {
      // flush the final run
      addToken(nextToken, termBufferOffset, termBufferLength, lastType);
    }
    nextToken = tokenQueue.poll();
  }
  return nextToken;
}
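// Illustrative example (not from the original source, inferred from the type-run
// splitting above): a mixed token such as "mp3" of type TYPE_LETTER_OR_DIGIT is split
// on character-type boundaries into "mp" (TYPE_LETTER) and "3" (TYPE_DIGIT).
// addToken(...) queues each run, and subsequent calls drain tokenQueue before reading
// more input from the wrapped stream.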
private void fillTokens() throws IOException {
  final StringBuilder sb = new StringBuilder();
  final char[] buffer = new char[256];
  while (true) {
    final int count = input.read(buffer);
    if (count == -1) {
      break;
    }
    sb.append(buffer, 0, count);
    // System.out.println("got count=" + count);
  }
  // System.out.println("fillTokens: " + sb);
  inputLength = sb.length();
  final String[] parts = sb.toString().split(" ");
  tokens = new ArrayList<Token>();
  int pos = 0;
  int maxPos = -1;
  int offset = 0;
  // System.out.println("again");
  for (String part : parts) {
    final String[] overlapped = part.split("/");
    boolean firstAtPos = true;
    int minPosLength = Integer.MAX_VALUE;
    for (String part2 : overlapped) {
      final int colonIndex = part2.indexOf(':');
      final String token;
      final int posLength;
      if (colonIndex != -1) {
        token = part2.substring(0, colonIndex);
        posLength = Integer.parseInt(part2.substring(1 + colonIndex));
      } else {
        token = part2;
        posLength = 1;
      }
      maxPos = Math.max(maxPos, pos + posLength);
      minPosLength = Math.min(minPosLength, posLength);
      final Token t = new Token(token, offset, offset + 2 * posLength - 1);
      t.setPositionLength(posLength);
      t.setPositionIncrement(firstAtPos ? 1 : 0);
      firstAtPos = false;
      // System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
      tokens.add(t);
    }
    pos += minPosLength;
    offset = 2 * pos;
  }
  assert maxPos <= pos : "input string mal-formed: posLength>1 tokens hang over the end";
}
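// Input-format note (inferred from the parsing above, not stated in the original):
// the reader supplies space-separated positions; alternatives at one position are
// joined with '/', and an optional ":n" suffix sets a token's position length.
// For example, "fast wi/wifi:2 fi" yields "fast", then "wi" and "wifi" starting at
// the same position (with "wifi" spanning two positions), then "fi".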
/** stem a collection of words */
public ArrayList<String> stem(Collection<String> list) {
  if (!hasStemmer()) return new ArrayList<String>();
  ArrayList<String> ret = new ArrayList<String>();
  TokenStream ts = makeStemmer(new StringsTokenStream(list));
  try {
    Token t;
    while ((t = ts.next()) != null) ret.add(t.termText());
    return ret;
  } catch (IOException e) {
    e.printStackTrace();
    return new ArrayList<String>();
  }
}
/** For debugging. */
public static void main(String[] args) throws Exception {
  BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
  while (true) {
    System.out.print("Text: ");
    String line = in.readLine();
    if (line == null) break; // stop at end of input instead of passing null to the tokenizer
    Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
    Token token;
    System.out.print("Tokens: ");
    while ((token = tokenizer.next()) != null) {
      System.out.print(token.termText());
      System.out.print(" ");
    }
    System.out.println();
  }
}
/** check if two words have same stemmed variants */
public boolean stemsToSame(String word1, String word2) {
  if (!hasStemmer()) return false;
  ArrayList<String> in = new ArrayList<String>();
  in.add(word1);
  in.add(word2);
  TokenStream ts = makeStemmer(new StringsTokenStream(in));
  try {
    Token t1 = ts.next();
    Token t2 = ts.next();
    if (t1 != null && t2 != null && t1.termText().equals(t2.termText())) return true;
  } catch (IOException e) {
    e.printStackTrace();
  }
  return false;
}
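// Example (assuming an English stemmer is configured, which this file does not show):
// stemsToSame("running", "runs") returns true because both stem to "run", while
// stemsToSame("running", "ran") returns false since "ran" is left unchanged by a
// suffix stemmer.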
public void testFqnShortName3() throws Exception {
  Map<String, String> mapFqn = new HashMap<String, String>();
  mapFqn.put("extractSig", "0");
  mapFqn.put("shortNamesOnly", "1");
  fqnFactory.init(mapFqn);
  // note .. there is no whitespace here
  is = new WhitespaceTokenizer(new StringReader("connected"));
  FqnFilter tokenizer = fqnFactory.create(is);
  final Token reusableToken = new Token();
  Token nextToken = tokenizer.next(reusableToken);
  assertEquals("connected", nextToken.term());
  assertNull(tokenizer.next(reusableToken));
}
public Token next() throws IOException {
  Token candidate;
  while ((candidate = baseTokeniser.next()) != null) {
    try {
      Integer integer = Integer.valueOf(candidate.termText());
      String valueString = NumericEncoder.encode(integer.intValue());
      // preserve the candidate's full offset range (the original passed startOffset twice)
      Token integerToken =
          new Token(valueString, candidate.startOffset(), candidate.endOffset(), candidate.type());
      return integerToken;
    } catch (NumberFormatException e) {
      // just ignore and try the next one
    }
  }
  return null;
}
@Test
public void testCountPositions() throws IOException {
  // We're looking to make sure that we:
  Token t1 = new Token(); // Don't count tokens without an increment
  t1.setPositionIncrement(0);
  Token t2 = new Token();
  t2.setPositionIncrement(1); // Count normal tokens with one increment
  Token t3 = new Token();
  t3.setPositionIncrement(2); // Count funny tokens with more than one increment
  int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
  Token[] tokens = new Token[] {t1, t2, t3};
  Collections.shuffle(Arrays.asList(tokens), getRandom());
  TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
  // Expected positions: increments 0 + 1 + 2 plus the final increment of 4 = 7.
  assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
}
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
  Token token =
      new Token(
          oriToken.buffer(),
          termBufferOffset,
          termBufferLength,
          oriToken.startOffset() + termBufferOffset,
          oriToken.startOffset() + termBufferOffset + termBufferLength);
  if (type == Character.DECIMAL_DIGIT_NUMBER) {
    token.setType(Word.TYPE_DIGIT);
  } else {
    token.setType(Word.TYPE_LETTER);
  }
  tokenQueue.offer(token);
}
/**
 * Does this match?
 *
 * @param token The current token
 * @param no The token number
 * @param stack The fork stack... may be used if there are partial matches
 * @param lookBackStack The reverse stack (ignored)
 */
public TypeExpr matches(Token token, int no, Stack<MatchFork> stack, List<Token> lookBackStack) {
  if (matches == null) {
    if (set) {
      matches = new TreeSet<WordListEntry>();
      WordListSet wls = WordListSet.getWordListSetByName(wordListName);
      if (wls == null) {
        throw new IllegalArgumentException("Cannot find word list set %" + wordListName);
      }
      for (Map.Entry<String, WordList> entry : wls.getWordListSets()) {
        matches.addAll(WordListSet.getMatchSet(entry.getKey(), token.termText().toLowerCase()));
      }
      // currentMatch = wls.getEntry(token.termText().toLowerCase());
      currentMatch = new WordListEntry(new LinkedList<String>());
      currentMatch.addWord(token.term().toLowerCase());
    } else {
      matches =
          new TreeSet<WordListEntry>(
              WordListSet.getMatchSet(wordListName, token.termText().toLowerCase()));
      // currentMatch = WordListSet.getWordListSetByList(wordListName).getEntry(token.termText().toLowerCase());
      currentMatch = new WordListEntry(new LinkedList<String>());
      currentMatch.addWord(token.term().toLowerCase());
    }
  } else {
    currentMatch.addWord(token.termText().toLowerCase());
  }
  MatchFork mf = MatchFork.find(stack, no, this);
  if (mf != null && (mf.used || stack.peek() == mf)) {
    stack.peek().split(no, this);
    return this;
  }
  Iterator<WordListEntry> wleIter = matches.iterator();
  while (wleIter.hasNext()) {
    WordListEntry wle = wleIter.next();
    if (wle.equals(currentMatch)) {
      if (matches.size() > 1 && (stack.empty() || stack.peek().tokenNo < no))
        stack.push(new MatchFork(no, this));
      return next;
    }
    if (!wle.matchable(currentMatch)) wleIter.remove();
  }
  if (matches.isEmpty()) return null;
  else return this;
}
public void testFqnShortName() throws Exception {
  Map<String, String> mapFqn = new HashMap<String, String>();
  mapFqn.put("extractSig", "0");
  mapFqn.put("shortNamesOnly", "1");
  fqnFactory.init(mapFqn);
  // note .. there is no whitespace here
  is = new WhitespaceTokenizer(
      new StringReader(
          "org.wonderly.ham.speech.DB23Announcements.connected(_UNRESOLVED_.Parameters<?java.util.List<java.lang.String>>,java.lang.String)"));
  FqnFilter tokenizer = fqnFactory.create(is);
  final Token reusableToken = new Token();
  Token nextToken = tokenizer.next(reusableToken);
  assertEquals("connected", nextToken.term());
  assertNull(tokenizer.next(reusableToken));
}
private void splitIntoTokens() {
  String term = termAtt.term();
  String[] termParts = splitTerm(term);
  if (termParts.length > 1) {
    int termPos = offsetAtt.startOffset();
    for (int i = 0; i < termParts.length; i++) {
      String termPart = termParts[i];
      int termPartPos = termPos + term.indexOf(termPart);
      int termPartEndPos = termPartPos + termPart.length();
      Token newToken = new Token(termPart, termPartPos, termPartEndPos);
      newToken.setPositionIncrement(0); // in the same position
      tokens.add(newToken);
    }
  }
}
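// Illustrative example (splitTerm's rules are defined elsewhere and are an assumption
// here): if splitTerm("wi-fi") returned {"wi", "fi"}, both parts would be queued at the
// original token's position (position increment 0), with offsets computed by locating
// each part inside the original term relative to its start offset.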
public String tokens(String field) {
  try {
    Field f = doc.getField(field);
    if (f == null) fail("No such field " + field);
    if (!f.isTokenized()) {
      String val = value(field);
      Token t = new Token(val, 0, val.length());
      return t.getPositionIncrement() + " [" + t.termText() + "]";
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts == null && f.stringValue() != null) ts = analyzer.tokenStream(field, f.stringValue());
    if (ts == null && f.readerValue() != null) ts = analyzer.tokenStream(field, f.readerValue());
    if (ts == null) fail("No token stream for field " + field);
    Token t = null;
    StringBuilder sb = new StringBuilder();
    while ((t = ts.next()) != null) {
      sb.append(t.getPositionIncrement() + " [" + t.termText() + "] ");
    }
    return sb.toString().trim();
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
    return null;
  }
}
public void testMethodSname() throws Exception {
  Map<String, String> mapFqn = new HashMap<String, String>();
  mapFqn.put("extractSig", "0");
  mapFqn.put("shortNamesOnly", "1");
  fqnFactory.init(mapFqn);
  // note .. there is no whitespace here
  is = new WhitespaceTokenizer(
      new StringReader("org.wonderly.ham.speech.DB23Announcements.connected()"));
  FqnFilter ff = fqnFactory.create(is);
  // RemoveSigOrderFilterFactory rsoff = new RemoveSigOrderFilterFactory();
  // DelimiterFilter rsof = rsoff.create(fqnf);
  // FragmentFilterFactory fff = new FragmentFilterFactory();
  // FragmentFilter ff = fff.create(rsof);
  final Token reusableToken = new Token();
  Token nextToken = ff.next(reusableToken);
  assertEquals("connected", nextToken.term());
  assertNull(ff.next(reusableToken));
}
/**
 * @param input the source TokenStream
 * @param reusableToken the token to reuse; if null, a new Token is created automatically
 * @return the next token, or null if the stream is exhausted or input is null
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
  if (input == null) {
    return null;
  }
  if (!input.incrementToken()) {
    return null;
  }
  CharTermAttribute termAtt = (CharTermAttribute) input.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) input.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = (TypeAttribute) input.getAttribute(TypeAttribute.class);
  if (reusableToken == null) {
    reusableToken = new Token();
  }
  reusableToken.clear();
  if (termAtt != null) {
    // lucene 3.0: reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
    // lucene 3.1+:
    reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
  }
  if (offsetAtt != null) {
    // lucene 3.1: reusableToken.setStartOffset(...); reusableToken.setEndOffset(...);
    // lucene 4.0:
    reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
  }
  if (typeAtt != null) {
    reusableToken.setType(typeAtt.type());
  }
  return reusableToken;
}
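// Hedged usage sketch (not from the original source): a typical consumption loop for
// TokenUtils.nextToken above. The method name printTokens and the surrounding setup
// are illustrative assumptions; reset()/end()/close() are the standard TokenStream
// lifecycle calls.
public static void printTokens(TokenStream input) throws IOException {
  input.reset(); // a TokenStream must be reset before the first incrementToken() call
  Token reusableToken = new Token();
  Token token;
  while ((token = TokenUtils.nextToken(input, reusableToken)) != null) {
    System.out.println(token.toString() + " [" + token.startOffset() + "," + token.endOffset() + "]");
  }
  input.end();
  input.close();
}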
/**
 * For languages with canonical form
 *
 * @return canonical token (or null if none)
 */
public Token canonizeToken(Token t) {
  if (!hasCanonicalFilter) return null;
  if (lang.equals("sr")) {
    String nt = new SerbianFilter(null).convert(t.termText());
    // compare the term text to the converted string (the original compared Token to String,
    // which was always unequal)
    if (!t.termText().equals(nt)) {
      Token tt = new Token(nt, t.startOffset(), t.endOffset());
      tt.setPositionIncrement(0);
      tt.setType("alias");
      return tt;
    }
  }
  return null;
}
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length) return false;
  else {
    clearAttributes();
    Token token = tokens[index++];
    termAtt.setEmpty().append(token);
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }
}
public boolean incrementToken() throws IOException {
  clearAttributes();
  Token token = nextToken(reusableToken);
  if (token != null) {
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    return true;
  } else {
    end();
    return false;
  }
}
@Override
public boolean incrementToken() {
  if (upto < tokens.length) {
    final Token token = tokens[upto++];
    // TODO: can we just capture/restoreState so we get all attrs...?
    clearAttributes();
    termAtt.setEmpty();
    termAtt.append(token.toString());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    posLengthAtt.setPositionLength(token.getPositionLength());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    payloadAtt.setPayload(token.getPayload());
    return true;
  } else {
    return false;
  }
}
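// Hedged usage sketch (assumes this incrementToken() belongs to a canned stream that
// replays a Token[] passed at construction, as Lucene's CannedTokenStream does; the
// helper name sampleGraphStream is hypothetical). Built with the token(...) helper
// from the top of this file, it carries a small two-position token graph:
private static TokenStream sampleGraphStream() {
  return new CannedTokenStream(
      token("wi", 1, 1, 0, 2),   // first position
      token("wifi", 0, 2, 0, 4), // same start position, spans two positions
      token("fi", 1, 1, 2, 4));  // second position
}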
public final Token next() throws IOException {
  Token token = this.input.next();
  if (token == null) return null;
  String text = token.termText();
  String type = token.type();
  if ((type == APOSTROPHE_TYPE) && (text.endsWith("'s") || text.endsWith("'S")))
    return new Token(
        text.substring(0, text.length() - 2), token.startOffset(), token.endOffset(), type);
  if (type == ACRONYM_TYPE) {
    StringBuffer trimmed = new StringBuffer();
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      if (c != '.') trimmed.append(c);
    }
    return new Token(trimmed.toString(), token.startOffset(), token.endOffset(), type);
  }
  return token;
}
@Override
public boolean incrementToken() throws IOException {
  if (tokens == null) {
    fillTokens();
  }
  // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
  if (upto == tokens.size()) {
    // System.out.println("  END @ " + tokens.size());
    return false;
  }
  final Token t = tokens.get(upto++);
  // System.out.println("  return token=" + t);
  clearAttributes();
  termAtt.append(t.toString());
  offsetAtt.setOffset(t.startOffset(), t.endOffset());
  posIncrAtt.setPositionIncrement(t.getPositionIncrement());
  posLengthAtt.setPositionLength(t.getPositionLength());
  return true;
}
/**
 * Copy the prototype token's fields into this one. Note: Payloads are shared.
 *
 * @param prototype source Token to copy fields from
 */
public void reinit(Token prototype) {
  // this is a bad hack to emulate no cloning of payload!
  prototype.copyToWithoutPayloadClone(this);
}
public Token next(Token reusableToken) throws IOException {
  Token token = reusableToken;
  if (inPhrase) {
    inPhrase = false;
    token.setTermBuffer("phrase2");
    token.setStartOffset(savedStart);
    token.setEndOffset(savedEnd);
    return reusableToken;
  } else {
    while ((token = this.input.next(reusableToken)) != null) {
      if (token.term().equals("phrase")) {
        inPhrase = true;
        savedStart = token.startOffset();
        savedEnd = token.endOffset();
        token.setTermBuffer("phrase1");
        token.setStartOffset(savedStart);
        token.setEndOffset(savedEnd);
        return token;
      } else if (!token.term().equals("stop")) return token;
    }
  }
  return null;
}
// Returns the HTML rendering of the morphological-analysis result as a String.
public String Anaylze(String order) {
  String html = "";
  try {
    if (!"".equals(order)) {
      log.info(order);
      long start = System.currentTimeMillis();
      html += "<p></p><div style='font-size:12pt;color:green'> 형태소분석 : " + order + "</div><p></p>";
      // Before running morphological analysis, find the nouns stored in the DB.
      NameFinder nf = new NameFinder();
      nf.CreateMap();
      order = nf.Find(order);
      // Create the morphological analyzer.
      MorphAnalyzer analyzer = new MorphAnalyzer();
      KoreanTokenizer tokenizer = new KoreanTokenizer(new StringReader(order));
      Token token = null;
      // Create the sentence object.
      Sentence sentence = new Sentence(order);
      sentence.SetKeyword(keyword);
      // Given the input "난 학생이다." ("I am a student"), the tokenizer
      // splits it into the tokens "난" and "학생이다".
      while ((token = tokenizer.next()) != null) {
        html += "<div class='outer'>";
        // 1. Hangul, English letters, and digits are tokenized separately.
        //    English/digit runs error out from 7 characters, Hangul from 8;
        //    special characters are ignored.
        try {
          // termText() returns the text of the current token.
          String tokenText = token.termText();
          // analyzer.setExactCompound(false);
          // Analysis results for this token.
          List<AnalysisOutput> results = analyzer.analyze(tokenText);
          // Append the word (eojeol) to the sentence.
          sentence.addWord(tokenText, results.get(0).toString(), numKeyword);
          numKeyword = sentence.GetKeyword();
          // A commented-out debug block here rendered every field of each AnalysisOutput
          // (stem, eomi/ending, josa/particle, pattern code, score, compound-noun list,
          // nominalizing-suffix list, particle list, etc.) as HTML. The pattern codes it
          // documented, from PatternConstant:
          //   PTN_N (1): substantive (noun, pronoun, numeral)
          //   PTN_NJ (2): substantive + particle
          //   PTN_NSM (3): substantive + verbalizing suffix + ending
          //   PTN_NSMJ (4): substantive + verbalizing suffix + nominalizing suffix ('음/기') + particle
          //   PTN_NSMXM (5): substantive + verbalizing suffix + '아/어' + auxiliary verb + ending
          //   PTN_NJCM (6): substantive + '에서/부터/에서부터' + '이' + ending
          //   PTN_VM (11): predicate (verb, adjective) + ending
          //   PTN_VMJ (12): predicate + nominalizing suffix ('음/기') + particle
          //   PTN_VMCM (13): predicate + nominalizing suffix ('음/기') + '이' + ending
          //   PTN_VMXM (14): predicate + '아/어' + auxiliary verb + ending
          //   PTN_VMXMJ (15): predicate + '아/어' + auxiliary verb + nominalizing suffix ('음/기') + particle
          //   PTN_AID (21): standalone word (adverb, determiner, interjection)
          //   PTN_ADVJ (22): adverb + particle
        } catch (Exception e) {
          System.out.println(e.getMessage());
          e.printStackTrace();
        }
        html += "</div>";
      }
      // Report elapsed time.
      html += "<p><div>" + (System.currentTimeMillis() - start) + "ms</div></p>";
      // Output: the lemma/POS string for each word.
      String printOut = "";
      for (int i = 0; i < sentence.getWord().size(); i++) {
        html += "<div class='inner' style='font-size:10pt; color:red'>";
        for (int j = 0; j < sentence.getWord(i).getLemma().size(); j++) {
          printOut += sentence.getWord(i).getLemma(j).getText() + "/" + sentence.getWord(i).getLemma(j).getPos();
          if (j != sentence.getWord(i).getLemma().size() - 1) printOut += "+";
        }
        if (i != sentence.getWord().size() - 1) printOut += "_";
      }
      html += printOut;
      // Semantic-analysis result (정호) -> returned as the query.
      ResultAnalyzer ra = new ResultAnalyzer();
      query = ra.Analyze(sentence);
      html += "</div>";
    }
  } catch (Exception e) {
    System.out.println(e.getMessage());
    e.printStackTrace();
  }
  return html;
}
/**
 * Returns the next token in the stream, or null at EOS.
 *
 * <p>Removes <tt>'s</tt> from the end of words.
 *
 * <p>Removes dots from acronyms.
 *
 * <p>Splits host names ...
 */
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
  if (hostTokens == null) {
    org.apache.lucene.analysis.Token t = input.next();
    if (t == null) return null;
    String text = t.termText();
    String type = t.type();
    if (type == APOSTROPHE_TYPE // remove 's
        && (text.endsWith("'s") || text.endsWith("'S"))) {
      return new org.apache.lucene.analysis.Token(
          text.substring(0, text.length() - 2), t.startOffset(), t.endOffset(), type);
    } else if (type == ACRONYM_TYPE) { // remove dots
      StringBuffer trimmed = new StringBuffer();
      for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c != '.') trimmed.append(c);
      }
      return new org.apache.lucene.analysis.Token(
          trimmed.toString(), t.startOffset(), t.endOffset(), type);
    } else if (type == HOST_TYPE) {
      // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
      // There must be at least two tokens ....
      hostTokens = new LinkedList<org.apache.lucene.analysis.Token>();
      StringTokenizer tokeniser = new StringTokenizer(text, ".");
      int start = t.startOffset();
      int end;
      while (tokeniser.hasMoreTokens()) {
        String token = tokeniser.nextToken();
        end = start + token.length();
        hostTokens.offer(new org.apache.lucene.analysis.Token(token, start, end, ALPHANUM_TYPE));
        start = end + 1;
      }
      // check if we have an acronym ..... yes a.b.c ends up here ...
      if (text.length() == hostTokens.size() * 2 - 1) {
        hostTokens = null;
        // acronym
        StringBuffer trimmed = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
          char c = text.charAt(i);
          if (c != '.') trimmed.append(c);
        }
        return new org.apache.lucene.analysis.Token(
            trimmed.toString(), t.startOffset(), t.endOffset(), ALPHANUM_TYPE);
      } else {
        return hostTokens.remove();
      }
    } else {
      return t;
    }
  } else {
    org.apache.lucene.analysis.Token token = hostTokens.remove();
    if (hostTokens.isEmpty()) {
      hostTokens = null;
    }
    return token;
  }
}
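// Worked example (following the branches above): for an input token "www.example.com"
// of type HOST, the stream emits "www", "example", "com" as ALPHANUM tokens on
// successive next() calls. For "a.b.c", every label is a single character, so
// text.length() == hostTokens.size() * 2 - 1 holds (5 == 3 * 2 - 1) and the acronym
// branch returns the single token "abc" instead.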
private void applyToken(Token token) {
  termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
  posAtt.setPositionIncrement(token.getPositionIncrement());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
}