static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
    throws IOException {
  List<String> terms = new ArrayList<String>();
  // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
  if (localText == null) {
    throw new SearchException(
        "Search parameter on field " + fieldName + " could not be converted. "
            + "Are the parameter and the field of the same type? "
            + "Alternatively, apply the ignoreFieldBridge() option to pass String parameters.");
  }
  Reader reader = new StringReader(localText);
  TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
  TermAttribute attribute = stream.addAttribute(TermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    if (attribute.termLength() > 0) {
      String term = attribute.term();
      terms.add(term);
    }
  }
  stream.end();
  stream.close();
  return terms;
}
/**
 * Displays the tokens produced by the analyzer, printing only the token text.
 *
 * @param analyzer
 * @param text
 * @throws IOException
 */
public static void displaySimpleTokens(Analyzer analyzer, String text) throws IOException {
  TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));
  TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
  while (tokenStream.incrementToken()) {
    System.out.print(termAttribute.term() + ",");
  }
  System.out.println();
}
private String analyzeQuery(String query) throws IOException {
  StringBuilder result = new StringBuilder();
  ASCIIFoldingFilter filter =
      new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
  TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
  while (filter.incrementToken()) {
    result.append(termAttribute.term()).append("* ");
  }
  return result.toString();
}
@Override
public boolean incrementToken() throws IOException {
  while (input.incrementToken()) {
    char[] text = termAtt.termBuffer();
    int termLength = termAtt.termLength();
    if (!stopTable.contains(text, 0, termLength)) {
      return true;
    }
  }
  return false;
}
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {                          // Advance to next token
    return false;                                         // When false, end has been reached
  }
  String encoded = metaphoner.encode(termAttr.term());    // Convert term text to Metaphone encoding
  termAttr.setTermBuffer(encoded);                        // Overwrite term text with encoded text
  typeAttr.setType(METAPHONE);                            // Set token type
  return true;
}
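The filter body above overwrites each term with its Metaphone encoding, so phonetically similar words index to the same token. A minimal sketch of wiring such a filter into an analyzer follows; the class names MetaphoneReplacementFilter and MetaphoneReplacementAnalyzer are assumptions (they do not appear in the snippet), and the sketch assumes Lucene 3.x LetterTokenizer plus org.apache.commons.codec.language.Metaphone for the encoder.

// Hypothetical wiring; the enclosing filter class name is assumed, not shown in the snippet above.
public class MetaphoneReplacementAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // Tokenize on letters, then replace every term with its Metaphone encoding
    return new MetaphoneReplacementFilter(new LetterTokenizer(reader));
  }
}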
public final boolean incrementToken() throws IOException {
  int increment = 0;
  while (input.incrementToken()) {
    if (!stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength())) {
      posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + increment);
      return true;
    }
    increment += posIncrAttr.getPositionIncrement();
  }
  return false;
}
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
  System.out.println("Hello");
  KoreanAnalyzer ka = new KoreanAnalyzer();
  TokenStream ts = ka.tokenStream("", new java.io.StringReader("과학기술이 정말 I an Hello"));
  System.out.println(ts.toString());
  try {
    while (ts.incrementToken()) {
      org.apache.lucene.analysis.tokenattributes.TermAttribute ta =
          ts.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
      System.out.println("adf" + ta.term());
    }
  } catch (Exception e) {
    System.out.println(e.toString());
  }
}
private void dumpSpans(SpanQuery query) throws IOException {
  Spans spans = query.getSpans(reader);
  System.out.println(query + ":");
  int numSpans = 0;

  TopDocs hits = searcher.search(query, 10);
  float[] scores = new float[2];
  for (ScoreDoc sd : hits.scoreDocs) {
    scores[sd.doc] = sd.score;
  }

  while (spans.next()) {                                 // A
    numSpans++;

    int id = spans.doc();
    Document doc = reader.document(id);                  // B

    TokenStream stream = analyzer.tokenStream("contents",         // C
        new StringReader(doc.get("f")));                 // C
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    StringBuilder buffer = new StringBuilder();
    buffer.append(" ");
    int i = 0;
    while (stream.incrementToken()) {                    // D
      if (i == spans.start()) {                          // E
        buffer.append("<");                              // E
      }                                                  // E
      buffer.append(term.term());                        // E
      if (i + 1 == spans.end()) {                        // E
        buffer.append(">");                              // E
      }                                                  // E
      buffer.append(" ");
      i++;
    }
    buffer.append("(").append(scores[id]).append(") ");
    System.out.println(buffer);
  }

  if (numSpans == 0) {
    System.out.println(" No spans");
  }
  System.out.println();
}
public boolean incrementToken() throws IOException {
  if (inPhrase) {
    inPhrase = false;
    clearAttributes();
    termAtt.setTermBuffer("phrase2");
    offsetAtt.setOffset(savedStart, savedEnd);
    return true;
  } else {
    while (input.incrementToken()) {
      if (termAtt.term().equals("phrase")) {
        inPhrase = true;
        savedStart = offsetAtt.startOffset();
        savedEnd = offsetAtt.endOffset();
        termAtt.setTermBuffer("phrase1");
        offsetAtt.setOffset(savedStart, savedEnd);
        return true;
      } else if (!termAtt.term().equals("stop")) {
        return true;
      }
    }
  }
  return false;
}
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) return false;

  char[] termBuffer = termAtt.termBuffer();
  int termBufferLength = termAtt.termLength();
  char[] backup = null;
  if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
    // make a backup in case we exceed the word count
    backup = new char[termBufferLength];
    System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
  }

  if (termBufferLength < factory.maxTokenLength) {
    int wordCount = 0;
    int lastWordStart = 0;
    for (int i = 0; i < termBufferLength; i++) {
      char c = termBuffer[i];
      if (c <= ' ' || c == '.') {
        int len = i - lastWordStart;
        if (len > 0) {
          factory.processWord(termBuffer, lastWordStart, len, wordCount++);
          lastWordStart = i + 1;
          i++;
        }
      }
    }

    // process the last word
    if (lastWordStart < termBufferLength) {
      factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
    }

    if (wordCount > factory.maxWordCount) {
      termAtt.setTermBuffer(backup, 0, termBufferLength);
    }
  }
  return true;
}
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  int posIncr = 1;

  while (true) {
    int tokenType = scanner.getNextToken();

    if (tokenType == StandardTokenizerImpl.YYEOF) {
      return false;
    }

    if (scanner.yylength() <= maxTokenLength) {
      posIncrAtt.setPositionIncrement(posIncr);
      scanner.getText(termAtt);
      final int start = scanner.yychar();
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
      // This 'if' should be removed in the next release. For now, it converts
      // invalid acronyms to HOST. When removed, only the 'else' part should remain.
      if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
        if (replaceInvalidAcronym) {
          typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
          termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
        } else {
          typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
        }
      } else {
        typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
      }
      return true;
    } else {
      // When we skip a too-long term, we still increment the position increment
      posIncr++;
    }
  }
}
/**
 * Displays the details of every token produced by the analyzer.
 *
 * @param analyzer
 * @param text
 * @throws IOException
 */
public static void displayTokens(Analyzer analyzer, String text) throws IOException {
  // Token stream
  TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));
  // Term attribute holding the token text
  TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
  // Position increment, relevant for phrase queries and synonym handling
  PositionIncrementAttribute positionIncrementAttribute =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  // Offsets, useful for highlighting query matches
  OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
  // Token type, usually "word", but also "email" and others
  TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

  int position = 0;
  while (tokenStream.incrementToken()) {
    // Compute the token position
    int increment = positionIncrementAttribute.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
    }
    // Print the details of every token
    System.out.println("position : " + position + " [" + termAttribute.term() + ":"
        + offsetAttribute.startOffset() + "->" + offsetAttribute.endOffset() + ":"
        + typeAttribute.type() + "]");
  }
}
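A minimal driver for the two display helpers above; the analyzer choice (a Lucene 3.x StandardAnalyzer) and the sample text are assumptions for illustration only.

// Hypothetical usage, assuming org.apache.lucene.analysis.standard.StandardAnalyzer and Version from Lucene 3.x.
public static void main(String[] args) throws IOException {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
  // Prints only the token text, e.g. "quick,brown,fox," (stop word removed, lowercased)
  displaySimpleTokens(analyzer, "The quick brown fox");
  // Prints position, term, offsets, and type for each token
  displayTokens(analyzer, "The quick brown fox");
}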
private void splitIntoTokens() {
  String term = termAtt.term();
  String[] termParts = splitTerm(term);
  if (termParts.length > 1) {
    int termPos = offsetAtt.startOffset();
    for (int i = 0; i < termParts.length; i++) {
      String termPart = termParts[i];
      int termPartPos = termPos + term.indexOf(termPart);
      int termPartEndPos = termPartPos + termPart.length();
      Token newToken = new Token(termPart, termPartPos, termPartEndPos);
      newToken.setPositionIncrement(0); // in the same position
      tokens.add(newToken);
    }
  }
}
/**
 * Parses the file to extract all the words for indexing and some data characterizing the file.
 *
 * @param file contains the full path of the document to parse
 * @param indexerLanguage this will be used to tell the program which stemmer is to be used.
 * @param stem if true then generate js files with words stemmed
 * @return a DitaFileInfo object filled with data describing the file
 */
public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {
  // initialization
  fileDesc = new DocFileInfo(file);
  strbf = new StringBuffer("");

  // Fill strbf by parsing the file
  parseDocument(file);

  String str = cleanBuffer(strbf);
  str = str.replaceAll("\\s+", " "); // there are still redundant spaces in the middle
  // System.out.println(file.toString()+" "+ str +"\n");
  // START OXYGEN PATCH
  // String[] items = str.split("\\s"); //contains all the words in the array
  // END OXYGEN PATCH

  // Get items one by one, tunnel them through the stemmer, and get the stem.
  // Then add them to tempSet.
  // Do stemming for the words in items.
  // TODO currently, stemming support is for English and German only. Add support for other
  // languages as well.

  // START OXYGEN PATCH, create the words and scoring list
  wsList = new ArrayList<WordAndScoring>();
  // String[] tokenizedItems;
  // END OXYGEN PATCH
  if (indexerLanguage.equalsIgnoreCase("ja")
      || indexerLanguage.equalsIgnoreCase("zh")
      || indexerLanguage.equalsIgnoreCase("ko")) {
    LinkedList<String> tokens = new LinkedList<String>();
    try {
      // EXM-21501 Oxygen patch, replace the extra "@@@"s.
      str = str.replaceAll("@@@([^\\s]*)@@@", "");
      CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);
      Reader reader = new StringReader(str);
      TokenStream stream = analyzer.tokenStream("", reader);
      TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
      OffsetAttribute offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);

      while (stream.incrementToken()) {
        String term = termAtt.term();
        tokens.add(term);
        WordAndScoring ws = new WordAndScoring(term, term, 1);
        boolean found = false;
        for (WordAndScoring aWsList : wsList) {
          // If the stem of the current word is already in the list,
          // do not add the word to the list, just recompute the scoring
          if (aWsList.getStem().equals(ws.getStem())) {
            found = true;
            int scoring = aWsList.getScoring();
            aWsList.setScoring(scoring + ws.getScoring());
            break;
          }
        }
        if (!found) {
          wsList.add(ws);
        }
      }
      // START OXYGEN PATCH
      // tokenizedItems = tokens.toArray(new String[tokens.size()]);
      // END OXYGEN PATCH
    } catch (IOException ex) {
      // START OXYGEN PATCH
      // tokenizedItems = items;
      // END OXYGEN PATCH
      System.out.println("Error tokenizing content using CJK Analyzer. IOException");
      ex.printStackTrace();
    }
  } else {
    SnowballStemmer stemmer;
    if (indexerLanguage.equalsIgnoreCase("en")) {
      stemmer = new EnglishStemmer();
    } else if (indexerLanguage.equalsIgnoreCase("de")) {
      stemmer = new GermanStemmer();
    } else if (indexerLanguage.equalsIgnoreCase("fr")) {
      stemmer = new FrenchStemmer();
    } else {
      stemmer = null; // Language for which stemming is not yet supported, so no stemmer will be used.
    }
    // START OXYGEN PATCH
    wsList = new ArrayList<WordAndScoring>();
    StringTokenizer st = new StringTokenizer(str, " ");
    // Tokenize the string and populate the words and scoring list
    while (st.hasMoreTokens()) {
      String token = st.nextToken();
      WordAndScoring ws = getWordAndScoring(token, stemmer, stem);
      if (ws != null) {
        boolean found = false;
        for (WordAndScoring aWsList : wsList) {
          // If the stem of the current word is already in the list,
          // do not add the word to the list, just recompute the scoring
          if (aWsList.getStem().equals(ws.getStem())) {
            found = true;
            int scoring = aWsList.getScoring();
            aWsList.setScoring(scoring + ws.getScoring());
            break;
          }
        }
        if (!found) {
          wsList.add(ws);
        }
      }
    }
    // if (stemmer != null) // If a stemmer is available
    //   tokenizedItems = stemmer.doStem(items.toArray(new String[0]));
    // else // if no stemmer is available for the particular language
    //   tokenizedItems = items.toArray(new String[0]);
    // END OXYGEN PATCH
  }

  /* for (String stemmedItem : tokenizedItems) {
       System.out.print(stemmedItem + "| ");
     } */

  // START OXYGEN PATCH
  // //items: remove the duplicated strings first
  // HashSet<String> tempSet = new HashSet<String>();
  // tempSet.addAll(Arrays.asList(tokenizedItems));
  // Iterator it = tempSet.iterator();
  // Iterate over the words and scoring list
  Iterator<WordAndScoring> it = wsList.iterator();
  WordAndScoring s;
  while (it.hasNext()) {
    s = it.next();
    // Do not add results from 'toc.html'
    if (s != null && tempDico.containsKey(s.getStem())) {
      String temp = tempDico.get(s.getStem());
      temp = temp.concat(",")
          .concat(Integer.toString(i))
          // Concat also the scoring for the stem
          .concat("*")
          .concat(Integer.toString(s.getScoring()));
      // System.out.println("temp="+s+"="+temp);
      tempDico.put(s.getStem(), temp);
    } else if (s != null) {
      String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));
      tempDico.put(s.getStem(), temp);
    }
    // END OXYGEN PATCH
  }
  i++;
  return fileDesc;
}
private void applyToken(Token token) {
  termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
  posAtt.setPositionIncrement(token.getPositionIncrement());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
}
public static void assertTokenStreamContents(
    TokenStream ts,
    String[] output,
    int[] startOffsets,
    int[] endOffsets,
    String[] types,
    int[] posIncrements,
    Integer finalOffset)
    throws IOException {
  assertNotNull(output);
  CheckClearAttributesAttribute checkClearAtt =
      (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class);

  assertTrue("has no TermAttribute", ts.hasAttribute(TermAttribute.class));
  TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

  OffsetAttribute offsetAtt = null;
  if (startOffsets != null || endOffsets != null || finalOffset != null) {
    assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
    offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
  }

  TypeAttribute typeAtt = null;
  if (types != null) {
    assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
    typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
  }

  PositionIncrementAttribute posIncrAtt = null;
  if (posIncrements != null) {
    assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
    posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
  }

  ts.reset();
  for (int i = 0; i < output.length; i++) {
    // extra safety to enforce that the state is not preserved and also assign bogus values
    ts.clearAttributes();
    termAtt.setTermBuffer("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
    assertTrue("token " + i + " does not exist", ts.incrementToken());
    assertTrue("clearAttributes() was not called correctly in TokenStream chain",
        checkClearAtt.getAndResetClearCalled());

    assertEquals("term " + i, output[i], termAtt.term());
    if (startOffsets != null) assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
    if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
    if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
    if (posIncrements != null)
      assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
  }
  assertFalse("end of stream", ts.incrementToken());
  ts.end();
  if (finalOffset != null) assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
  ts.close();
}
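A sketch of how the assertion helper above might be called from a test; the WhitespaceAnalyzer, field name, and sample text are assumptions chosen so the expected arrays below are easy to verify by hand.

// Illustrative test, assuming a Lucene 3.x WhitespaceAnalyzer and a JUnit-style test class.
public void testWhitespaceTokens() throws IOException {
  TokenStream ts = new WhitespaceAnalyzer().tokenStream("field", new StringReader("foo bar"));
  assertTokenStreamContents(ts,
      new String[] {"foo", "bar"},     // expected terms
      new int[] {0, 4},                // start offsets
      new int[] {3, 7},                // end offsets
      new String[] {"word", "word"},   // token types
      new int[] {1, 1},                // position increments
      7);                              // final offset after end()
}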
@SuppressWarnings("unchecked")
private void sub_search(String searchQuery) throws IOException, ParseException {
  searcher = new Searcher(indexDir);
  int numSearched = 0; // total number of documents found
  ArrayList<String> doc_list = new ArrayList<String>();
  TopDocs hits;
  Document doc;
  long startTime = System.currentTimeMillis();
  // All search results and related information are stored as JSON in ForSend_json and written to out.txt.

  // Search for the main query
  JSONObject sub_query = new JSONObject();
  HashMap<String, Integer> map = new HashMap<String, Integer>();
  Analyzer analyzer = new KoreanAnalyzer();

  // hits = searcher.search(searchQuery);
  hits = searcher.content_search(searchQuery);
  numSearched += hits.totalHits;
  for (ScoreDoc scoreDoc : hits.scoreDocs) {
    doc = searcher.getDocument(scoreDoc);
    // Record the path of the matched document; additional information is looked up on the C# side.
    doc_list.add(doc.get(LuceneConstants.FILE_PATH));
    String filename = doc.get(LuceneConstants.FILE_PATH);
    int start = filename.lastIndexOf('\\');
    int fine = filename.lastIndexOf('.');
    filename = filename.substring(start + 1, fine);
    // System.out.println(filename);

    TokenStream stream = analyzer.tokenStream("map", new StringReader(filename));
    // OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
    @SuppressWarnings("deprecation")
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      // int startOffset = offsetAttribute.startOffset();
      // int endOffset = offsetAttribute.endOffset();
      String term = termAttribute.term();
      // System.out.println("term: " + term);
      if (map.containsKey(term)) {
        map.put(term, map.get(term) + 1);
      } else {
        map.put(term, 1);
      }
    }
  }
  // System.out.println(map);
  LinkedHashMap lmap = sortHashMapByValuesD(map);
  // System.out.println(lmap);
  // System.out.println(lmap.keySet().toArray()[lmap.size()-1]);
  ArrayList<String> term_list = new ArrayList<String>();
  for (int i = 0; i < lmap.size(); i++) {
    if (i == LuceneConstants.SUGGESTION_NUM) break;
    term_list.add(lmap.keySet().toArray()[lmap.size() - i - 1].toString());
  }
  System.out.println(term_list);

  sub_query.put("suggestion_keyword", term_list.toString());
  sub_query.put("result", doc_list.toString());
  sub_query.put("numSearched", hits.totalHits);
  ForSend_json.put("sub_query", sub_query);
  doc_list.clear();

  long endTime = System.currentTimeMillis();
  System.out.println(numSearched + " documents found. Time: " + (endTime - startTime));
}