Example #1
  static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
      throws IOException {
    List<String> terms = new ArrayList<String>();

    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the
    // type.
    if (localText == null) {
      throw new SearchException(
          "Search parameter on field "
              + fieldName
              + " could not be converted. "
              + "Are the parameter and the field of the same type?"
              + "Alternatively, apply the ignoreFieldBridge() option to "
              + "pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
      if (attribute.termLength() > 0) {
        String term = attribute.term();
        terms.add(term);
      }
    }
    stream.end();
    stream.close();
    return terms;
  }
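A minimal, hypothetical caller for the helper above (assuming a Lucene 3.0-era StandardAnalyzer, since reusableTokenStream and TermAttribute belong to that API generation) might look like this:

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
List<String> terms = getAllTermsFromText("title", "Lucene in Action", analyzer);
// StandardAnalyzer lowercases and drops English stop words, so terms is roughly [lucene, action]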
Example #2
 /**
  * Prints the tokens produced by the analyzer for the given text (token text only).
  *
  * @param analyzer the analyzer used to tokenize the text
  * @param text the text to analyze
  * @throws IOException if the token stream cannot be read
  */
 public static void displaySimpleTokens(Analyzer analyzer, String text) throws IOException {
   TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));
   TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
   tokenStream.reset(); // consumers are expected to reset the stream before iterating
   while (tokenStream.incrementToken()) {
     System.out.print(termAttribute.term() + ",");
   }
   tokenStream.end();
   tokenStream.close();
   System.out.println();
 }
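For reference, a hypothetical call to the helper above, assuming the no-argument SimpleAnalyzer from the same Lucene 3.x generation:

displaySimpleTokens(new SimpleAnalyzer(), "The Quick Brown Fox");
// SimpleAnalyzer splits on non-letters and lowercases, so this prints: the,quick,brown,fox,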
Example #3
 private String analyzeQuery(String query) throws IOException {
   StringBuilder result = new StringBuilder();
   ASCIIFoldingFilter filter =
       new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
   TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
   while (filter.incrementToken()) {
     result.append(termAttribute.term()).append("* ");
   }
   return result.toString();
 }
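The method above folds accented characters to their ASCII equivalents and turns every term into a prefix-query fragment. A hypothetical call from within the same class (assuming LUCENE_VERSION is a 3.x constant):

String expanded = analyzeQuery("Café Zürich");
// ASCIIFoldingFilter strips the diacritics; with no LowerCaseFilter in the chain,
// expanded is "Cafe* Zurich* "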
Example #4
  @Override
  public boolean incrementToken() throws IOException {

    while (input.incrementToken()) {
      char text[] = termAtt.termBuffer();
      int termLength = termAtt.termLength();
      if (!stopTable.contains(text, 0, termLength)) {
        return true;
      }
    }
    return false;
  }
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) { // Advance to next token
      return false;                // When false, end has been reached
    }

    // Convert the term text to its Metaphone encoding and overwrite the term with it
    String encoded = metaphoner.encode(termAttr.term());
    termAttr.setTermBuffer(encoded);
    typeAttr.setType(METAPHONE); // Set token type
    return true;
  }
  public final boolean incrementToken() throws IOException {
    int increment = 0;
    while (input.incrementToken()) {
      if (!stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength())) {
        posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + increment);
        return true;
      }

      increment += posIncrAttr.getPositionIncrement();
    }

    return false;
  }
  /** Simple command-line based search demo. */
  public static void main(String[] args) throws Exception {

	  System.out.println("Hello");
	  KoreanAnalyzer ka = new KoreanAnalyzer();
	  TokenStream ts = ka.tokenStream("", new java.io.StringReader("과학기술이 정말 I an Hello"));
	  System.out.println(ts.toString());
	  try{
	  while (ts.incrementToken()){
	  org.apache.lucene.analysis.tokenattributes.TermAttribute ta = ts.getAttribute( org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
	  System.out.println("adf"+ta.term());
	  }
	  }catch (Exception e){System.out.println(e.toString());}


	  }
  private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    System.out.println(query + ":");
    int numSpans = 0;

    TopDocs hits = searcher.search(query, 10);
    float[] scores = new float[2]; // assumes the example index holds just two documents (ids 0 and 1)
    for (ScoreDoc sd : hits.scoreDocs) {
      scores[sd.doc] = sd.score;
    }

    while (spans.next()) { // step through each matching span
      numSpans++;

      int id = spans.doc();
      Document doc = reader.document(id); // retrieve the matching document

      // re-analyze the stored text so that token positions line up with the span boundaries
      TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.get("f")));
      TermAttribute term = stream.addAttribute(TermAttribute.class);

      StringBuilder buffer = new StringBuilder();
      buffer.append("   ");
      int i = 0;
      while (stream.incrementToken()) { // walk the tokens, tracking the position in i
        if (i == spans.start()) { // mark the start of the matching span
          buffer.append("<");
        }
        buffer.append(term.term());
        if (i + 1 == spans.end()) { // mark the end of the matching span
          buffer.append(">");
        }
        buffer.append(" ");
        i++;
      }
      buffer.append("(").append(scores[id]).append(") ");
      System.out.println(buffer);
    }

    if (numSpans == 0) {
      System.out.println("   No spans");
    }
    System.out.println();
  }
 public boolean incrementToken() throws IOException {
   if (inPhrase) {
     inPhrase = false;
     clearAttributes();
     termAtt.setTermBuffer("phrase2");
     offsetAtt.setOffset(savedStart, savedEnd);
     return true;
   } else
     while (input.incrementToken()) {
       if (termAtt.term().equals("phrase")) {
         inPhrase = true;
         savedStart = offsetAtt.startOffset();
         savedEnd = offsetAtt.endOffset();
         termAtt.setTermBuffer("phrase1");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else if (!termAtt.term().equals("stop")) return true;
     }
   return false;
 }
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;

    char[] termBuffer = termAtt.termBuffer();
    int termBufferLength = termAtt.termLength();
    char[] backup = null;
    if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
      // make a backup in case we exceed the word count
      backup = new char[termBufferLength];
      System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
    }
    if (termBufferLength < factory.maxTokenLength) {
      int wordCount = 0;

      int lastWordStart = 0;
      for (int i = 0; i < termBufferLength; i++) {
        char c = termBuffer[i];
        if (c <= ' ' || c == '.') {
          int len = i - lastWordStart;
          if (len > 0) {
            factory.processWord(termBuffer, lastWordStart, len, wordCount++);
            lastWordStart = i + 1;
            i++;
          }
        }
      }

      // process the last word
      if (lastWordStart < termBufferLength) {
        factory.processWord(
            termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
      }

      if (wordCount > factory.maxWordCount) {
        termAtt.setTermBuffer(backup, 0, termBufferLength);
      }
    }

    return true;
  }
Example #11
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerImpl.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(posIncr);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
          if (replaceInvalidAcronym) {
            typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
            termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
          } else {
            typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
          }
        } else {
          typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        posIncr++;
    }
  }
Example #12
  /**
   * Prints the details of the tokens produced by the analyzer.
   *
   * @param analyzer the analyzer used to tokenize the text
   * @param text the text to analyze
   * @throws IOException if the token stream cannot be read
   */
  public static void displayTokens(Analyzer analyzer, String text) throws IOException {
    // token stream
    TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));

    // term attribute: holds the token text
    TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
    // position increment; matters for phrase queries and synonym handling
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    // offsets; useful for highlighting matched query results
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    // token type; normally "word", but there are other types such as "email"
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

    int position = 0;
    while (tokenStream.incrementToken()) {
      // compute the token position
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
      }
      // print the details of every token
      System.out.println(
          "position : "
              + position
              + " ["
              + termAttribute.term()
              + ":"
              + offsetAttribute.startOffset()
              + "->"
              + offsetAttribute.endOffset()
              + ":"
              + typeAttribute.type()
              + "]");
    }
  }
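A hypothetical invocation of displayTokens, assuming a Lucene 3.x StandardAnalyzer; each printed line shows the token position, term text, offsets, and type:

displayTokens(new StandardAnalyzer(Version.LUCENE_30), "Hello World");
// prints something like:
// position : 1 [hello:0->5:<ALPHANUM>]
// position : 2 [world:6->11:<ALPHANUM>]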
  private void splitIntoTokens() {
    String term = termAtt.term();
    String[] termParts = splitTerm(term);

    if (termParts.length > 1) {
      int termPos = offsetAtt.startOffset();

      for (int i = 0; i < termParts.length; i++) {
        String termPart = termParts[i];
        int termPartPos = termPos + term.indexOf(termPart);
        int termPartEndPos = termPartPos + termPart.length();

        Token newToken = new Token(termPart, termPartPos, termPartEndPos);
        newToken.setPositionIncrement(0); // in the same position

        tokens.add(newToken);
      }
    }
  }
  /**
   * Parses the file to extract all the words for indexing and some data characterizing the file.
   *
   * @param file contains the fullpath of the document to parse
   * @param indexerLanguage this will be used to tell the program which stemmer to be used.
   * @param stem if true then generate js files with words stemmed
   * @return a DitaFileInfo object filled with data describing the file
   */
  public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {
    // initialization
    fileDesc = new DocFileInfo(file);
    strbf = new StringBuffer("");

    // Fill strbf by parsing the file
    parseDocument(file);

    String str = cleanBuffer(strbf);
    str = str.replaceAll("\\s+", " "); // there are still redundant spaces in the middle
    //		System.out.println(file.toString()+" "+ str +"\n");
    // START OXYGEN PATCH
    //		String[] items = str.split("\\s");      //contains all the words in the array
    // END OXYGEN PATCH

    // Get the items one by one, run each through the stemmer, get the stem,
    // and then add them to tempSet.
    // TODO currently, stemming is supported for English, German, and French only. Add support
    // for other languages as well.

    // START OXYGEN PATCH
    wsList = new ArrayList<WordAndScoring>();
    // START OXYGEN PATCH, create the words and scoring list
    //        String[] tokenizedItems;
    // END OXYGEN PATCH
    if (indexerLanguage.equalsIgnoreCase("ja")
        || indexerLanguage.equalsIgnoreCase("zh")
        || indexerLanguage.equalsIgnoreCase("ko")) {
      LinkedList<String> tokens = new LinkedList<String>();
      try {
        // EXM-21501 Oxygen patch, replace the extra "@@@"s.
        str = str.replaceAll("@@@([^\\s]*)@@@", "");
        CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);
        Reader reader = new StringReader(str);
        TokenStream stream = analyzer.tokenStream("", reader);
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        OffsetAttribute offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);

        while (stream.incrementToken()) {
          String term = termAtt.term();
          tokens.add(term);
          WordAndScoring ws = new WordAndScoring(term, term, 1);
          boolean found = false;
          for (WordAndScoring aWsList : wsList) {
            // If the stem of the current word is already in list,
            // do not add the word in the list, just recompute scoring
            if (aWsList.getStem().equals(ws.getStem())) {
              found = true;
              int scoring = aWsList.getScoring();
              aWsList.setScoring(scoring + ws.getScoring());
              break;
            }
          }
          if (!found) {
            wsList.add(ws);
          }
        }
        // START OXYGEN PATCH
        // tokenizedItems = tokens.toArray(new String[tokens.size()]);
        // END OXYGEN PATCH

      } catch (IOException ex) {
        // START OXYGEN PATCH
        //                tokenizedItems = items;
        // END OXYGEN PATCH
        System.out.println("Error tokenizing content using CJK Analyzer. IOException");
        ex.printStackTrace();
      }
    } else {
      SnowballStemmer stemmer;
      if (indexerLanguage.equalsIgnoreCase("en")) {
        stemmer = new EnglishStemmer();
      } else if (indexerLanguage.equalsIgnoreCase("de")) {
        stemmer = new GermanStemmer();
      } else if (indexerLanguage.equalsIgnoreCase("fr")) {
        stemmer = new FrenchStemmer();
      } else {
        // Stemming is not yet supported for this language, so no stemmer will be used.
        stemmer = null;
      }
      // START OXYGEN PATCH
      wsList = new ArrayList<WordAndScoring>();
      StringTokenizer st = new StringTokenizer(str, " ");
      // Tokenize the string and populate the words and scoring list
      while (st.hasMoreTokens()) {
        String token = st.nextToken();
        WordAndScoring ws = getWordAndScoring(token, stemmer, stem);
        if (ws != null) {
          boolean found = false;
          for (WordAndScoring aWsList : wsList) {
            // If the stem of the current word is already in list,
            // do not add the word in the list, just recompute scoring
            if (aWsList.getStem().equals(ws.getStem())) {
              found = true;
              int scoring = aWsList.getScoring();
              aWsList.setScoring(scoring + ws.getScoring());
              break;
            }
          }
          if (!found) {
            wsList.add(ws);
          }
        }
      }
      //            if(stemmer != null)             //If a stemmer available
      //                tokenizedItems = stemmer.doStem(items.toArray(new String[0]));
      //            else                            //if no stemmer is available for the particular language
      //                tokenizedItems = items.toArray(new String[0]);
      // END OXYGEN PATCH

    }

    /* for(String stemmedItem: tokenizedItems){
        System.out.print(stemmedItem+"| ");
    }*/

    // START OXYGEN PATCH
    //		//items: remove the duplicated strings first
    //		HashSet <String> tempSet = new HashSet<String>();
    //      tempSet.addAll(Arrays.asList(tokenizedItems));
    //		Iterator it = tempSet.iterator();
    // Iterate over the words and scoring list
    Iterator<WordAndScoring> it = wsList.iterator();
    WordAndScoring s;
    while (it.hasNext()) {
      s = it.next();
      // Do not add results from 'toc.html'
      if (s != null && tempDico.containsKey(s.getStem())) {
        String temp = tempDico.get(s.getStem());
        temp =
            temp.concat(",")
                .concat(Integer.toString(i))
                // Concat also the scoring for the stem
                .concat("*")
                .concat(Integer.toString(s.getScoring()));
        // System.out.println("temp="+s+"="+temp);
        tempDico.put(s.getStem(), temp);
      } else if (s != null) {
        String temp = null;
        temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));
        tempDico.put(s.getStem(), temp);
      }
      // END OXYGEN PATCH
    }

    i++;
    return fileDesc;
  }
 private void applyToken(Token token) {
   termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
   posAtt.setPositionIncrement(token.getPositionIncrement());
   offsetAtt.setOffset(token.startOffset(), token.endOffset());
 }
  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int startOffsets[],
      int endOffsets[],
      String types[],
      int posIncrements[],
      Integer finalOffset)
      throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt =
        (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class);

    assertTrue("has no TermAttribute", ts.hasAttribute(TermAttribute.class));
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null) {
      assertTrue(
          "has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
    }

    ts.reset();
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce that the state is not preserved; also assign bogus values
      ts.clearAttributes();
      termAtt.setTermBuffer("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);

      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() above
      assertTrue("token " + i + " does not exist", ts.incrementToken());
      assertTrue(
          "clearAttributes() was not called correctly in TokenStream chain",
          checkClearAtt.getAndResetClearCalled());

      assertEquals("term " + i, output[i], termAtt.term());
      if (startOffsets != null)
        assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
      if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
      if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
      if (posIncrements != null)
        assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
    }
    assertFalse("end of stream", ts.incrementToken());
    ts.end();
    if (finalOffset != null)
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
    ts.close();
  }
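A hypothetical test call for the assertion helper above, assuming a Lucene 3.x WhitespaceAnalyzer producing two tokens from "foo bar":

TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", new StringReader("foo bar"));
assertTokenStreamContents(
    ts,
    new String[] {"foo", "bar"}, // expected terms
    new int[] {0, 4},            // expected start offsets
    new int[] {3, 7},            // expected end offsets
    null,                        // types not checked
    new int[] {1, 1},            // expected position increments
    7);                          // expected final offset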
Example #17
  @SuppressWarnings("unchecked")
  private void sub_search(String searchQuery) throws IOException, ParseException {
    searcher = new Searcher(indexDir);

    int numSearched = 0; // total number of documents found
    ArrayList<String> doc_list = new ArrayList<String>();

    TopDocs hits;
    Document doc;

    long startTime = System.currentTimeMillis();
    // All search results and related information are stored as JSON in ForSend_json and written to out.txt.
    // * search for the main query *
    JSONObject sub_query = new JSONObject();
    HashMap<String, Integer> map = new HashMap<String, Integer>();
    Analyzer analyzer = new KoreanAnalyzer();

    // hits = searcher.search(searchQuery);
    hits = searcher.content_search(searchQuery);
    numSearched += hits.totalHits;
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
      doc = searcher.getDocument(scoreDoc);
      // Return the path of the matched document. Additional information is fetched on the C# side???
      doc_list.add(doc.get(LuceneConstants.FILE_PATH));

      String filename = doc.get(LuceneConstants.FILE_PATH);
      int start = filename.lastIndexOf('\\');
      int fine = filename.lastIndexOf('.');
      filename = filename.substring(start + 1, fine);
      // System.out.println(filename);
      TokenStream stream = analyzer.tokenStream("map", new StringReader(filename));
      // OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
      @SuppressWarnings("deprecation")
      TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);

      while (stream.incrementToken()) {
        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        // System.out.println("term: " + term);
        if (map.containsKey(term)) {
          map.put(term, map.get(term) + 1);
        } else {
          map.put(term, 1);
        }
      }
    }

    // System.out.println(map);
    LinkedHashMap lmap = sortHashMapByValuesD(map);
    // System.out.println(lmap);
    // System.out.println(lmap.keySet().toArray()[lmap.size()-1]);

    ArrayList<String> term_list = new ArrayList<String>();
    for (int i = 0; i < lmap.size(); i++) {
      if (i == LuceneConstants.SUGGESTION_NUM) break;
      term_list.add(lmap.keySet().toArray()[lmap.size() - i - 1].toString());
    }
    System.out.println(term_list);

    sub_query.put("suggestion_keyword", term_list.toString());
    sub_query.put("result", doc_list.toString());
    sub_query.put("numSearched", hits.totalHits);
    ForSend_json.put("sub_query", sub_query);
    doc_list.clear();

    long endTime = System.currentTimeMillis();
    System.out.println(numSearched + " documents found. Time: " + (endTime - startTime));
  }