@Test
  public void simpleTest() throws IOException {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
          }
        };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
  }
Example 2
  /**
   * Recursively reads one character at a time, appending non-space characters to the term,
   * until {@code ngramSize} characters have been collected or the input is exhausted.
   */
  private boolean getNextToken(final int pos) throws IOException {
    assert pos >= 0;

    final boolean ret;

    if (pos == ngramSize) {
      ret = true;
    } else {
      final int ich = input.read();
      if (ich == -1) {
        termAtt.setEmpty();
        ret = false;
      } else {
        final char ch = (char) ich;
        if (ch == ' ') {
          ret = getNextToken();
        } else {
          termAtt.append(ch);
          ret = getNextToken(pos + 1);
        }
      }
    }

    return ret;
  }
Example 3
  @Override
  public boolean incrementToken() throws IOException {
    if (tokenIter == null || !tokenIter.hasNext()) {
      // there are no remaining tokens from the current sentence... are there more sentences?
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // a new sentence is available: process it.
        tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
        tokenIter = tokenBuffer.iterator();

        // it should not be possible to have a sentence with 0 words, check just in case.
        // returning EOS isn't the best either, but it's the behavior of the original code.
        if (!tokenIter.hasNext()) return false;
      } else {
        return false; // no more sentences, end of stream!
      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.append(nextWord.term);
    // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
  public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
      Tokenizer tokenizer =
          new JapaneseTokenizer(
              new StringReader(str),
              null,
              true,
              org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
      TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
      // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
      stream = new CJKWidthFilter(stream);
      // stream = new StopFilter(matchVersion, stream, stopwords);
      stream = new JapaneseKatakanaStemFilter(stream);
      // stream = new LowerCaseFilter(matchVersion, stream);

      OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

      stream.reset(); // required before the first call to incrementToken()
      while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        ret.add(startOffset, endOffset);
        // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
      }
      stream.end();
      stream.close();
    } catch (java.io.IOException e) {
      System.err.println(e);
    }
    return ret;
  }
Example 5
  /**
   * Adds term frequencies found by tokenizing text from the reader into the given term-frequency map
   *
   * @param r a source of text to be tokenized
   * @param termFreqMap a Map of terms and their frequencies
   * @param fieldName Used by analyzer for any special per-field analysis
   */
  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
      throws IOException {
    if (analyzer == null) {
      throw new UnsupportedOperationException(
          "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
      int tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        if (isNoiseWord(word)) {
          continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
          termFreqMap.put(word, new Int());
        } else {
          cnt.x++;
        }
      }
      ts.end();
    }
  }
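A quick illustration of how the map built above might be consumed: the Int counter exposes its count through the public field x (as the increment cnt.x++ suggests), so a hypothetical helper in the same class could dump the collected frequencies. This is a sketch for illustration only, not part of MoreLikeThis, and assumes java.util.HashMap, java.util.Map and java.io.StringReader are imported.

  // Hypothetical helper (illustration only): tokenize a string and print each term with its count.
  private void dumpTermFrequencies(String text, String fieldName) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    addTermFrequencies(new StringReader(text), termFreqMap, fieldName);
    for (Map.Entry<String, Int> entry : termFreqMap.entrySet()) {
      System.out.println(entry.getKey() + " -> " + entry.getValue().x);
    }
  }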
 /**
  * Sugar: analyzes the text with the analyzer and separates by {@link
  * SynonymMap#WORD_SEPARATOR}. reuse and its chars must not be null.
  */
 public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
   try (TokenStream ts = analyzer.tokenStream("", text)) {
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
     ts.reset();
     reuse.clear();
     while (ts.incrementToken()) {
       int length = termAtt.length();
       if (length == 0) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a zero-length token");
       }
       if (posIncAtt.getPositionIncrement() != 1) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a token with posinc != 1");
       }
       reuse.grow(reuse.length() + length + 1); /* current + word + separator */
       int end = reuse.length();
       if (reuse.length() > 0) {
         reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
         reuse.setLength(reuse.length() + 1);
       }
       System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
       reuse.setLength(reuse.length() + length);
     }
     ts.end();
   }
   if (reuse.length() == 0) {
     throw new IllegalArgumentException(
         "term: " + text + " was completely eliminated by analyzer");
   }
   return reuse.get();
 }
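The CharsRef returned above joins the analyzed tokens with SynonymMap.WORD_SEPARATOR, which is the form SynonymMap.Builder.add expects on both sides of a rule. A hedged usage sketch follows; the helper method, the builder argument and the rule text are illustrative assumptions, not taken from the original code.

  // Illustrative only: analyze both sides of a synonym rule and register them
  // with a SynonymMap.Builder passed in by the caller.
  void addRuleSketch(SynonymMap.Builder builder) throws IOException {
    CharsRef input = analyze("united states", new CharsRefBuilder());
    CharsRef output = analyze("usa", new CharsRefBuilder());
    builder.add(input, output, true); // true = keep the original tokens as well
  }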
 @Override
 public final boolean incrementToken() throws IOException {
   while (true) {
     if (curTermBuffer == null) {
       if (!input.incrementToken()) {
         return false;
       } else {
         curTermBuffer = termAtt.buffer().clone();
         curTermLength = termAtt.length();
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
       }
     }
      if (curGramSize <= maxGram) {
        // if the remaining input is too short, we can't generate any n-grams,
        // and if we have hit the end of our n-gram size range, quit
        if (!(curGramSize > curTermLength || curGramSize > maxGram)) {
         // grab gramSize chars from front or back
         int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
         int end = start + curGramSize;
         clearAttributes();
         offsetAtt.setOffset(tokStart + start, tokStart + end);
         termAtt.copyBuffer(curTermBuffer, start, curGramSize);
         curGramSize++;
         return true;
       }
     }
     curTermBuffer = null;
   }
 }
  /**
   * Tokenizes the input with the given analyzer, concatenating "word|weight" pairs and adding a
   * document to the index whenever the configured batch size is reached.
   */
  private static void cutWordsForSingle(Analyzer analyzer, String inputs, Random random) {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader(inputs));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();

      while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
          word += ts.getAttribute(CharTermAttribute.class).toString();
          word += "|" + (random.nextFloat() + 10) + " ";
          // concatenate pairs like: word|23.3 word1|43.4

          i++;
          // add to the index
          if (i >= throftdocnum) { // one document every 100 tokens
            i = 0;
            docnum++;
            // System.out.println(word);
            addDoc(docnum + patch, word);
            word = "";
          }
          j++;
        }
      }
      ts.end();
      ts.close();

    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  @Override
  public boolean incrementToken() throws IOException {

    if (tokenIter == null || !tokenIter.hasNext()) {
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
        tokenIter = tokenBuffer.iterator();
        if (!tokenIter.hasNext()) return false;

      } else {
        return false;
      }
    }

    clearAttributes();

    TokendWords nextWord = tokenIter.next();

    // fetch the buffer once instead of calling next() twice
    char[] nextBuffer = nextWord.next();
    termAtt.copyBuffer(nextBuffer, 0, nextBuffer.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
Example 10
  public static List<String> analyze(String content) {
    List<String> resultList = null;
    try {
      // create the analyzer
      resultList = new ArrayList<String>(1);
      resultList.add(content);
      IKAnalyzer analyzer = new IKAnalyzer(true);
      analyzer.setUseSmart(true);
      StringReader reader = new StringReader(content);
      // tokenize
      TokenStream tokenStream = analyzer.tokenStream("", reader);
      CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset(); // required before the first call to incrementToken()
      // iterate over the tokens
      while (tokenStream.incrementToken()) {
        if (!term.toString().isEmpty()) {
          resultList.add(term.toString());
        }
      }
      tokenStream.end();
      tokenStream.close();
      reader.close();

    } catch (IOException ex) {
      logger.error("Tokenization error", ex);
    }
    return resultList;
  }
Example 11
  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char buffer[] = termAtt.buffer();
      int length = termAtt.length();

      for (int i = 0; i < length; i++) {
        int ch = Character.codePointAt(buffer, i, length);
        // look for digits outside of basic latin
        if (ch > 0x7F && Character.isDigit(ch)) {
          // replace with equivalent basic latin digit
          buffer[i] = (char) ('0' + Character.getNumericValue(ch));
          // if the original was supplementary, shrink the string
          if (ch > 0xFFFF) {
            length = StemmerUtil.delete(buffer, i + 1, length);
            termAtt.setLength(length);
          }
        }
      }

      return true;
    } else {
      return false;
    }
  }
  /** Rigorous Test :-) */
  public void testBasicKoreanAnalyzer() throws IOException {
    // text to tokenize
    final String text = "그러면 조개가 쏘옥 올라온다";
    // String text = "※ 청년,창직,창업 인턴제";
    // String text = "저는 대학생이구요. 소프트웨어 관련학과 입니다. DB는 수업을 한번 들은 적이 있으며, 수학은 대학에서 통계학, 선형대수학, 이산수학,
    // 대학수학 등을 배웠지만... 자주 사용을 안하다보니 모두 까먹은 상태입니다.";

    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);

    stream.reset();

    int[] lengths = {3, 2, 2, 3};
    int i = 0;

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
      // System.out.println(termAtt.term() + ": " + termAtt.termLength() + " (" +
      // offsetAtt.startOffset() + ":" + offsetAtt.endOffset() + ")");
      Assert.assertEquals(lengths[i], termAtt.length());
      i++;
    }

    stream.end();
    stream.close();
  }
Example 13
  @Test
  public void testCase2() throws Exception {
    StringReader reader = new StringReader("고속도로");

    nouns.add(getToken("고속도로", 0, 4));
    nouns.add(getToken("고속도", 0, 3));
    nouns.add(getToken("고속", 0, 2));
    nouns.add(getToken("속도", 1, 3));
    nouns.add(getToken("고", 0, 1));

    Analyzer analyzer = new KoreanAnalyzer();
    TokenStream stream = analyzer.reusableTokenStream("dummy", reader);

    CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
      TestToken t =
          getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
      System.out.println("termAtt.term() : " + charTermAtt.toString());
      System.out.println("offSetAtt : " + offSetAtt.startOffset());
      System.out.println("offSetAtt : " + offSetAtt.endOffset());

      Assert.assertTrue(nouns.contains(t));
    }
  }
Example 14
  @Override
  public final boolean incrementToken() throws IOException {
    if (isMailto) {
      termAtt.setEmpty();
      // return the scheme + the mail part
      isMailto = false;
      posIncrAtt.setPositionIncrement(0);
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      return true;
    }

    if (input.incrementToken()) {
      final String type = typeAtt.type();
      if (type.equals(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI])
          && this.isMailtoScheme()) {
        this.updateBuffer();
        termBuffer.put(termAtt.buffer(), 0, termAtt.length());
        // return only the mail part
        posIncrAtt.setPositionIncrement(1);
        termAtt.copyBuffer(termBuffer.array(), 7, termBuffer.position() - 7);
      }
      return true;
    }
    return false;
  }
  /** {@inheritDoc} */
  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      char[] term = termAttribute.buffer();
      int termLength = termAttribute.length();

      if (termLength > 0 && term[termLength - 1] == '-') {
        // a hyphenated word
        // capture the state of the first token only
        if (savedState == null) {
          savedState = captureState();
        }
        hyphenated.append(term, 0, termLength - 1);
      } else if (savedState == null) {
        // not part of a hyphenated word.
        return true;
      } else {
        // the final portion of a hyphenated word
        hyphenated.append(term, 0, termLength);
        unhyphenate();
        return true;
      }
    }

    if (savedState != null) {
      // the final term ends with a hyphen
      // add back the hyphen, for backwards compatibility.
      hyphenated.append('-');
      unhyphenate();
      return true;
    }

    return false;
  }
Example 16
  private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
      Reader reader = new StringReader(suggestion);
      ts = this.suggestionAnalyzer.tokenStream("", reader);

      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        if (word != null && word.length() > 0) {
          return word;
        }
      }
    } catch (Exception ex) {
      if (this.field != null) {
        LOG.error(
            String.format(
                "Error executing analyzer for field: {0} in DiceSuggester on suggestion: {1}",
                this.field, suggestion),
            ex);
      } else if (this.fieldTypeName != null) {
        LOG.error(
            String.format(
                "Error executing analyzer for field type: {0} in DiceSuggester on suggestion: {1}",
                this.fieldTypeName, suggestion),
            ex);
      }
    } finally {
      if (ts != null) {
        IOUtils.closeWhileHandlingException(ts);
      }
    }
    return null;
  }
  @Override
  public final boolean incrementToken() throws IOException {
    if (!tokens.isEmpty()) {
      assert current != null;
      CompoundToken token = tokens.removeFirst();
      restoreState(current); // keep all other attributes untouched
      termAtt.setEmpty().append(token.txt);
      offsetAtt.setOffset(token.startOffset, token.endOffset);
      posIncAtt.setPositionIncrement(0);
      return true;
    }

    current = null; // not really needed, but for safety
    if (input.incrementToken()) {
      // Only words longer than minWordSize get processed
      if (termAtt.length() >= this.minWordSize) {
        decompose();
        // only capture the state if we really need it for producing new tokens
        if (!tokens.isEmpty()) {
          current = captureState();
        }
      }
      // return original token:
      return true;
    } else {
      return false;
    }
  }
  private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      result.append(new String(term.buffer(), 0, term.length())).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
      result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
      result.append(String.valueOf(extOffset.startOffset())).append(":");
      result.append(String.valueOf(extOffset.endOffset()));
      result.append(",");
    }
    tokenizer.end();
    return result.toString();
  }
  public List<Document> searchDocuments(String text) {
    List<Document> documents = new ArrayList<Document>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
        documents.add(doc);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return documents;
  }
Example 20
  public static void displayTokens(TokenStream stream) throws IOException {

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // required before the first call to incrementToken()
    while (stream.incrementToken()) {
      System.out.println("[" + term.toString() + "] ");
    }
    stream.end();
  }
  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    // clear all token attributes
    clearAttributes();
    skippedPositions = 0;

    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
      posIncrAtt.setPositionIncrement(skippedPositions + 1);

      // convert the Lexeme into attributes
      // set the token text
      termAtt.append(nextLexeme.getLexemeText());
      // set the token length
      termAtt.setLength(nextLexeme.getLength());
      // set the token offsets
      offsetAtt.setOffset(
          correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

      // record the end position of the last token
      endPosition = nextLexeme.getEndPosition();
      // record the token type
      typeAtt.setType(nextLexeme.getLexemeTypeString());
      // return true to signal that another token is available
      return true;
    }
    // return false to signal that no more tokens are available
    return false;
  }
  public Map<String, Double> search(String text) {
    Map<String, Double> similar = new HashMap<String, Double>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        similar.put(doc.get("id"), new Double(hit.score));
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return similar;
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (!terms.isEmpty()) {
      char[] buffer = terms.poll();
      termAttribute.setEmpty();
      termAttribute.copyBuffer(buffer, 0, buffer.length);
      posIncAttr.setPositionIncrement(1);
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    } else {
      final char term[] = termAttribute.buffer();
      final int length = termAttribute.length();

      int k = 0;
      for (; k < length; k++) {
        if (term[k] == tokenDelimiter) {
          break;
        }
      }

      LinkedList<CharBuffer> buffers = permuteTerms(term, 0, length);

      for (CharBuffer cb : buffers) {
        terms.add(cb.array());
      }

      // we return true and leave the original token unchanged
      return true;
    }
  }
  public void refineDocument(Document doc) {
    TokenStream tokenStream =
        new StandardTokenizer(Version.LUCENE_36, new StringReader(doc.getContent()));
    tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, stopWords);
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);

    List<String> words = new ArrayList<String>();
    Set<String> uniqueWords = new HashSet<String>();

    try {
      while (tokenStream.incrementToken()) {

        String word = charTermAttr.toString();
        //            	int wordVal = textToInt(charTermAttr.toString());
        words.add(word);
        uniqueWords.add(word);
        dictionary.add(word);

        if (sb.length() > 0) {
          sb.append(" ");
        }
        sb.append(charTermAttr.toString());
      }
    } catch (IOException e) {
      System.out.println(e.getMessage());
    }

    doc.setRefinedContent(sb.toString());
    doc.setWords(words);
    doc.setUniqueWords(uniqueWords);
  }
Example 25
 /**
  * Uses the solr.ASCIIFoldingFilter to convert a string to its ASCII equivalent. See solr
  * documentation for full details. <br/> When doing the conversion, this method mirrors GBIF's
  * registry-solr schema configuration for {@code <fieldType name="text_auto_ngram">}. For example,
  * it uses the KeywordTokenizer that treats the entire string as a single token, regardless of its
  * content. See the solr documentation for more details. <br/> This method is needed when checking
  * if the query string matches the dataset title. For example, if the query string is "straße", it
  * won't match the dataset title "Schulhof Gymnasium Hürth Bonnstrasse" unless "straße" gets
  * converted to its ASCII equivalent "strasse".
  *
  * @param q query string
  * @return query string converted to ASCII equivalent
  * @see org.gbif.portal.action.dataset.SearchAction#addMissingHighlighting(String, String)
  * @see org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
  * @see org.apache.lucene.analysis.core.KeywordTokenizer
  */
 protected static String foldToAscii(String q) {
   if (!Strings.isNullOrEmpty(q)) {
     ASCIIFoldingFilter filter = null;
     try {
       StringReader reader = new StringReader(q);
       TokenStream stream = new KeywordTokenizer(reader);
       filter = new ASCIIFoldingFilter(stream);
       CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
       filter.reset();
       filter.incrementToken();
       // converted q to ASCII equivalent and return it
       return termAtt.toString();
     } catch (IOException e) {
       // swallow
     } finally {
       if (filter != null) {
         try {
           filter.end();
           filter.close();
         } catch (IOException e) {
           // swallow
         }
       }
     }
   }
   return q;
 }
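Following the example in the Javadoc above, a call like the sketch below would be expected to fold a non-ASCII query to its ASCII form. The helper method is a hypothetical illustration mirroring the straße example from the Javadoc, not part of the original class.

  // Illustrative only: "straße" is expected to fold to "strasse",
  // so it can match a dataset title containing "Bonnstrasse".
  static void foldToAsciiSketch() {
    String folded = foldToAscii("straße");
    System.out.println(folded); // strasse
  }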
  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    skippedPositions = 0;

    while (true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerInterface.YYEOF) {
        return false;
      }

      if (scanner.yylength() <= maxTokenLength) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        scanner.getText(termAtt);
        final int start = scanner.yychar();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizer.ACRONYM_DEP) {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
          termAtt.setLength(termAtt.length() - 1); // remove extra '.'
        } else {
          typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
        }
        return true;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        skippedPositions++;
    }
  }
Example 27
  public void cleanText(String... inboundTexts) {
    try {
      final List<String> fields = Lists.newArrayList();
      for (String raw : inboundTexts) {
        //		        Tidy t = new Tidy();
        //		        t.setErrout(new PrintWriter(new ByteArrayOutputStream()));
        //		        StringWriter out = new StringWriter();
        //		        t.parse(new StringReader(raw), out);
        //		        String tidied = out.getBuffer().toString();
        //    		    logger.debug("{}",tidied);
        //		        AutoDetectParser p = new AutoDetectParser();
        //		        p.parse(new ByteArrayInputStream(raw.getBytes()),
        //		        		new TextContentHandler(new DefaultHandler()
        //		        {
        //		            @Override
        //		            public void characters(char[] ch, int start, int length) throws SAXException
        //		            {
        //		                CharBuffer buf = CharBuffer.wrap(ch, start, length);
        //		                String s = buf.toString();
        //		    		    logger.debug("{}",s);
        //		                fields.add(s);
        //		            }
        //		        }), new Metadata());
      }

      Analyzer analyzer = new StandardAnalyzer();
      //		    String joinedFields = Joiner.on(" ").join(fields).replaceAll("\\s+", " ");
      String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
      logger.debug("{}", joinedFields);
      StringReader in = new StringReader(joinedFields);
      TokenStream ts = analyzer.tokenStream("content", in);
      ts = new LowerCaseFilter(ts); // wrap first so the whole chain is reset together
      ts.reset();

      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      List<String> words = Lists.newArrayList();
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.buffer();
        int termLen = termAtt.length();
        String w = new String(termBuffer, 0, termLen);
        words.add(w);
      }
      ts.end();
      ts.close();
      analyzer.close();
      scrubbedWords = new ArrayList<String>();
      for (String word : words) {
        if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
          scrubbedWords.add(word);
        } else {
          logger.debug("Ignoring word: {}", word);
        }
      }
      //		    this.scrubbedWords = words;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
Example 28
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<TermInfo> infos = new ArrayList<TermInfo>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      TermInfo info = new TermInfo();
      info.setStart(startOffset);
      info.setEnd(endOffset);
      infos.add(info);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        String term = input.substring(termInfo.getStart(), termInfo.getEnd());
        terms.add(term);
      }
    }
    return terms.toArray(new String[] {});
  }
  public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings =
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    Settings indexSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

    IndexAnalyzers indexAnalyzers =
        new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry()
            .build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
      assertNotNull(custom_analyser);
      TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
      tokenStream.reset();
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      List<String> token = new ArrayList<>();
      while (tokenStream.incrementToken()) {
        token.add(charTermAttribute.toString());
      }
      assertEquals(token.toString(), 2, token.size());
      assertEquals("j2se", token.get(0));
      assertEquals("j2ee", token.get(1));
    }

    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
      assertNotNull(custom_analyser);
      TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
      tokenStream.reset();
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      List<String> token = new ArrayList<>();
      while (tokenStream.incrementToken()) {
        token.add(charTermAttribute.toString());
      }
      assertEquals(token.toString(), 6, token.size());
      assertEquals("j", token.get(0));
      assertEquals("2", token.get(1));
      assertEquals("se", token.get(2));
      assertEquals("j", token.get(3));
      assertEquals("2", token.get(4));
      assertEquals("ee", token.get(5));
    }
  }
 @Override
 public boolean incrementToken() throws IOException {
   while (input.incrementToken()) {
     String term = new String(termAttribute.buffer(), 0, termAttribute.length());
     if (StringUtils.isAlphanumeric(term)) {
       return true;
     }
   }
   return false;
 }