@Test
  public void simpleTest() throws IOException {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
            return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
          }
        };

    TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("a"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("bb"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ccc"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ddd"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("eee"));

    assertThat(test.incrementToken(), equalTo(false));
  }
  public void refineDocument(Document doc) {
    TokenStream tokenStream =
        new StandardTokenizer(Version.LUCENE_36, new StringReader(doc.getContent()));
    tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, stopWords);
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttr = tokenStream.getAttribute(CharTermAttribute.class);

    List<String> words = new ArrayList<String>();
    Set<String> uniqueWords = new HashSet<String>();

    try {
      while (tokenStream.incrementToken()) {

        String word = charTermAttr.toString();
        // int wordVal = textToInt(charTermAttr.toString());
        words.add(word);
        uniqueWords.add(word);
        dictionary.add(word);

        if (sb.length() > 0) {
          sb.append(" ");
        }
        sb.append(charTermAttr.toString());
      }
    } catch (IOException e) {
      System.out.println(e.getMessage());
    }

    doc.setRefinedContent(sb.toString());
    doc.setWords(words);
    doc.setUniqueWords(uniqueWords);
  }
Example #3
  @Test
  public void testCase2() throws Exception {
    StringReader reader = new StringReader("고속도로");

    nouns.add(getToken("고속도로", 0, 4));
    nouns.add(getToken("고속도", 0, 3));
    nouns.add(getToken("고속", 0, 2));
    nouns.add(getToken("속도", 1, 3));
    nouns.add(getToken("고", 0, 1));

    Analyzer analyzer = new KoreanAnalyzer();
    TokenStream stream = analyzer.reusableTokenStream("dummy", reader);

    CharTermAttribute charTermAtt = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
      TestToken t =
          getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
      System.out.println("termAtt.term() : " + charTermAtt.toString());
      System.out.println("offSetAtt : " + offSetAtt.startOffset());
      System.out.println("offSetAtt : " + offSetAtt.endOffset());

      Assert.assertTrue(nouns.contains(t));
    }
  }
  public static List<String> analyze(String content) {
    List<String> resultList = null;
    try {
      // create the analyzer
      resultList = new ArrayList<String>(1);
      resultList.add(content);
      IKAnalyzer analyzer = new IKAnalyzer(true);
      analyzer.setUseSmart(true);
      StringReader reader = new StringReader(content);
      // tokenize
      TokenStream tokenStream = analyzer.tokenStream("", reader);
      CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset(); // required before incrementToken() on Lucene 4+
      // iterate over the tokens
      while (tokenStream.incrementToken()) {
        if (!term.toString().isEmpty()) {
          resultList.add(term.toString());
        }
      }
      reader.close();

    } catch (IOException ex) {
      logger.error("分词出错", ex);
    }
    return resultList;
  }
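A minimal usage sketch for the analyze helper above (imports and the enclosing class are omitted). Note that, as written, the first element of the returned list is the original input string, followed by one entry per IK token:

  public static void main(String[] args) {
    // illustrative input only; IK Analyzer segments mixed Chinese/English text
    List<String> tokens = analyze("Lucene in Action");
    for (String token : tokens) {
      System.out.println(token); // first line is the original string, then one token per line
    }
  }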
  public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings =
        Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    Settings indexSettings =
        Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray(
                "index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

    IndexAnalyzers indexAnalyzers =
        new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry()
            .build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
      assertNotNull(custom_analyser);
      TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
      tokenStream.reset();
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      List<String> token = new ArrayList<>();
      while (tokenStream.incrementToken()) {
        token.add(charTermAttribute.toString());
      }
      assertEquals(token.toString(), 2, token.size());
      assertEquals("j2se", token.get(0));
      assertEquals("j2ee", token.get(1));
    }

    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
      assertNotNull(custom_analyser);
      TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
      tokenStream.reset();
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      List<String> token = new ArrayList<>();
      while (tokenStream.incrementToken()) {
        token.add(charTermAttribute.toString());
      }
      assertEquals(token.toString(), 6, token.size());
      assertEquals("j", token.get(0));
      assertEquals("2", token.get(1));
      assertEquals("se", token.get(2));
      assertEquals("j", token.get(3));
      assertEquals("2", token.get(4));
      assertEquals("ee", token.get(5));
    }
  }
Example #6
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      BytesRef pl = payload.getPayload();

      if (pl != null) {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + ":"
                + new String(pl.bytes)
                + "] ");

      } else {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + "] ");
      }
    }
    System.out.println();
  }
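A hypothetical driver for displayTokensWithFullDetails. The method above follows the Lucene 3.x attribute API (it never calls reset()), so the no-argument SimpleAnalyzer below assumes that era; on Lucene 4+ the stream would additionally need a reset() inside the method:

  public static void main(String[] args) throws IOException {
    // each token prints as [term:startOffset->endOffset:type], grouped by position
    displayTokensWithFullDetails(new SimpleAnalyzer(), "The quick brown fox");
  }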
 @Test
 public void test() throws IOException {
   TokenStream input = new WhitespaceTokenizer(new StringReader("abcde"));
   EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
   CharTermAttribute termAtt = tokenizer.getAttribute(CharTermAttribute.class);
   tokenizer.reset();
   Assert.assertTrue(tokenizer.incrementToken());
   Assert.assertEquals("a", termAtt.toString());
   Assert.assertTrue(tokenizer.incrementToken());
   Assert.assertEquals("ab", termAtt.toString());
   Assert.assertTrue(tokenizer.incrementToken());
   Assert.assertEquals("abc", termAtt.toString());
   Assert.assertFalse(tokenizer.incrementToken());
   tokenizer.close();
 }
  public boolean incrementToken() throws IOException {
    if (!morphQueue.isEmpty()) {
      restoreState(currentState);
      setAttributesFromQueue(false);
      return true;
    }

    while (input.incrementToken()) {
      final String type = typeAtt.type();
      if (KOREAN_TYPE.equals(type)) {
        try {
          analysisKorean(termAtt.toString());
        } catch (MorphException e) {
          throw new RuntimeException(e);
        }
      } else {
        return true; // pass anything else thru
      }

      if (!morphQueue.isEmpty()) {
        setAttributesFromQueue(true);
        return true;
      }
    }

    return false;
  }
  public Map<String, Double> search(String text) {
    Map<String, Double> similar = new HashMap<String, Double>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        similar.put(doc.get("id"), new Double(hit.score));
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return similar;
  }
Example #10
  /**
   * Adds term frequencies found by tokenizing text from the reader into termFreqMap.
   *
   * @param r a source of text to be tokenized
   * @param termFreqMap a Map of terms and their frequencies
   * @param fieldName Used by analyzer for any special per-field analysis
   */
  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
      throws IOException {
    if (analyzer == null) {
      throw new UnsupportedOperationException(
          "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
      int tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        if (isNoiseWord(word)) {
          continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
          termFreqMap.put(word, new Int());
        } else {
          cnt.x++;
        }
      }
      ts.end();
    }
  }
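The same counting pattern, sketched standalone with a plain HashMap and none of the MoreLikeThis internals (Int, maxNumTokensParsed, isNoiseWord), to show what the loop above accumulates:

  private static Map<String, Integer> countTerms(Analyzer analyzer, String fieldName, String text)
      throws IOException {
    Map<String, Integer> freqs = new HashMap<>();
    try (TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // one entry per distinct term, value = number of occurrences
        freqs.merge(termAtt.toString(), 1, Integer::sum);
      }
      ts.end();
    }
    return freqs;
  }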
Example #11
 @Override
 protected boolean accept() {
   // return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
   // System.out.format("%s=%s",termAtt.toString(),bloom.contains(termAtt.toString()));
   return bloom.contains(termAtt.toString());
   // return false;
 }
  public List<Document> searchDocuments(String text) {
    List<Document> documents = new ArrayList<Document>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
        documents.add(doc);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return documents;
  }
Example #13
  public static void displayTokens(TokenStream stream) throws IOException {

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println("[" + term.toString() + "] ");
    }
  }
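A small usage sketch for displayTokens; the analyzer and text are placeholders, and reset()/end()/close() are handled here because displayTokens itself only consumes the stream:

  public static void displayTokensDemo() throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(); // assumes a Lucene version with a no-arg constructor
    TokenStream stream = analyzer.tokenStream("contents", new StringReader("quick brown fox"));
    stream.reset(); // required before incrementToken() on Lucene 4+
    displayTokens(stream); // prints [quick] [brown] [fox], one per line
    stream.end();
    stream.close();
  }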
  @Override
  public boolean incrementToken() throws IOException {

    if (tokenIter == null || !tokenIter.hasNext()) {
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        tokenBuffer = wordSegmenter.getTokendWords(termAtt.toString());
        tokenIter = tokenBuffer.iterator();
        if (!tokenIter.hasNext()) return false;

      } else {
        return false;
      }
    }

    clearAttributes();

    TokendWords nextWord = tokenIter.next();

    termAtt.copyBuffer(nextWord.next(), 0, nextWord.next().length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
  public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
      Tokenizer tokenizer =
          new JapaneseTokenizer(
              new StringReader(str),
              null,
              true,
              org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
      TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
      // stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
      stream = new CJKWidthFilter(stream);
      // stream = new StopFilter(matchVersion, stream, stopwords);
      stream = new JapaneseKatakanaStemFilter(stream);
      // stream = new LowerCaseFilter(matchVersion, stream);

      OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

      while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        ret.add(startOffset, endOffset);
        // System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
      }
    } catch (java.io.IOException e) {
      System.err.println(e);
    }
    return ret;
  }
Example #16
  @Override
  public boolean incrementToken() throws IOException {
    if (tokenIter == null || !tokenIter.hasNext()) {
      // there are no remaining tokens from the current sentence... are there more sentences?
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // a new sentence is available: process it.
        tokenBuffer = splitIntoTokens(termAtt.toString(), offsetAtt.startOffset());
        tokenIter = tokenBuffer.iterator();

        // it should not be possible to have a sentence with 0 words, check just in case.
        // returning EOS isn't the best either, but it's the behavior of the original code.
        if (!tokenIter.hasNext()) return false;
      } else {
        return false; // no more sentences, end of stream!
      }
    }
    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one.
    SegToken nextWord = tokenIter.next();
    termAtt.append(nextWord.term);
    // termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
    if (hasIllegalOffsets) {
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      offsetAtt.setOffset(nextWord.start, nextWord.end);
    }
    typeAtt.setType("word");
    return true;
  }
Example #17
 /**
  * Uses the solr.ASCIIFoldingFilter to convert a string to its ASCII equivalent. See solr
  * documentation for full details. <br/> When doing the conversion, this method mirrors GBIF's
  * registry-solr schema configuration for <fieldType name="text_auto_ngram">. For example, it uses
  * the KeywordTokenizer that treats the entire string as a single token, regardless of its
  * content. See the solr documentation for more details. <br/> This method is needed when checking
  * if the query string matches the dataset title. For example, if the query string is "straße", it
  * won't match the dataset title "Schulhof Gymnasium Hürth Bonnstrasse" unless "straße" gets
  * converted to its ASCII equivalent "strasse".
  *
  * @param q query string
  * @return query string converted to ASCII equivalent
  * @see org.gbif.portal.action.dataset.SearchAction#addMissingHighlighting(String, String)
  * @see org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
  * @see org.apache.lucene.analysis.core.KeywordTokenizer
  */
 protected static String foldToAscii(String q) {
   if (!Strings.isNullOrEmpty(q)) {
     ASCIIFoldingFilter filter = null;
     try {
       StringReader reader = new StringReader(q);
       TokenStream stream = new KeywordTokenizer(reader);
       filter = new ASCIIFoldingFilter(stream);
       CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
       filter.reset();
       filter.incrementToken();
       // converted q to ASCII equivalent and return it
       return termAtt.toString();
     } catch (IOException e) {
       // swallow
     } finally {
       if (filter != null) {
         try {
           filter.end();
           filter.close();
         } catch (IOException e) {
           // swallow
         }
       }
     }
   }
   return q;
 }
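The folding behaviour described in the javadoc, restated as a small illustrative check (the test method itself is hypothetical):

 @Test
 public void foldToAsciiExample() {
   // ASCIIFoldingFilter maps "ß" to "ss", so "straße" can match "... Bonnstrasse"
   assertEquals("strasse", foldToAscii("straße"));
   // null or empty input is returned unchanged
   assertEquals("", foldToAscii(""));
 }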
Example #18
  private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
      Reader reader = new StringReader(suggestion);
      ts = this.suggestionAnalyzer.tokenStream("", reader);

      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        if (word != null && word.length() > 0) {
          return word;
        }
      }
    } catch (Exception ex) {
      if (this.field != null) {
        LOG.error(
            String.format(
                "Error executing analyzer for field: {0} in DiceSuggester on suggestion: {1}",
                this.field, suggestion),
            ex);
      } else if (this.fieldTypeName != null) {
        LOG.error(
            String.format(
                "Error executing analyzer for field type: {0} in DiceSuggester on suggestion: {1}",
                this.fieldTypeName, suggestion),
            ex);
      }
    } finally {
      if (ts != null) {
        IOUtils.closeWhileHandlingException(ts);
      }
    }
    return null;
  }
Example #19
  public void processInput() {

    ClassLoader classLoader = getClass().getClassLoader();
    File englishStopWords =
        new File(classLoader.getResource("DEFAULT_ENGLISH_STOP_WORDS").getFile());

    processedInput = new ArrayList<String>();
    lemmatizedInput = new ArrayList<String>();

    // tokenize and stop word removal operations

    initStopWordList(englishStopWords);

    CharArraySet stopwords = new CharArraySet(stopWordPool, true);
    StandardAnalyzer analyzer = new StandardAnalyzer(stopwords);
    TokenStream stream;
    try {
      stream = analyzer.tokenStream(null, new StringReader(input));
      CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        if (!processedInput.contains(cattr.toString())) processedInput.add(cattr.toString());
      }
      stream.end();
      stream.close();

      // System.out.println("In input processing " + "      "
      // + processedInput);
      setProcessedInput(processedInput);

      // for lemmatization, concatenate the input strings and send them to the
      // Stanford NLP processor

      for (int i = 0; i < processedInput.size(); i++) {
        lemmatizedInput.addAll(new StanfordLemmatizer().lemmatize(processedInput.get(i)));
      }

      // System.out.println("In input processing " + "      "
      // + lemmatizedInput);

      setLemmatizedInput(lemmatizedInput);

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
  public void testPerField() throws Exception {
    String text = "Qwerty";
    PerFieldAnalyzerWrapper analyzer =
        new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
    analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));

    TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);

    assertTrue(tokenStream.incrementToken());
    assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", termAtt.toString());

    tokenStream = analyzer.tokenStream("special", new StringReader(text));
    termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    assertTrue(tokenStream.incrementToken());
    assertEquals("SimpleAnalyzer lowercases", "qwerty", termAtt.toString());
  }
Example #21
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<TermInfo> infos = new ArrayList<TermInfo>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      TermInfo info = new TermInfo();
      info.setStart(startOffset);
      info.setEnd(endOffset);
      infos.add(info);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        String term = input.substring(termInfo.getStart(), termInfo.getEnd());
        terms.add(term);
      }
    }
    return terms.toArray(new String[] {});
  }
  /* (non-Javadoc)
   * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
   */
  @Override
  public Query getQuery(Element e) throws ParserException {
    String fieldsList = e.getAttribute("fieldNames"); // a comma-delimited list of fields
    String fields[] = defaultFieldNames;
    if ((fieldsList != null) && (fieldsList.trim().length() > 0)) {
      fields = fieldsList.trim().split(",");
      // trim the fieldnames
      for (int i = 0; i < fields.length; i++) {
        fields[i] = fields[i].trim();
      }
    }

    // Parse any "stopWords" attribute
    // TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
    // I use all analyzers/fields to generate multi-field compatible stop list
    String stopWords = e.getAttribute("stopWords");
    Set<String> stopWordsSet = null;
    if ((stopWords != null) && (fields != null)) {
      stopWordsSet = new HashSet<String>();
      for (String field : fields) {
        try (TokenStream ts = analyzer.tokenStream(field, stopWords)) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            stopWordsSet.add(termAtt.toString());
          }
          ts.end();
          ts.close();
        } catch (IOException ioe) {
          throw new ParserException(
              "IoException parsing stop words list in "
                  + getClass().getName()
                  + ":"
                  + ioe.getLocalizedMessage());
        }
      }
    }

    MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer, fields[0]);
    mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS));
    mlt.setMinTermFrequency(
        DOMUtils.getAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY));
    mlt.setPercentTermsToMatch(
        DOMUtils.getAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100);
    mlt.setStopWords(stopWordsSet);
    int minDocFreq = DOMUtils.getAttribute(e, "minDocFreq", -1);
    if (minDocFreq >= 0) {
      mlt.setMinDocFreq(minDocFreq);
    }

    mlt.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));

    return mlt;
  }
Example #23
 private List<String> filter(TokenFilter filter) throws IOException {
   List<String> tas = new ArrayList<>();
   CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
   filter.reset();
   while (filter.incrementToken()) {
     tas.add(termAtt.toString());
   }
   filter.end();
   filter.close();
   return tas;
 }
 @Override
 public boolean incrementToken() throws IOException {
   if (inPhrase) {
     inPhrase = false;
     clearAttributes();
     termAtt.setEmpty().append("phrase2");
     offsetAtt.setOffset(savedStart, savedEnd);
     return true;
   } else
     while (input.incrementToken()) {
       if (termAtt.toString().equals("phrase")) {
         inPhrase = true;
         savedStart = offsetAtt.startOffset();
         savedEnd = offsetAtt.endOffset();
         termAtt.setEmpty().append("phrase1");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else if (!termAtt.toString().equals("stop")) return true;
     }
   return false;
 }
 public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output)
     throws Exception {
   TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
   stream.reset();
   CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
   for (String expected : output) {
     Assert.assertTrue(stream.incrementToken());
     Assert.assertEquals(expected, termAttr.toString());
   }
   Assert.assertFalse(stream.incrementToken());
   stream.close();
 }
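A possible call site for the assertAnalyzesTo helper, with an illustrative analyzer and expected token list:

 @Test
 public void testWhitespaceSplitting() throws Exception {
   // assumes a Lucene version where WhitespaceAnalyzer has a no-arg constructor
   assertAnalyzesTo(new WhitespaceAnalyzer(), "quick brown fox",
       new String[] {"quick", "brown", "fox"});
 }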
 protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
   Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
   TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()));
   tokenStream.reset();
   CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
   StringBuilder sb = new StringBuilder();
   while (tokenStream.incrementToken()) {
     sb.append(termAtt.toString());
   }
   tokenStream.end();
   tokenStream.close();
   return new Term(term.field(), sb.toString());
 }
Example #27
 /**
  * Convert tokenStream object into a string.
  *
  * @param tokenStream object returned by Lucene tokenizer
  * @return String corresponding to the tokens output by tokenStream
  */
 protected static String streamToString(TokenStream tokenStream) {
   CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
   tokenStream.clearAttributes();
   StringBuilder tokenized = new StringBuilder();
   try {
     while (tokenStream.incrementToken()) {
       tokenized.append(termAtt.toString() + " ");
     }
   } catch (IOException e) {
     e.printStackTrace();
   }
   return tokenized.toString().trim();
 }
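A brief sketch of driving streamToString (analyzer and field name are placeholders); the caller resets the stream because the helper only consumes it:

 public static void streamToStringDemo() throws IOException {
   Analyzer analyzer = new StandardAnalyzer(); // assumes the no-arg StandardAnalyzer (Lucene 5+)
   TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello, Token Streams!"));
   stream.reset();
   System.out.println(streamToString(stream)); // prints "hello token streams"
   stream.end();
   stream.close();
 }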
 public static void assertSimpleTSOutput(TokenStream stream, String[] expected)
     throws IOException {
   stream.reset();
   CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
   assertThat(termAttr, notNullValue());
   int i = 0;
   while (stream.incrementToken()) {
     assertThat(expected.length, greaterThan(i));
     assertThat(
         "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
   }
   assertThat("not all tokens produced", i, equalTo(expected.length));
 }
Example #29
    /**
     * @param text the raw text to analyze
     * @return the analyzed tokens joined by single spaces
     * @throws IOException if reading from the token stream fails
     */
    private String tokenize(String text) throws IOException {
      TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
      StringBuilder stBld = new StringBuilder();

      CharTermAttribute termAttribute =
          (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
      while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
      }

      return stBld.toString();
    }
  public void analyze(String text) throws IOException {
    List<String> searchlst = new ArrayList<String>();

    proposalController.getProposalList().clear();
    String query = "";
    System.out.println("Analzying \"" + text + "\"");

    Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_31);
    System.out.println("\t" + analyzer.getClass().getName() + ":");
    System.out.print("\t\t");
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    while (true) {
      if (!stream.incrementToken()) break;

      AttributeSource token = stream.cloneAttributes();
      CharTermAttribute term = (CharTermAttribute) token.addAttribute(CharTermAttribute.class);
      System.out.print("[" + term.toString() + "] "); // 2
      searchlst.add(term.toString());
    }

    int i = 0;
    for (String param : searchlst) {

      if (i < searchlst.size() - 1) {
        query += param + " AND ";
      } else {
        query += param;
      }
      i++;
    }

    _log.info("Запрос для поиска:" + query);
    startSearch(query);
    System.out.println("\n");
  }