Example no. 1
  public void test() throws Exception {
    final CharArraySet cas = new CharArraySet(3, false);
    cas.add("jjp");
    cas.add("wlmwoknt");
    cas.add("tcgyreo");

    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();

    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
            TokenFilter f = new CommonGramsFilter(t, cas);
            return new TokenStreamComponents(t, f);
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            reader = new MockCharFilter(reader, 0);
            reader = new MappingCharFilter(map, reader);
            reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
            return reader;
          }
        };
    checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
    a.close();
  }
Example no. 2
 public void testWithKeywordAttribute() throws IOException {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
   set.add("yourselves");
   Tokenizer tokenizer =
       new MockTokenizer(new StringReader("yourselves yours"), MockTokenizer.WHITESPACE, false);
   TokenStream filter = new PorterStemFilter(new KeywordMarkerFilter(tokenizer, set));
   assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
 }
Example no. 3
  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("ساهدهات");
    MockTokenizer tokenStream =
        new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);

    ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
    assertTokenStreamContents(filter, new String[] {"ساهدهات"});
  }
Example no. 4
 public void testWithKeywordAttribute() throws IOException {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
   set.add("fischen");
   GermanStemFilter filter =
       new GermanStemFilter(
           new KeywordMarkerFilter(
               new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Fischen Trinken")),
               set));
   assertTokenStreamContents(filter, new String[] {"fischen", "trink"});
 }
 public void testWithKeywordAttribute() throws IOException {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
   set.add("hole");
   CzechStemFilter filter =
       new CzechStemFilter(
           new KeywordMarkerFilter(
               new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false),
               set));
   assertTokenStreamContents(filter, new String[] {"hole", "desk"});
 }
 private CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase)
     throws IOException {
   List<String> files = splitFileNames(wordFiles);
   CharArraySet words = null;
   if (files.size() > 0) {
     // default stopwords list has 35 or so words, but maybe don't make it that
     // big to start
     words = new CharArraySet(files.size() * 10, ignoreCase);
     for (String file : files) {
       List<String> wlist = getLines(loader, file.trim());
       words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
     }
   }
   return words;
 }
Example no. 7
 private static CharArraySet resolveNamedWords(
     Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
   if (namedWords == null) {
     return new CharArraySet(words, ignoreCase);
   }
   CharArraySet setWords = new CharArraySet(words.size(), ignoreCase);
   for (String word : words) {
     if (namedWords.containsKey(word)) {
       setWords.addAll(namedWords.get(word));
     } else {
       setWords.add(word);
     }
   }
   return setWords;
 }
 /**
  * Creates a new KeywordMarkerFilter that marks the current token as a keyword, via the {@link
  * KeywordAttribute}, if the token's term buffer is contained in the given set.
  *
  * @param in the TokenStream to filter
  * @param keywordSet the keyword set used to look up the current term buffer
  */
 public KeywordMarkerFilter(final TokenStream in, final Set<?> keywordSet) {
   this(
       in,
       keywordSet instanceof CharArraySet
           ? (CharArraySet) keywordSet
           : CharArraySet.copy(Version.LUCENE_31, keywordSet));
 }
  private CharArraySet remove(CharArraySet fromSet, char[] charArray) {
    // System.out.println( "remove from: " + new String( charArray ));
    CharArraySet newSet = new CharArraySet(5, false);
    Iterator<Object> phraseIt = currentSetToCheck.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();

      // if (!equals( phrase, charArray) && (startsWith( charArray, phrase ) || endsWith( charArray,
      // phrase))) {
      if (!equals(phrase, charArray) && startsWith(phrase, charArray)
          || endsWith(charArray, phrase)) {
        newSet.add(phrase);
      } else {
        // System.out.println( "removing " + new String( phrase ));
      }
    }

    return newSet;
  }
 /**
  * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
  * the text in the provided {@link Reader}.
  *
  * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
  *     StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter}, {@link
  *     StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided, and {@link
  *     SnowballFilter}
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
   if (matchVersion.onOrAfter(Version.LUCENE_31)) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
     return new TokenStreamComponents(source, result);
   } else {
     final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new StopFilter(matchVersion, result, stopwords);
     if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
     result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
     return new TokenStreamComponents(source, result);
   }
 }
 /**
  * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
  * the text in the provided {@link Reader}.
  *
  * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
  *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter},
  *     {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is provided and
  *     {@link BulgarianStemFilter}.
  */
 @Override
 public TokenStreamComponents createComponents(String fieldName, Reader reader) {
   final Tokenizer source = new StandardTokenizer(matchVersion, reader);
   TokenStream result = new StandardFilter(matchVersion, source);
   result = new LowerCaseFilter(matchVersion, result);
   result = new StopFilter(matchVersion, result, stopwords);
   if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
   result = new BulgarianStemFilter(result);
   return new TokenStreamComponents(source, result);
 }
Example no. 12
 /**
  * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
  * the text in the provided {@link Reader}.
  *
  * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
  *     StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter}, {@link
  *     StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, and
  *     {@link SnowballFilter}
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName) {
   final Tokenizer source = new StandardTokenizer();
   TokenStream result = new StandardFilter(source);
   result = new LowerCaseFilter(result);
   result = new StopFilter(result, stopwords);
   if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
   result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
   return new TokenStreamComponents(source, result);
 }
 @Override
 public final boolean incrementToken() throws IOException {
   if (input.incrementToken()) {
     if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) {
       keywordAttr.setKeyword(true);
     }
     return true;
   } else {
     return false;
   }
 }
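The filter above only sets the keyword flag; it is a downstream stemmer that honors it. Below is a minimal illustrative sketch of that consuming side. The class name SkipKeywordsStemFilter and the in-place stem(...) step are assumptions added here for illustration; real Lucene stem filters such as PorterStemFilter apply the same check on KeywordAttribute before rewriting the term buffer.
// Illustrative sketch only: a token filter that skips stemming for tokens
// marked as keywords by an upstream KeywordMarkerFilter.
public final class SkipKeywordsStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  public SkipKeywordsStemFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAtt.isKeyword()) {
      // a real stemmer would rewrite termAtt's buffer in place here, e.g.
      // termAtt.setLength(stem(termAtt.buffer(), termAtt.length()));
    }
    return true;
  }
}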
 /**
  * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
  * the text in the provided {@link Reader}.
  *
  * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
  *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link
  *     IrishLowerCaseFilter}, {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem
  *     exclusion set is provided and {@link SnowballFilter}.
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName) {
   final Tokenizer source = new StandardTokenizer();
   TokenStream result = new StandardFilter(source);
   result = new StopFilter(result, HYPHENATIONS);
   result = new ElisionFilter(result, DEFAULT_ARTICLES);
   result = new IrishLowerCaseFilter(result);
   result = new StopFilter(result, stopwords);
   if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
   result = new SnowballFilter(result, new IrishStemmer());
   return new TokenStreamComponents(source, result);
 }
 @Override
 public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
   TokenStream stream = new StandardFilter(matchVersion, tokenizer);
   if (caseInsensitive) stream = new LowerCaseFilter(matchVersion, stream);
   if (useStopWords)
     stream = new StopFilter(matchVersion, stream, SpanishAnalyzer.getDefaultStopSet());
   if (useStem) {
     if (!stemExclusionSet.isEmpty())
       stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
     stream = new SpanishLightStemFilter(stream);
   }
   return stream;
 }
  private CharArrayMap<CharArraySet> convertPhraseSet(CharArraySet phraseSet) {
    CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<>(100, false);
    Iterator<Object> phraseIt = phraseSet.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();

      Log.debug("'" + new String(phrase) + "'");

      char[] firstTerm = getFirstTerm(phrase);
      Log.debug("'" + new String(firstTerm) + "'");

      CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
      if (itsPhrases == null) {
        itsPhrases = new CharArraySet(5, false);
        phraseMap.put(new String(firstTerm), itsPhrases);
      }

      itsPhrases.add(phrase);
    }

    return phraseMap;
  }
 /**
  * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all
  * the text in the provided {@link Reader}.
  *
  * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
  *     {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link LowerCaseFilter},
  *     {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and
  *     {@link SpanishLightStemFilter}.
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName) {
   final Tokenizer source;
   if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
     source = new StandardTokenizer();
   } else {
     source = new StandardTokenizer40();
   }
   TokenStream result = new StandardFilter(source);
   result = new LowerCaseFilter(result);
   result = new StopFilter(result, stopwords);
   if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
   result = new SpanishLightStemFilter(result);
   return new TokenStreamComponents(source, result);
 }
 /**
  * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
  * the text in the provided {@link Reader}.
  *
  *     @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
  *     StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
  *     {@link StopFilter}, {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a
  *     stem exclusion set is provided and {@link ArabicStemFilter}.
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName) {
   final Tokenizer source = new StandardTokenizer();
   TokenStream result = new LowerCaseFilter(source);
   if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
     result = new DecimalDigitFilter(result);
   }
   // the order here is important: the stopword list is not normalized!
   result = new StopFilter(result, stopwords);
   // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
   result = new ArabicNormalizationFilter(result);
   if (!stemExclusionSet.isEmpty()) {
     result = new SetKeywordMarkerFilter(result, stemExclusionSet);
   }
   return new TokenStreamComponents(source, new ArabicStemFilter(result));
 }
 /**
  * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
  * the text in the provided {@link Reader}.
  *
  * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
  *     StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
  *     {@link HindiNormalizationFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
  *     provided, {@link HindiStemFilter}, and Hindi Stop words
  */
 @Override
 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
   final Tokenizer source;
   if (matchVersion.onOrAfter(Version.LUCENE_36)) {
     source = new StandardTokenizer(matchVersion, reader);
   } else {
     source = new IndicTokenizer(matchVersion, reader);
   }
   TokenStream result = new LowerCaseFilter(matchVersion, source);
   if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet);
   result = new IndicNormalizationFilter(result);
   result = new HindiNormalizationFilter(result);
   result = new StopFilter(matchVersion, result, stopwords);
   result = new HindiStemFilter(result);
   return new TokenStreamComponents(source, result);
 }
  @Inject
  public WordDelimiterTokenFilterFactory(
      Index index,
      @IndexSettings Settings indexSettings,
      Environment env,
      @Assisted String name,
      @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
      this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
      this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If set, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If set, causes "j2se" to be three tokens: "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words", version);
    this.protoWords =
        protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
    this.flags = flags;
  }
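A minimal sketch of how a factory like this typically hands the accumulated flags, type table, and protected words to the Lucene filter. The create(...) override and the Lucene 4.x WordDelimiterFilter constructor shape are assumptions here and should be checked against the Lucene version actually in use.
  // Sketch only: constructor arguments follow the Lucene 4.x WordDelimiterFilter shape.
  @Override
  public TokenStream create(TokenStream tokenStream) {
    return new WordDelimiterFilter(tokenStream, charTypeTable, flags, protoWords);
  }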
Example no. 21
  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StreamLemmasFilter src =
        new StreamLemmasFilter(reader, hebMorphLemmatizer, commonWords, lemmaFilter);
    src.setAlwaysSaveMarkedOriginal(alwaysSaveMarkedOriginal);
    src.setSuffixForExactMatch(suffixForExactMatch);

    TokenStream tok = new SynonymFilter(src, acronymMergingMap, false);
    if (commonWords != null && commonWords.size() > 0)
      tok = new CommonGramsFilter(matchVersion, tok, commonWords, false);
    tok = new SuffixKeywordFilter(tok, '$');
    return new TokenStreamComponents(src, tok) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        super.setReader(reader);
      }
    };
  }
  private static class DefaultSetHolder {
    /** @deprecated (3.1) remove this for Lucene 5.0 */
    @Deprecated
    static final CharArraySet DEFAULT_STOP_SET_30 =
        CharArraySet.unmodifiableSet(
            new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(RUSSIAN_STOP_WORDS_30), false));

    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET =
            WordlistLoader.getSnowballWordSet(
                IOUtils.getDecodingReader(
                    SnowballFilter.class, DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8),
                Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }
 private char[] getFirst(CharArraySet charSet) {
   if (charSet.isEmpty()) return null;
   Iterator<Object> phraseIt = charSet.iterator();
   return (char[]) phraseIt.next();
 }
 /**
  * Builds an analyzer with the given stop words and a stem exclusion set. If a stem exclusion set
  * is provided this analyzer will add a {@link KeywordMarkerFilter} before {@link
  * BulgarianStemFilter}.
  */
 public BulgarianAnalyzer(
     Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
   super(matchVersion, stopwords);
   this.stemExclusionSet =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }
 @Override
 public boolean matches(char s[], int len) {
   return super.matches(s, len) && !exceptions.contains(s, 0, len);
 }
 /**
  * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
  * this analyzer will add a {@link SetKeywordMarkerFilter} before stemming.
  *
  * @param stopwords a stopword set
  * @param stemExclusionSet a set of terms not to be stemmed
  */
 public SpanishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
   super(stopwords);
   this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
 }
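To illustrate the stem-exclusion behaviour described in the constructor javadoc, here is a minimal consumption sketch. The method name, field name, and sample words are assumptions for illustration; the excluded word reaches the output unstemmed because SetKeywordMarkerFilter flags it before SpanishLightStemFilter runs.
 // Minimal sketch: analyze two words, one of which is in the stem exclusion set.
 public void printSpanishTokens() throws IOException {
   CharArraySet exclusions = new CharArraySet(Arrays.asList("chicas"), true);
   Analyzer analyzer = new SpanishAnalyzer(SpanishAnalyzer.getDefaultStopSet(), exclusions);
   try (TokenStream ts = analyzer.tokenStream("field", "chicas chicos")) {
     CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
     ts.reset();
     while (ts.incrementToken()) {
       // "chicas" comes through unchanged (keyword-marked); "chicos" is light-stemmed
       System.out.println(term.toString());
     }
     ts.end();
   }
   analyzer.close();
 }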
Example no. 27
 /**
  * Builds an analyzer with the given stop words and a set of words to be excluded from the {@link
  * CzechStemFilter}.
  *
  * @param matchVersion Lucene version to match
  * @param stopwords a stopword set
  * @param stemExclusionTable a stemming exclusion set
  */
 public CzechAnalyzer(
     Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) {
   super(matchVersion, stopwords);
   this.stemExclusionTable =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
 }
Example no. 28
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (!hasSavedState) {
        // process a new input word
        if (!input.incrementToken()) {
          return false;
        }

        int termLength = termAttribute.length();
        char[] termBuffer = termAttribute.buffer();

        accumPosInc += posIncAttribute.getPositionIncrement();

        iterator.setText(termBuffer, termLength);
        iterator.next();

        // word with no delimiters, or a protected word: just return it
        if ((iterator.current == 0 && iterator.end == termLength)
            || (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          first = false;
          return true;
        }

        // word consisting only of delimiters
        if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
          // if the posInc is 1, simply ignore it in the accumulation
          // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous
          // logic!
          if (posIncAttribute.getPositionIncrement() == 1 && !first) {
            accumPosInc--;
          }
          continue;
        }

        saveState();

        hasOutputToken = false;
        hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
        lastConcatCount = 0;

        if (has(PRESERVE_ORIGINAL)) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          first = false;
          return true;
        }
      }

      // at the end of the string, output any concatenations
      if (iterator.end == WordDelimiterIterator.DONE) {
        if (!concat.isEmpty()) {
          if (flushConcatenation(concat)) {
            buffer();
            continue;
          }
        }

        if (!concatAll.isEmpty()) {
          // only if we haven't output this same combo above!
          if (concatAll.subwordCount > lastConcatCount) {
            concatAll.writeAndClear();
            buffer();
            continue;
          }
          concatAll.clear();
        }

        if (bufferedPos < bufferedLen) {
          if (bufferedPos == 0) {
            sorter.sort(0, bufferedLen);
          }
          clearAttributes();
          restoreState(buffered[bufferedPos++]);
          if (first && posIncAttribute.getPositionIncrement() == 0) {
            // can easily happen with strange combinations (e.g. not outputting numbers, but
            // concat-all)
            posIncAttribute.setPositionIncrement(1);
          }
          first = false;
          return true;
        }

        // no saved concatenations, on to the next input word
        bufferedPos = bufferedLen = 0;
        hasSavedState = false;
        continue;
      }

      // word surrounded by delimiters: always output
      if (iterator.isSingleWord()) {
        generatePart(true);
        iterator.next();
        first = false;
        return true;
      }

      int wordType = iterator.type();

      // do we already have queued up incompatible concatenations?
      if (!concat.isEmpty() && (concat.type & wordType) == 0) {
        if (flushConcatenation(concat)) {
          hasOutputToken = false;
          buffer();
          continue;
        }
        hasOutputToken = false;
      }

      // add subwords depending upon options
      if (shouldConcatenate(wordType)) {
        if (concat.isEmpty()) {
          concat.type = wordType;
        }
        concatenate(concat);
      }

      // add all subwords (catenateAll)
      if (has(CATENATE_ALL)) {
        concatenate(concatAll);
      }

      // if we should output the word or number part
      if (shouldGenerateParts(wordType)) {
        generatePart(false);
        buffer();
      }

      iterator.next();
    }
  }
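A hedged, test-style sketch of what the loop above produces for the two inputs named in the earlier flag comments ("PowerShot" and "wi-fi"). The WordDelimiterFilter(TokenStream, int, CharArraySet) constructor shape is assumed from Lucene 4.x; the expected tokens simply restate the behaviour those flag comments describe.
  public void testSplitAndCatenate() throws IOException {
    // GENERATE_WORD_PARTS + SPLIT_ON_CASE_CHANGE: "PowerShot" => "Power", "Shot"
    Tokenizer t1 = new MockTokenizer(new StringReader("PowerShot"), MockTokenizer.WHITESPACE, false);
    TokenStream f1 = new WordDelimiterFilter(
        t1, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE, null);
    assertTokenStreamContents(f1, new String[] {"Power", "Shot"});

    // CATENATE_WORDS alone: "wi-fi" => "wifi"
    Tokenizer t2 = new MockTokenizer(new StringReader("wi-fi"), MockTokenizer.WHITESPACE, false);
    TokenStream f2 = new WordDelimiterFilter(t2, WordDelimiterFilter.CATENATE_WORDS, null);
    assertTokenStreamContents(f2, new String[] {"wifi"});
  }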
  @Override
  public boolean incrementToken() throws IOException {
    if (!emitSingleTokens && unusedTokens.size() > 0) {
      Log.debug("emitting unused phrases");
      // emit these until the queue is empty before emitting any new stuff
      Token aToken = unusedTokens.remove(0);
      emit(aToken);
      return true;
    }

    if (lastToken != null) {
      emit(lastToken);
      lastToken = null;
      return true;
    }

    char[] nextToken = nextToken();
    // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
    if (nextToken == null) {
      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        char[] phrase = getFirst(currentSetToCheck);
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (phrase != null && endsWith(lastTok, phrase)) {
          currentSetToCheck = remove(currentSetToCheck, phrase);
          emit(phrase);
          return true;
        }
      } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
        if (lastEmitted != null
            && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }

      if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
        char[] lastTok = getCurrentBuffer(new char[0]);
        if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
          emit(lastTok);
          currentPhrase.setLength(0);
          return true;
        } else if (!emitSingleTokens) {
          discardCharTokens(currentPhrase, unusedTokens);
          currentSetToCheck = null;
          currentPhrase.setLength(0);
          if (unusedTokens.size() > 0) {
            Token aToken = unusedTokens.remove(0);
            Log.debug("emitting putback token");
            emit(aToken);
            return true;
          }
        }
      }
      return false;
    }

    // if emitSingleToken, set lastToken = nextToken
    if (emitSingleTokens) {
      lastToken = nextToken;
    }

    if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
      Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentSetTocheck
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
        return incrementToken();
      } else {
        emit(nextToken);
        // clear lastToken
        lastToken = null;
        return true;
      }
    } else {
      // add token to the current string buffer.
      char[] currentBuffer = getCurrentBuffer(nextToken);

      if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
        // if its the only one valid, emit it
        // if there is a longer one, wait to see if it will be matched
        // if the longer one breaks on the next token, emit this one...
        // emit the current phrase
        currentSetToCheck = remove(currentSetToCheck, currentBuffer);

        if (currentSetToCheck.size() == 0) {
          emit(currentBuffer);
          lastValid = null;
          --positionIncr;
        } else {
          if (emitSingleTokens) {
            lastToken = currentBuffer;
            return true;
          }
          lastValid = currentBuffer;
        }

        if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
          // get the phrase set for this token, add it to currentPhrasesTocheck
          currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
          if (currentPhrase == null) currentPhrase = new StringBuffer();
          else currentPhrase.setLength(0);
          currentPhrase.append(nextToken);
        }

        return (lastValid != null) ? incrementToken() : true;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // get the phrase set for this token, add it to currentPhrasesTocheck
        // System.out.println( "starting new phrase with " + new String( nextToken ) );
        // does this add all of the set? if not need iterator loop
        CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
        Iterator<Object> phraseIt = newSet.iterator();
        while (phraseIt != null && phraseIt.hasNext()) {
          char[] phrase = (char[]) phraseIt.next();
          currentSetToCheck.add(phrase);
        }
      }

      // for each phrase in currentSetToCheck -
      // if there is a phrase prefix match, get the next token recursively
      Iterator<Object> phraseIt = currentSetToCheck.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();

        if (startsWith(phrase, currentBuffer)) {
          return incrementToken();
        }
      }

      if (lastValid != null) {
        emit(lastValid);
        lastValid = null;
        return true;
      }

      if (!emitSingleTokens) {
        // current phrase didn't match fully: put the tokens back
        // into the unusedTokens list
        discardCharTokens(currentPhrase, unusedTokens);
        currentPhrase.setLength(0);
        currentSetToCheck = null;

        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
      currentSetToCheck = null;

      Log.debug("returning at end.");
      return incrementToken();
    }
  }
Example no. 30
 /**
  * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
  * this analyzer will add a {@link KeywordMarkerFilter} before stemming.
  *
  * @param matchVersion lucene compatibility version
  * @param stopwords a stopword set
  * @param stemExclusionSet a set of terms not to be stemmed
  */
 public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
   super(matchVersion, stopwords);
   this.stemExclusionSet =
       CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
 }