static {
   final List<String> stopWords =
       Arrays.asList(
           "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is",
           "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there",
           "these", "they", "this", "to", "was", "will", "with");
   final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, stopWords.size(), false);
   stopSet.addAll(stopWords);
   ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
 }
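  /**
   * Illustrative sketch (not part of the original source): CharArraySet matches by
   * character content, so String, StringBuilder, and char[] lookups all resolve to the
   * same entry; since the set above was built with ignoreCase == false, case matters.
   */
  static void demoStopSetLookup() {
    assert ENGLISH_STOP_WORDS_SET.contains("the");
    assert ENGLISH_STOP_WORDS_SET.contains(new StringBuilder("the"));
    assert ENGLISH_STOP_WORDS_SET.contains("the".toCharArray());
    assert !ENGLISH_STOP_WORDS_SET.contains("The"); // the set above is case-sensitive
  }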
  /** Test the static #copy() function with a JDK {@link Set} as a source */
  public void testCopyJDKSet() {
    Set<String> set = new HashSet<>();

    List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
    List<String> stopwordsUpper = new ArrayList<>();
    for (String string : stopwords) {
      stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
    }
    set.addAll(Arrays.asList(TEST_STOP_WORDS));

    CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, set);

     assertEquals(set.size(), copy.size());

    assertTrue(copy.containsAll(stopwords));
    for (String string : stopwordsUpper) {
      assertFalse(copy.contains(string));
    }

    List<String> newWords = new ArrayList<>();
    for (String string : stopwords) {
      newWords.add(string + "_1");
    }
    copy.addAll(newWords);

    assertTrue(copy.containsAll(stopwords));
    assertTrue(copy.containsAll(newWords));
    // new added terms are not in the source set
    for (String string : newWords) {
      assertFalse(set.contains(string));
    }
  }
  /**
   * @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is no longer
   *     needed.
   */
  @Deprecated
   public void testSingleHighSurrogateBWCompat() {
    String missing = "Term %s is missing in the set";
     String falsePos = "Term %s is in the set but shouldn't be";
    String[] upperArr =
        new String[] {"ABC\uD800", "ABC\uD800EfG", "\uD800EfG", "\uD800\ud801\udc1cB"};

    String[] lowerArr =
        new String[] {"abc\uD800", "abc\uD800efg", "\uD800efg", "\uD800\ud801\udc44b"};
    CharArraySet set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), true);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
       if (i == lowerArr.length - 1) {
         assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
       } else {
         assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
       }
    }
    set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), false);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
      assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
    }
  }
  public void testToString() {
    CharArraySet set = CharArraySet.copy(TEST_VERSION_CURRENT, Collections.singleton("test"));
    assertEquals("[test]", set.toString());
    set.add("test2");
    assertTrue(set.toString().contains(", "));

    set = CharArraySet.copy(Version.LUCENE_3_0, Collections.singleton("test"));
    assertEquals("[test]", set.toString());
    set.add("test2");
    assertTrue(set.toString().contains(", "));
  }
 public void testClear() {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
   set.addAll(Arrays.asList(TEST_STOP_WORDS));
   assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
   set.clear();
   assertEquals("not empty", 0, set.size());
   for (int i = 0; i < TEST_STOP_WORDS.length; i++) assertFalse(set.contains(TEST_STOP_WORDS[i]));
   set.addAll(Arrays.asList(TEST_STOP_WORDS));
   assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
   for (int i = 0; i < TEST_STOP_WORDS.length; i++) assertTrue(set.contains(TEST_STOP_WORDS[i]));
 }
 /**
   * Returns a {@link CharArraySet} from wordFiles, which can be a comma-separated list of
  * filenames
  */
 protected final CharArraySet getWordSet(
     ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException {
   List<String> files = splitFileNames(wordFiles);
   CharArraySet words = null;
   if (files.size() > 0) {
      // the default stopwords list has about 35 words; start with a modest capacity
      // and let the set grow as needed
     words = new CharArraySet(files.size() * 10, ignoreCase);
     for (String file : files) {
       List<String> wlist = getLines(loader, file.trim());
       words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
     }
   }
   return words;
 }
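  /**
   * Hypothetical usage sketch (file names invented): a factory's inform() method would
   * typically pass its configured word-file attribute through getWordSet() and fall back
   * to an empty set when no files are configured.
   */
  public void informSketch(ResourceLoader loader) throws IOException {
    CharArraySet words = getWordSet(loader, "stopwords.txt, extra-stopwords.txt", true);
    if (words == null) {
      words = new CharArraySet(16, true); // nothing configured; start empty
    }
    // ... hand the set to the filter being constructed ...
  }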
 /**
  * @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is no longer
  *     needed.
  */
 @Deprecated
 public void testSupplementaryCharsBWCompat() {
   String missing = "Term %s is missing in the set";
    String falsePos = "Term %s is in the set but shouldn't be";
   // for reference see
   // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
   String[] upperArr =
       new String[] {"Abc\ud801\udc1c", "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
   String[] lowerArr =
       new String[] {"abc\ud801\udc44", "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
   CharArraySet set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), true);
   for (String upper : upperArr) {
     set.add(upper);
   }
   for (int i = 0; i < upperArr.length; i++) {
     assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
     assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
   }
   set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), false);
   for (String upper : upperArr) {
     set.add(upper);
   }
   for (int i = 0; i < upperArr.length; i++) {
     assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
     assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
   }
 }
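  /**
   * Background sketch (not from the original tests): each supplementary character above is a
   * single code point written as a UTF-16 surrogate pair, e.g. U+1041C (a Deseret capital
   * letter) lower-cases to U+10444, which is why the upper and lower pairs differ.
   */
  void surrogatePairSketch() {
    char[] pair = Character.toChars(0x1041C); // one supplementary code point -> two chars
    assert pair.length == 2;
    assert pair[0] == '\ud801' && pair[1] == '\udc1c';
    assert Character.toLowerCase(0x1041C) == 0x10444; // code-point-aware lower-casing
  }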
 /** Test for NPE */
 public void testContainsWithNull() {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
   try {
     set.contains((char[]) null, 0, 10);
     fail("null value must raise NPE");
    } catch (NullPointerException e) {
      // expected
    }
   try {
     set.contains((CharSequence) null);
     fail("null value must raise NPE");
    } catch (NullPointerException e) {
      // expected
    }
   try {
     set.contains((Object) null);
     fail("null value must raise NPE");
    } catch (NullPointerException e) {
      // expected
    }
 }
  @Override
  public void init(Map<String, String> args) {
    super.init(args);

    String k = args.get(KEEP);
    if (k != null) {
      StringTokenizer st = new StringTokenizer(k);
      boolean ignoreCase = false;
      String ignoreStr = args.get(KEEP_IGNORE_CASE);
      if ("true".equalsIgnoreCase(ignoreStr)) {
        ignoreCase = true;
      }
      keep = new CharArraySet(10, ignoreCase);
      while (st.hasMoreTokens()) {
        k = st.nextToken().trim();
        keep.add(k.toCharArray());
      }
    }

    k = args.get(OK_PREFIX);
    if (k != null) {
      okPrefix = new ArrayList<char[]>();
      StringTokenizer st = new StringTokenizer(k);
      while (st.hasMoreTokens()) {
        okPrefix.add(st.nextToken().trim().toCharArray());
      }
    }

    k = args.get(MIN_WORD_LENGTH);
    if (k != null) {
      minWordLength = Integer.valueOf(k);
    }

    k = args.get(MAX_WORD_COUNT);
    if (k != null) {
      maxWordCount = Integer.valueOf(k);
    }

    k = args.get(MAX_TOKEN_LENGTH);
    if (k != null) {
      maxTokenLength = Integer.valueOf(k);
    }

    k = args.get(ONLY_FIRST_WORD);
    if (k != null) {
      onlyFirstWord = Boolean.valueOf(k);
    }

    k = args.get(FORCE_FIRST_LETTER);
    if (k != null) {
      forceFirstLetter = Boolean.valueOf(k);
    }
  }
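   /**
    * Illustrative sketch (argument values invented): init() above expects plain string
    * values, e.g. a whitespace-separated keep list and numeric limits. Assumes
    * java.util.HashMap is available.
    */
   void initSketch() {
     Map<String, String> args = new HashMap<>();
     args.put(KEEP, "and the it");         // tokens to leave exactly as they are
     args.put(KEEP_IGNORE_CASE, "true");   // match the keep list case-insensitively
     args.put(OK_PREFIX, "McK O'");        // prefixes that exempt a word from processing
     args.put(MIN_WORD_LENGTH, "3");       // shorter words are left untouched
     args.put(FORCE_FIRST_LETTER, "true"); // capitalize the first word even if kept
     init(args);
   }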
  public void testNonZeroOffset() {
    String[] words = {"Hello", "World", "this", "is", "a", "test"};
    char[] findme = "xthisy".toCharArray();
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(words));
    assertTrue(set.contains(findme, 1, 4));
    assertTrue(set.contains(new String(findme, 1, 4)));

    // test unmodifiable
    set = CharArraySet.unmodifiableSet(set);
    assertTrue(set.contains(findme, 1, 4));
    assertTrue(set.contains(new String(findme, 1, 4)));
  }
  public void processWord(char[] buffer, int offset, int length, int wordCount) {
    if (length < 1) {
      return;
    }
    if (onlyFirstWord && wordCount > 0) {
      for (int i = 0; i < length; i++) {
        buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
      }
      return;
    }

    if (keep != null && keep.contains(buffer, offset, length)) {
      if (wordCount == 0 && forceFirstLetter) {
        buffer[offset] = Character.toUpperCase(buffer[offset]);
      }
      return;
    }
    if (length < minWordLength) {
      return;
    }
    for (char[] prefix : okPrefix) {
       if (length >= prefix.length) { // don't bother checking if the buffer is shorter than the prefix
        boolean match = true;
        for (int i = 0; i < prefix.length; i++) {
          if (prefix[i] != buffer[offset + i]) {
            match = false;
            break;
          }
        }
         if (match) {
          return;
        }
      }
    }

    // We know it has at least one character
    buffer[offset] = Character.toUpperCase(buffer[offset]);

    for (int i = 1; i < length; i++) {
      buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
    }
  }
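   /**
    * Illustrative sketch: processWord() mutates the buffer in place, upper-casing the first
    * character and lower-casing the rest. Assumes a default configuration where
    * onlyFirstWord is false, the keep set is empty, okPrefix is an empty list, and
    * minWordLength is at most 5.
    */
   void processWordSketch() {
     char[] buffer = "hELLO wORLD".toCharArray();
     processWord(buffer, 0, 5, 0); // first word  -> "Hello"
     processWord(buffer, 6, 5, 1); // second word -> "World"
     assert new String(buffer).equals("Hello World");
   }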
 public void testObjectContains() {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
   Integer val = Integer.valueOf(1);
   set.add(val);
   assertTrue(set.contains(val));
   assertTrue(set.contains(new Integer(1))); // another integer
   assertTrue(set.contains("1"));
   assertTrue(set.contains(new char[] {'1'}));
   // test unmodifiable
   set = CharArraySet.unmodifiableSet(set);
   assertTrue(set.contains(val));
   assertTrue(set.contains(new Integer(1))); // another integer
   assertTrue(set.contains("1"));
   assertTrue(set.contains(new char[] {'1'}));
 }
 /**
  * Tests a special case of {@link CharArraySet#copy(Version, Set)} where the set to copy is the
  * {@link CharArraySet#EMPTY_SET}
  */
 public void testCopyEmptySet() {
   assertSame(
       CharArraySet.EMPTY_SET, CharArraySet.copy(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET));
 }
 public void testRehash() throws Exception {
   CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 0, true);
   for (int i = 0; i < TEST_STOP_WORDS.length; i++) cas.add(TEST_STOP_WORDS[i]);
   assertEquals(TEST_STOP_WORDS.length, cas.size());
   for (int i = 0; i < TEST_STOP_WORDS.length; i++) assertTrue(cas.contains(TEST_STOP_WORDS[i]));
 }
  public void testUnmodifiableSet() {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(TEST_STOP_WORDS));
    set.add(Integer.valueOf(1));
    final int size = set.size();
    set = CharArraySet.unmodifiableSet(set);
    assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
    for (String stopword : TEST_STOP_WORDS) {
      assertTrue(set.contains(stopword));
    }
    assertTrue(set.contains(Integer.valueOf(1)));
    assertTrue(set.contains("1"));
    assertTrue(set.contains(new char[] {'1'}));

    try {
      CharArraySet.unmodifiableSet(null);
      fail("can not make null unmodifiable");
    } catch (NullPointerException e) {
      // expected
    }
  }
/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link LowerCaseFilter} and {@link
 * StopFilter}, using a list of English stop words.
 *
 * <p><a name="version"/>
 *
 * <p>You must specify the required {@link Version} compatibility when creating StandardAnalyzer:
 *
 * <ul>
 *   <li>As of 3.4, Hiragana and Han characters are no longer wrongly split from their combining
 *       characters. If you use a previous version number, you get the exact broken behavior for
 *       backwards compatibility.
 *   <li>As of 3.1, StandardTokenizer implements Unicode text segmentation, and StopFilter correctly
 *       handles Unicode 4.0 supplementary characters in stopwords. {@link ClassicTokenizer} and
 *       {@link ClassicAnalyzer} are the pre-3.1 implementations of StandardTokenizer and
 *       StandardAnalyzer.
 *   <li>As of 2.9, StopFilter preserves position increments
 *   <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see <a
 *       href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public final class PhaidraAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type. See <a
   * href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>.
   */
  private final boolean replaceInvalidAcronym;

  /**
   * An unmodifiable set containing some common English words that are usually not useful for
   * searching.
   */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /**
   * Builds an analyzer with the given stop words.
   *
    * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopWords stop words
   */
  public PhaidraAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_33);
  }

  /**
   * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
   *
    * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   */
  public PhaidraAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /**
   * Builds an analyzer with the stop words from the given file.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
    * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords File to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(
        matchVersion,
        WordlistLoader.getWordSet(
            IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
  }

  /**
   * Builds an analyzer with the stop words from the given reader.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
    * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
  }

  /**
   * Set maximum allowed token length. If a token is seen that exceeds this length then it is
   * discarded. This setting only takes effect the next time tokenStream or reusableTokenStream is
   * called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  private static final CharArraySet DEFAULT_ARTICLES =
      CharArraySet.unmodifiableSet(
          new CharArraySet(
              Version.LUCENE_33,
              Arrays.asList(
                  "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "gl", "agl",
                  "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"),
              true));

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    // unipd adding ElisionFilter for apostrophes
    tok = new ElisionFilter(matchVersion, tok, DEFAULT_ARTICLES);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    // rasta: adding ASCIIFoldingFilter to enable search for accent
    tok = new ASCIIFoldingFilter(tok);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(PhaidraAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}
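// Hypothetical usage sketch (field name and sample text invented; assumes a Lucene 3.x
// classpath with imports such as java.io.StringReader and
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute):
class PhaidraAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    PhaidraAnalyzer analyzer = new PhaidraAnalyzer(Version.LUCENE_33);
    TokenStream ts = analyzer.tokenStream("title", new StringReader("L'Università di Padova"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // elision, lower-casing, stop filtering, and ASCII folding have all been applied,
      // so "L'Università" should come out as "universita"
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}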
  public void testModifyOnUnmodifiable() {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(TEST_STOP_WORDS));
    final int size = set.size();
    set = CharArraySet.unmodifiableSet(set);
    assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
    String NOT_IN_SET = "SirGallahad";
    assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));

    try {
      set.add(NOT_IN_SET.toCharArray());
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.add(NOT_IN_SET);
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.add(new StringBuilder(NOT_IN_SET));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.clear();
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }
    try {
      set.add((Object) NOT_IN_SET);
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // This test was changed in 3.1: a contains() call on the given Collection, using the
    // "correct" iterator's current key (now a char[]) on a Set<String>, would not hit any
    // element of the CharArraySet and would therefore never call remove() on the iterator.
    try {
      set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), true));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(NOT_IN_SET), true));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.addAll(Arrays.asList(NOT_IN_SET));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    }

    for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
      assertTrue(set.contains(TEST_STOP_WORDS[i]));
    }
  }
  /** Test the static #copy() function with a CharArraySet as a source */
  public void testCopyCharArraySet() {
     CharArraySet setIgnoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);

    List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
    List<String> stopwordsUpper = new ArrayList<>();
    for (String string : stopwords) {
      stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
    }
     setIgnoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
     setIgnoreCase.add(Integer.valueOf(1));
    setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
    setCaseSensitive.add(Integer.valueOf(1));

     CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, setIgnoreCase);
    CharArraySet copyCaseSens = CharArraySet.copy(TEST_VERSION_CURRENT, setCaseSensitive);

     assertEquals(setIgnoreCase.size(), copy.size());
     assertEquals(setCaseSensitive.size(), copyCaseSens.size());

    assertTrue(copy.containsAll(stopwords));
    assertTrue(copy.containsAll(stopwordsUpper));
    assertTrue(copyCaseSens.containsAll(stopwords));
    for (String string : stopwordsUpper) {
      assertFalse(copyCaseSens.contains(string));
    }
    // test adding terms to the copy
    List<String> newWords = new ArrayList<>();
    for (String string : stopwords) {
      newWords.add(string + "_1");
    }
    copy.addAll(newWords);

    assertTrue(copy.containsAll(stopwords));
    assertTrue(copy.containsAll(stopwordsUpper));
    assertTrue(copy.containsAll(newWords));
    // new added terms are not in the source set
    for (String string : newWords) {
      assertFalse(setIgnoreCase.contains(string));
      assertFalse(setCaseSensitive.contains(string));
    }
  }