Example #1
 static {
   final List<String> stopWords =
       Arrays.asList(
           "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is",
           "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there",
           "these", "they", "this", "to", "was", "will", "with");
   final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, stopWords.size(), false);
   stopSet.addAll(stopWords);
   ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
 }
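
A note on the construction above: the final false argument to the CharArraySet constructor disables case-insensitive matching, so lookups against the resulting set are case-sensitive (typically acceptable because the analyzer lower-cases tokens before the stop set is consulted). Below is a minimal lookup sketch; the demo class name is invented, and the imports assume the Lucene 3.x package layout used throughout these examples.

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

public class StopSetLookupDemo {
  public static void main(String[] args) {
    // Case-sensitive set, mirroring the ignoreCase=false construction above.
    CharArraySet stopSet =
        new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("the", "and"), false);
    System.out.println(stopSet.contains("the"));               // true: exact match
    System.out.println(stopSet.contains("The"));               // false: ignoreCase is false
    System.out.println(stopSet.contains("the".toCharArray())); // true: char[] lookup, no String needed
  }
}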
Example #2
  public void testUnmodifiableSet() {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(TEST_STOP_WORDS));
    set.add(Integer.valueOf(1));
    final int size = set.size();
    set = CharArraySet.unmodifiableSet(set);
    assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
    for (String stopword : TEST_STOP_WORDS) {
      assertTrue(set.contains(stopword));
    }
    assertTrue(set.contains(Integer.valueOf(1)));
    assertTrue(set.contains("1"));
    assertTrue(set.contains(new char[] {'1'}));

    try {
      CharArraySet.unmodifiableSet(null);
      fail("can not make null unmodifiable");
    } catch (NullPointerException e) {
      // expected
    }
  }
Example #3
  public void testNonZeroOffset() {
    String[] words = {"Hello", "World", "this", "is", "a", "test"};
    char[] findme = "xthisy".toCharArray();
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(words));
    assertTrue(set.contains(findme, 1, 4));
    assertTrue(set.contains(new String(findme, 1, 4)));

    // test unmodifiable
    set = CharArraySet.unmodifiableSet(set);
    assertTrue(set.contains(findme, 1, 4));
    assertTrue(set.contains(new String(findme, 1, 4)));
  }
Example #4
 public void testObjectContains() {
   CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
   Integer val = Integer.valueOf(1);
   set.add(val);
   assertTrue(set.contains(val));
   assertTrue(set.contains(new Integer(1))); // another integer
   assertTrue(set.contains("1"));
   assertTrue(set.contains(new char[] {'1'}));
   // test unmodifiable
   set = CharArraySet.unmodifiableSet(set);
   assertTrue(set.contains(val));
   assertTrue(set.contains(new Integer(1))); // another integer
   assertTrue(set.contains("1"));
   assertTrue(set.contains(new char[] {'1'}));
 }
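
Examples #2 and #4 both pass because CharArraySet stores a non-CharSequence object under its toString() representation: once Integer.valueOf(1) has been added, the set effectively contains the single character '1', so the String "1" and the char[] {'1'} match as well. The new Integer(1) lookups are deliberate: they prove that containment works for an equal but distinct instance, which is why they are not replaced with the cached Integer.valueOf(1).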
Example #5
  public void testModifyOnUnmodifiable() {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
    set.addAll(Arrays.asList(TEST_STOP_WORDS));
    final int size = set.size();
    set = CharArraySet.unmodifiableSet(set);
    assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
    String NOT_IN_SET = "SirGallahad";
    assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));

    try {
      set.add(NOT_IN_SET.toCharArray());
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.add(NOT_IN_SET);
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.add(new StringBuilder(NOT_IN_SET));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.clear();
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }
    try {
      set.add((Object) NOT_IN_SET);
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    // This test was changed in 3.1, as a contains() call on the given Collection using the
    // "correct" iterator's current key (now a char[]) on a Set<String> would not hit any element
    // of the CharArraySet and therefore never call remove() on the iterator.
    try {
      set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), true));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(NOT_IN_SET), true));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertEquals("Size of unmodifiable set has changed", size, set.size());
    }

    try {
      set.addAll(Arrays.asList(NOT_IN_SET));
      fail("Modified unmodifiable set");
    } catch (UnsupportedOperationException e) {
      // expected
      assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    }

    for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
      assertTrue(set.contains(TEST_STOP_WORDS[i]));
    }
  }
Example #6
/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link LowerCaseFilter} and {@link
 * StopFilter}, using a list of English stop words.
 *
 * <p><a name="version"/>
 *
 * <p>You must specify the required {@link Version} compatibility when creating StandardAnalyzer:
 *
 * <ul>
 *   <li>As of 3.4, Hiragana and Han characters are no longer wrongly split from their combining
 *       characters. If you use a previous version number, you get the exact broken behavior for
 *       backwards compatibility.
 *   <li>As of 3.1, StandardTokenizer implements Unicode text segmentation, and StopFilter correctly
 *       handles Unicode 4.0 supplementary characters in stopwords. {@link ClassicTokenizer} and
 *       {@link ClassicAnalyzer} are the pre-3.1 implementations of StandardTokenizer and
 *       StandardAnalyzer.
 *   <li>As of 2.9, StopFilter preserves position increments
 *   <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see <a
 *       href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
 * </ul>
 */
public final class PhaidraAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with HOST type. See <a
   * href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>.
   */
  private final boolean replaceInvalidAcronym;

  /**
   * An unmodifiable set containing some common English words that are usually not useful for
   * searching.
   */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopWords stop words
   */
  public PhaidraAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_33);
  }

  /**
   * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
   *
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   */
  public PhaidraAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /**
   * Builds an analyzer with the stop words from the given file.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords File to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(
        matchVersion,
        WordlistLoader.getWordSet(
            IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
  }

  /**
   * Builds an analyzer with the stop words from the given reader.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match; see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
  }

  /**
   * Set maximum allowed token length. If a token is seen that exceeds this length then it is
   * discarded. This setting only takes effect the next time tokenStream or reusableTokenStream is
   * called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  private static final CharArraySet DEFAULT_ARTICLES =
      CharArraySet.unmodifiableSet(
          new CharArraySet(
              Version.LUCENE_33,
              Arrays.asList(
                  "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "gl", "agl",
                  "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"),
              true));

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    // unipd adding ElisionFilter for apostrophes
    tok = new ElisionFilter(matchVersion, tok, DEFAULT_ARTICLES);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    // rasta: adding ASCIIFoldingFilter to enable search for accent
    tok = new ASCIIFoldingFilter(tok);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(PhaidraAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}
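
For reference, here is a minimal sketch of consuming PhaidraAnalyzer through the standard Lucene 3.x TokenStream API; the demo class name, field name, and sample input are invented for illustration. The Italian articles and the accented characters should be normalized away by the ElisionFilter and ASCIIFoldingFilter in the chain above.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class PhaidraAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    PhaidraAnalyzer analyzer = new PhaidraAnalyzer(Version.LUCENE_33);
    // "L'" and "dell'" exercise the ElisionFilter; the accents exercise ASCIIFoldingFilter.
    TokenStream ts = analyzer.tokenStream("title", new StringReader("L'Università dell'Arte"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // expected: "universita", then "arte"
    }
    ts.end();
    ts.close();
  }
}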