static {
  final List<String> stopWords =
      Arrays.asList(
          "a", "an", "and", "are", "as", "at", "be", "but", "by",
          "for", "if", "in", "into", "is", "it",
          "no", "not", "of", "on", "or", "such",
          "that", "the", "their", "then", "there", "these",
          "they", "this", "to", "was", "will", "with");
  final CharArraySet stopSet =
      new CharArraySet(Version.LUCENE_CURRENT, stopWords.size(), false);
  stopSet.addAll(stopWords);
  ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
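/*
 * Usage sketch (illustrative, not part of the original source): the
 * unmodifiable set built above is typically handed to a StopFilter, which
 * drops matching tokens from the stream. The helper method name and the
 * Version argument here are assumptions.
 */
static TokenStream removeEnglishStopWords(Version matchVersion, TokenStream in) {
  // tokens contained in ENGLISH_STOP_WORDS_SET are removed; for the input
  // "this is a test" only the token "test" survives
  return new StopFilter(matchVersion, in, ENGLISH_STOP_WORDS_SET);
}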
/** Tests the static {@link CharArraySet#copy(Version, Set)} method with a JDK {@link Set} as the source. */
public void testCopyJDKSet() {
  Set<String> set = new HashSet<>();
  List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
  List<String> stopwordsUpper = new ArrayList<>();
  for (String string : stopwords) {
    stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
  }
  set.addAll(stopwords);
  CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, set);

  assertEquals(set.size(), copy.size());
  assertTrue(copy.containsAll(stopwords));
  for (String string : stopwordsUpper) {
    assertFalse(copy.contains(string));
  }

  List<String> newWords = new ArrayList<>();
  for (String string : stopwords) {
    newWords.add(string + "_1");
  }
  copy.addAll(newWords);
  assertTrue(copy.containsAll(stopwords));
  assertTrue(copy.containsAll(newWords));
  // newly added terms must not leak into the source set
  for (String string : newWords) {
    assertFalse(set.contains(string));
  }
}
/**
 * @deprecated (3.1) remove this test when Lucene 3.0 "broken unicode 4" support is no longer
 *     needed.
 */
@Deprecated
public void testSingleHighSurrogateBWCompat() {
  String missing = "Term %s is missing in the set";
  String falsePos = "Term %s is in the set but shouldn't be";
  String[] upperArr =
      new String[] {"ABC\uD800", "ABC\uD800EfG", "\uD800EfG", "\uD800\ud801\udc1cB"};
  String[] lowerArr =
      new String[] {"abc\uD800", "abc\uD800efg", "\uD800efg", "\uD800\ud801\udc44b"};
  CharArraySet set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), true);
  for (String upper : upperArr) {
    set.add(upper);
  }
  for (int i = 0; i < upperArr.length; i++) {
    assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
    // the last entry contains a valid surrogate pair whose case folding needs full
    // Unicode 4 support, which the 3.0 compatibility mode lacks, so its lower-case
    // variant must not match
    if (i == lowerArr.length - 1) {
      assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
    } else {
      assertTrue(String.format(Locale.ROOT, missing, lowerArr[i]), set.contains(lowerArr[i]));
    }
  }

  set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), false);
  for (String upper : upperArr) {
    set.add(upper);
  }
  for (int i = 0; i < upperArr.length; i++) {
    assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
    assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
  }
}
public void testToString() {
  CharArraySet set = CharArraySet.copy(TEST_VERSION_CURRENT, Collections.singleton("test"));
  assertEquals("[test]", set.toString());
  set.add("test2");
  assertTrue(set.toString().contains(", "));

  set = CharArraySet.copy(Version.LUCENE_3_0, Collections.singleton("test"));
  assertEquals("[test]", set.toString());
  set.add("test2");
  assertTrue(set.toString().contains(", "));
}
public void testClear() {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  set.addAll(Arrays.asList(TEST_STOP_WORDS));
  assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
  set.clear();
  assertEquals("not empty", 0, set.size());
  for (String stopWord : TEST_STOP_WORDS) {
    assertFalse(set.contains(stopWord));
  }
  set.addAll(Arrays.asList(TEST_STOP_WORDS));
  assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
  for (String stopWord : TEST_STOP_WORDS) {
    assertTrue(set.contains(stopWord));
  }
}
/**
 * Returns a {@link CharArraySet} built from wordFiles, which may be a comma-separated list of
 * file names.
 */
protected final CharArraySet getWordSet(
    ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException {
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // the default stopword list has around 35 entries; start with a modest
    // capacity and let the set grow as needed
    words = new CharArraySet(files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
    }
  }
  return words;
}
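/*
 * Illustrative caller (not in the original source): a ResourceLoaderAware
 * factory would typically invoke getWordSet() once the loader becomes
 * available. The "words" parameter key and the stopWords field used here
 * are hypothetical.
 */
public void inform(ResourceLoader loader) throws IOException {
  String wordFiles = args.get("words"); // e.g. "stopwords.txt,extra-stopwords.txt"
  if (wordFiles != null) {
    // load and merge all listed files, folding case
    stopWords = getWordSet(loader, wordFiles, true);
  }
}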
/**
 * @deprecated (3.1) remove this test when Lucene 3.0 "broken unicode 4" support is no longer
 *     needed.
 */
@Deprecated
public void testSupplementaryCharsBWCompat() {
  String missing = "Term %s is missing in the set";
  String falsePos = "Term %s is in the set but shouldn't be";
  // for reference see
  // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
  String[] upperArr =
      new String[] {"Abc\ud801\udc1c", "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
  String[] lowerArr =
      new String[] {"abc\ud801\udc44", "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
  CharArraySet set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), true);
  for (String upper : upperArr) {
    set.add(upper);
  }
  for (int i = 0; i < upperArr.length; i++) {
    assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
    assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
  }

  set = new CharArraySet(Version.LUCENE_3_0, Arrays.asList(TEST_STOP_WORDS), false);
  for (String upper : upperArr) {
    set.add(upper);
  }
  for (int i = 0; i < upperArr.length; i++) {
    assertTrue(String.format(Locale.ROOT, missing, upperArr[i]), set.contains(upperArr[i]));
    assertFalse(String.format(Locale.ROOT, falsePos, lowerArr[i]), set.contains(lowerArr[i]));
  }
}
/** Tests that contains() raises a NullPointerException for null arguments. */
public void testContainsWithNull() {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  try {
    set.contains((char[]) null, 0, 10);
    fail("null value must raise NPE");
  } catch (NullPointerException e) {
    // expected
  }
  try {
    set.contains((CharSequence) null);
    fail("null value must raise NPE");
  } catch (NullPointerException e) {
    // expected
  }
  try {
    set.contains((Object) null);
    fail("null value must raise NPE");
  } catch (NullPointerException e) {
    // expected
  }
}
@Override
public void init(Map<String, String> args) {
  super.init(args);
  String k = args.get(KEEP);
  if (k != null) {
    StringTokenizer st = new StringTokenizer(k);
    boolean ignoreCase = false;
    String ignoreStr = args.get(KEEP_IGNORE_CASE);
    if ("true".equalsIgnoreCase(ignoreStr)) {
      ignoreCase = true;
    }
    keep = new CharArraySet(10, ignoreCase);
    while (st.hasMoreTokens()) {
      k = st.nextToken().trim();
      keep.add(k.toCharArray());
    }
  }

  k = args.get(OK_PREFIX);
  if (k != null) {
    okPrefix = new ArrayList<>();
    StringTokenizer st = new StringTokenizer(k);
    while (st.hasMoreTokens()) {
      okPrefix.add(st.nextToken().trim().toCharArray());
    }
  }

  k = args.get(MIN_WORD_LENGTH);
  if (k != null) {
    minWordLength = Integer.parseInt(k);
  }
  k = args.get(MAX_WORD_COUNT);
  if (k != null) {
    maxWordCount = Integer.parseInt(k);
  }
  k = args.get(MAX_TOKEN_LENGTH);
  if (k != null) {
    maxTokenLength = Integer.parseInt(k);
  }
  k = args.get(ONLY_FIRST_WORD);
  if (k != null) {
    onlyFirstWord = Boolean.parseBoolean(k);
  }
  k = args.get(FORCE_FIRST_LETTER);
  if (k != null) {
    forceFirstLetter = Boolean.parseBoolean(k);
  }
}
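/*
 * Configuration sketch (illustrative, not part of the original source):
 * initializing the factory from a plain args map. The string keys shown are
 * assumed to match the constants read in init() above; the factory variable
 * is hypothetical.
 */
Map<String, String> args = new HashMap<>();
args.put("keep", "McKinley iPod");  // KEEP: these tokens are passed through unchanged
args.put("keepIgnoreCase", "true"); // KEEP_IGNORE_CASE: match the keep list case-insensitively
args.put("okPrefix", "Mc O'");      // OK_PREFIX: words with these prefixes are left alone
args.put("minWordLength", "3");     // MIN_WORD_LENGTH: shorter words are not capitalized
factory.init(args);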
public void testNonZeroOffset() {
  String[] words = {"Hello", "World", "this", "is", "a", "test"};
  char[] findme = "xthisy".toCharArray();
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  set.addAll(Arrays.asList(words));
  assertTrue(set.contains(findme, 1, 4));
  assertTrue(set.contains(new String(findme, 1, 4)));

  // the unmodifiable view must support offset-based lookups as well
  set = CharArraySet.unmodifiableSet(set);
  assertTrue(set.contains(findme, 1, 4));
  assertTrue(set.contains(new String(findme, 1, 4)));
}
public void processWord(char[] buffer, int offset, int length, int wordCount) {
  if (length < 1) {
    return;
  }
  if (onlyFirstWord && wordCount > 0) {
    // past the first word: lower-case everything and stop
    for (int i = 0; i < length; i++) {
      buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
    }
    return;
  }
  if (keep != null && keep.contains(buffer, offset, length)) {
    if (wordCount == 0 && forceFirstLetter) {
      buffer[offset] = Character.toUpperCase(buffer[offset]);
    }
    return;
  }
  if (length < minWordLength) {
    return;
  }
  for (char[] prefix : okPrefix) {
    // don't bother comparing if the word is shorter than the prefix
    if (length >= prefix.length) {
      boolean match = true;
      for (int i = 0; i < prefix.length; i++) {
        if (prefix[i] != buffer[offset + i]) {
          match = false;
          break;
        }
      }
      if (match) {
        return;
      }
    }
  }
  // capitalize in place: upper-case the first character, lower-case the rest
  buffer[offset] = Character.toUpperCase(buffer[offset]);
  for (int i = 1; i < length; i++) {
    buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
  }
}
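/*
 * Behavior sketch (illustrative, not part of the original source): for a
 * first word (wordCount == 0) that is not in the keep set, not covered by an
 * okPrefix entry, and at least minWordLength long, processWord() rewrites
 * the buffer in place. Field defaults are outside this excerpt, so this
 * assumes a factory configured to let the word through.
 */
public void testProcessWordSketch() {
  char[] buf = "hELLO".toCharArray();
  processWord(buf, 0, buf.length, 0); // "hELLO" -> "Hello"
  assertEquals("Hello", new String(buf));
}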
public void testObjectContains() {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  Integer val = Integer.valueOf(1);
  set.add(val);
  assertTrue(set.contains(val));
  assertTrue(set.contains(new Integer(1))); // a distinct boxed instance
  assertTrue(set.contains("1"));
  assertTrue(set.contains(new char[] {'1'}));

  // the unmodifiable view must behave identically
  set = CharArraySet.unmodifiableSet(set);
  assertTrue(set.contains(val));
  assertTrue(set.contains(new Integer(1))); // a distinct boxed instance
  assertTrue(set.contains("1"));
  assertTrue(set.contains(new char[] {'1'}));
}
/**
 * Tests the special case of {@link CharArraySet#copy(Version, Set)} where the set to copy is
 * {@link CharArraySet#EMPTY_SET}: the same instance must be returned.
 */
public void testCopyEmptySet() {
  assertSame(
      CharArraySet.EMPTY_SET, CharArraySet.copy(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET));
}
public void testRehash() throws Exception {
  // start with capacity 0 to force rehashing as elements are added
  CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 0, true);
  for (String stopWord : TEST_STOP_WORDS) {
    cas.add(stopWord);
  }
  assertEquals(TEST_STOP_WORDS.length, cas.size());
  for (String stopWord : TEST_STOP_WORDS) {
    assertTrue(cas.contains(stopWord));
  }
}
public void testUnmodifiableSet() {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  set.addAll(Arrays.asList(TEST_STOP_WORDS));
  set.add(Integer.valueOf(1));
  final int size = set.size();
  set = CharArraySet.unmodifiableSet(set);
  assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
  for (String stopword : TEST_STOP_WORDS) {
    assertTrue(set.contains(stopword));
  }
  assertTrue(set.contains(Integer.valueOf(1)));
  assertTrue(set.contains("1"));
  assertTrue(set.contains(new char[] {'1'}));

  try {
    CharArraySet.unmodifiableSet(null);
    fail("can not make null unmodifiable");
  } catch (NullPointerException e) {
    // expected
  }
}
/**
 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link LowerCaseFilter} and
 * {@link StopFilter}, using a list of English stop words.
 *
 * <p><a name="version"/>
 *
 * <p>You must specify the required {@link Version} compatibility when creating this analyzer:
 *
 * <ul>
 *   <li>As of 3.4, Hiragana and Han characters are no longer wrongly split from their combining
 *       characters. If you use a previous version number, you get the exact broken behavior for
 *       backwards compatibility.
 *   <li>As of 3.1, StandardTokenizer implements Unicode text segmentation, and StopFilter
 *       correctly handles Unicode 4.0 supplementary characters in stopwords. {@link
 *       ClassicTokenizer} and {@link ClassicAnalyzer} are the pre-3.1 implementations of
 *       StandardTokenizer and StandardAnalyzer.
 *   <li>As of 2.9, StopFilter preserves position increments.
 *   <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see <a
 *       href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).
 * </ul>
 */
public final class PhaidraAnalyzer extends StopwordAnalyzerBase {

  /** Default maximum allowed token length. */
  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

  /**
   * Specifies whether deprecated acronyms should be replaced with the HOST type. See <a
   * href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>.
   */
  private final boolean replaceInvalidAcronym;

  /**
   * An unmodifiable set containing some common English words that are usually not useful for
   * searching.
   */
  public static final Set<?> STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion Lucene version to match, see <a href="#version">above</a>
   * @param stopWords stop words
   */
  public PhaidraAnalyzer(Version matchVersion, Set<?> stopWords) {
    super(matchVersion, stopWords);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_33);
  }

  /**
   * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
   *
   * @param matchVersion Lucene version to match, see <a href="#version">above</a>
   */
  public PhaidraAnalyzer(Version matchVersion) {
    this(matchVersion, STOP_WORDS_SET);
  }

  /**
   * Builds an analyzer with the stop words from the given file.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match, see <a href="#version">above</a>
   * @param stopwords File to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, File stopwords) throws IOException {
    this(
        matchVersion,
        WordlistLoader.getWordSet(
            IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
  }

  /**
   * Builds an analyzer with the stop words from the given reader.
   *
   * @see WordlistLoader#getWordSet(Reader, Version)
   * @param matchVersion Lucene version to match, see <a href="#version">above</a>
   * @param stopwords Reader to read stop words from
   */
  public PhaidraAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion));
  }

  /**
   * Sets the maximum allowed token length. Tokens exceeding this length are discarded. The
   * setting only takes effect the next time tokenStream or reusableTokenStream is called.
   */
  public void setMaxTokenLength(int length) {
    maxTokenLength = length;
  }

  /** @see #setMaxTokenLength */
  public int getMaxTokenLength() {
    return maxTokenLength;
  }

  /** Italian articles stripped by the ElisionFilter before lower-casing. */
  private static final CharArraySet DEFAULT_ARTICLES =
      CharArraySet.unmodifiableSet(
          new CharArraySet(
              Version.LUCENE_33,
              Arrays.asList(
                  "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "gl", "agl",
                  "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"),
              true));

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    // unipd: adding ElisionFilter to strip apostrophized articles
    tok = new ElisionFilter(matchVersion, tok, DEFAULT_ARTICLES);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    // rasta: adding ASCIIFoldingFilter so accented terms are searchable without accents
    tok = new ASCIIFoldingFilter(tok);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected boolean reset(final Reader reader) throws IOException {
        src.setMaxTokenLength(PhaidraAnalyzer.this.maxTokenLength);
        return super.reset(reader);
      }
    };
  }
}
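/*
 * Usage sketch (illustrative, not from the original source): running a short
 * string through PhaidraAnalyzer and printing the resulting tokens. The field
 * name "text" is arbitrary and the chosen Version constant is an assumption.
 */
public static void printTokens(String text) throws IOException {
  PhaidraAnalyzer analyzer = new PhaidraAnalyzer(Version.LUCENE_36);
  TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    // e.g. "L'Università" comes out as "universita": the elision filter
    // strips the article, lower-casing and ASCII folding do the rest
    System.out.println(term.toString());
  }
  ts.end();
  ts.close();
}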
public void testModifyOnUnmodifiable() {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  set.addAll(Arrays.asList(TEST_STOP_WORDS));
  final int size = set.size();
  set = CharArraySet.unmodifiableSet(set);
  assertEquals("Set size changed due to unmodifiableSet call", size, set.size());
  String NOT_IN_SET = "SirGallahad";
  assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));

  try {
    set.add(NOT_IN_SET.toCharArray());
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.add(NOT_IN_SET);
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.add(new StringBuilder(NOT_IN_SET));
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.clear();
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.add((Object) NOT_IN_SET);
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  // This test was changed in 3.1: a contains() call on the given Collection using the iterator's
  // current key (now a char[]) on a Set<String> would never hit any element of the CAS and
  // therefore never call remove() on the iterator.
  try {
    set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), true));
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(NOT_IN_SET), true));
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertEquals("Size of unmodifiable set has changed", size, set.size());
  }

  try {
    set.addAll(Arrays.asList(NOT_IN_SET));
    fail("Modified unmodifiable set");
  } catch (UnsupportedOperationException e) {
    // expected
    assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
  }

  for (String stopWord : TEST_STOP_WORDS) {
    assertTrue(set.contains(stopWord));
  }
}
/** Tests the static {@link CharArraySet#copy(Version, Set)} method with a CharArraySet as the source. */
public void testCopyCharArraySet() {
  CharArraySet setIgnoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
  CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);

  List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
  List<String> stopwordsUpper = new ArrayList<>();
  for (String string : stopwords) {
    stopwordsUpper.add(string.toUpperCase(Locale.ROOT));
  }
  setIgnoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
  setIgnoreCase.add(Integer.valueOf(1));
  setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
  setCaseSensitive.add(Integer.valueOf(1));

  CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, setIgnoreCase);
  CharArraySet copyCaseSens = CharArraySet.copy(TEST_VERSION_CURRENT, setCaseSensitive);

  assertEquals(setIgnoreCase.size(), copy.size());
  assertEquals(setCaseSensitive.size(), copyCaseSens.size());

  assertTrue(copy.containsAll(stopwords));
  assertTrue(copy.containsAll(stopwordsUpper));
  assertTrue(copyCaseSens.containsAll(stopwords));
  for (String string : stopwordsUpper) {
    assertFalse(copyCaseSens.contains(string));
  }

  // test adding terms to the copy
  List<String> newWords = new ArrayList<>();
  for (String string : stopwords) {
    newWords.add(string + "_1");
  }
  copy.addAll(newWords);
  assertTrue(copy.containsAll(stopwords));
  assertTrue(copy.containsAll(stopwordsUpper));
  assertTrue(copy.containsAll(newWords));
  // newly added terms must not leak into the source sets
  for (String string : newWords) {
    assertFalse(setIgnoreCase.contains(string));
    assertFalse(setCaseSensitive.contains(string));
  }
}