/**
 * Regression test reproducing a randomly-generated analysis chain:
 * MockTokenizer -> CommonGramsFilter over a MockCharFilter + MappingCharFilter
 * reader stack, checked for consistency via checkAnalysisConsistency.
 */
public void test() throws Exception {
  // Common-word set consumed by CommonGramsFilter below.
  final CharArraySet commonWords = new CharArraySet(3, false);
  commonWords.add("jjp");
  commonWords.add("wlmwoknt");
  commonWords.add("tcgyreo");

  // Character-level mappings applied before tokenization.
  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  mapBuilder.add("mtqlpi", "");
  mapBuilder.add("mwoknt", "jjp");
  mapBuilder.add("tcgyreo", "zpfpajyws");
  final NormalizeCharMap charMap = mapBuilder.build();

  Analyzer analyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
          TokenFilter filter = new CommonGramsFilter(tokenizer, commonWords);
          return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
          reader = new MockCharFilter(reader, 0);
          reader = new MappingCharFilter(charMap, reader);
          // Wrapper asserts the char filter does not read ahead of consumption.
          reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
          return reader;
        }
      };
  checkAnalysisConsistency(random(), analyzer, false, "wmgddzunizdomqyj");
  analyzer.close();
}
@Test public void testOffsetCorrection() throws Exception { final String INPUT = "Günther Günther is here"; // create MappingCharFilter List<String> mappingRules = new ArrayList<>(); mappingRules.add("\"ü\" => \"ü\""); NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); builder.add("ü", "ü"); NormalizeCharMap normMap = builder.build(); CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT)); // create PatternTokenizer Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1); stream.setReader(charStream); assertTokenStreamContents( stream, new String[] {"Günther", "Günther", "is", "here"}, new int[] {0, 13, 26, 29}, new int[] {12, 25, 28, 33}, INPUT.length()); charStream = new MappingCharFilter(normMap, new StringReader(INPUT)); stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0); stream.setReader(charStream); assertTokenStreamContents( stream, new String[] {"Günther", "Günther"}, new int[] {0, 13}, new int[] {12, 25}, INPUT.length()); }