@Test
public void testStandardTokenizer() {
  LuceneAnalyzer analyzer1 = new LuceneAnalyzer()
      .setInputCol("rawText")
      .setOutputCol("tokens");

  // Default analysis schema: StandardTokenizer + LowerCaseFilter
  assertExpectedTokens(analyzer1, Arrays.asList(
      new TokenizerTestData("Test for tokenization.",
          new String[] {"test", "for", "tokenization"}),
      new TokenizerTestData("Te,st. punct",
          new String[] {"te", "st", "punct"})));

  // Custom schema: standard tokenizer with tokens capped at 10 characters,
  // applied to every input column, with no lowercasing.
  String analysisSchema1 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      "  'name': 'StdTok_max10',\n" +
      "  'tokenizer': {\n" +
      "    'type': 'standard',\n" +
      "    'maxTokenLength': '10'\n" +
      "  }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      "  'regex': '.+',\n" +
      "  'analyzer': 'StdTok_max10'\n" +
      "}]}\n");
  analyzer1.setAnalysisSchema(analysisSchema1);
  assertExpectedTokens(analyzer1, Arrays.asList(
      new TokenizerTestData("我是中国人。 1234 Tests ",
          new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}),
      new TokenizerTestData("some-dashed-phrase",
          new String[] {"some", "dashed", "phrase"})));

  // Pin the Lucene match version and cap tokens at 3 characters: longer
  // tokens are split into consecutive 3-character pieces.
  String analysisSchema2 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'defaultLuceneMatchVersion': '4.10.4',\n" +
      "'analyzers': [{\n" +
      "  'name': 'StdTok_max3',\n" +
      "  'tokenizer': {\n" +
      "    'type': 'standard',\n" +
      "    'maxTokenLength': '3'\n" +
      "  }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      "  'regex': '.+',\n" +
      "  'analyzer': 'StdTok_max3'\n" +
      "}]}\n");
  LuceneAnalyzer analyzer2 = new LuceneAnalyzer()
      .setAnalysisSchema(analysisSchema2)
      .setInputCol("rawText")
      .setOutputCol("tokens");
  assertExpectedTokens(analyzer2, Arrays.asList(
      new TokenizerTestData("Test for tokenization.",
          new String[] {"Tes", "t", "for", "tok", "eni", "zat", "ion"}),
      new TokenizerTestData("Te,st. punct",
          new String[] {"Te", "st", "pun", "ct"})));
}
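// The json(...) helper called above is defined elsewhere in this suite. A minimal
// sketch, assuming it only rewrites the single-quoted schema literals into valid
// double-quoted JSON; the real helper may do more (e.g. handle escaped quotes).
private static String json(String singleQuoted) {
  return singleQuoted.replace('\'', '"');
}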
@Test
@SuppressWarnings("unchecked")
public void testCharFilters() {
  // PatternReplaceCharFilter strips all ASCII letters before tokenization,
  // so only the digits survive.
  String analysisSchema1 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      "  'name': 'strip_alpha_std_tok',\n" +
      "  'charFilters': [{\n" +
      "    'type': 'patternreplace',\n" +
      "    'pattern': '[A-Za-z]+',\n" +
      "    'replacement': ''\n" +
      "  }],\n" +
      "  'tokenizer': {\n" +
      "    'type': 'standard'\n" +
      "  }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      "  'regex': '.+',\n" +
      "  'analyzer': 'strip_alpha_std_tok'\n" +
      "}]}\n");
  LuceneAnalyzer analyzer = new LuceneAnalyzer()
      .setAnalysisSchema(analysisSchema1)
      .setInputCol("rawText")
      .setOutputCol("tokens");
  assertExpectedTokens(analyzer, Arrays.asList(
      new TokenizerTestData("Test for 9983, tokenization.", new String[] {"9983"}),
      new TokenizerTestData("Te,st. punct", new String[] {})));

  // Char filters chain in order: strip HTML markup first, then remove the
  // literal string "removeme" from the markup-free text. This schema also
  // targets the input column by name rather than by regex.
  String analysisSchema2 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      "  'name': 'htmlstrip_drop_removeme_std_tok',\n" +
      "  'charFilters': [{\n" +
      "    'type': 'htmlstrip'\n" +
      "  }, {\n" +
      "    'type': 'patternreplace',\n" +
      "    'pattern': 'removeme',\n" +
      "    'replacement': ''\n" +
      "  }],\n" +
      "  'tokenizer': {\n" +
      "    'type': 'standard'\n" +
      "  }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      "  'name': 'rawText',\n" +
      "  'analyzer': 'htmlstrip_drop_removeme_std_tok'\n" +
      "}]}\n");
  analyzer.setAnalysisSchema(analysisSchema2);
  assertExpectedTokens(analyzer, Collections.singletonList(
      new TokenizerTestData(
          "<html><body>remove<b>me</b> but leave<div>the rest.</div></body></html>",
          new String[] {"but", "leave", "the", "rest"})));
}
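// TokenizerTestData and assertExpectedTokens(...) are likewise defined outside
// this excerpt. A minimal sketch, under the assumption that LuceneAnalyzer
// exposes an analyze(String) method returning the produced tokens; the real
// harness may instead run the transformer over a DataFrame and compare the
// output column. All names and signatures below are illustrative, not the
// suite's actual API.
private static class TokenizerTestData {
  final String rawText;
  final List<String> wantedTokens;

  TokenizerTestData(String rawText, String[] wantedTokens) {
    this.rawText = rawText;
    this.wantedTokens = Arrays.asList(wantedTokens);
  }
}

private void assertExpectedTokens(LuceneAnalyzer analyzer, List<TokenizerTestData> testData) {
  for (TokenizerTestData data : testData) {
    List<String> actual = analyzer.analyze(data.rawText); // assumed API
    Assert.assertEquals(data.wantedTokens, actual);
  }
}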