/**
 * Transforms the given test data with {@code analyzer} and asserts that the produced
 * "tokens" column matches the expected "wantedTokens" column, row by row.
 */
private <T> void assertExpectedTokens(LuceneAnalyzer analyzer, List<T> testData) {
  JavaRDD<T> rdd = jsc.parallelize(testData);
  Row[] pairs = analyzer
      .transform(jsql.createDataFrame(rdd, testData.get(0).getClass()))
      .select("wantedTokens", "tokens")
      .collect();
  for (Row r : pairs) {
    Assert.assertEquals(r.get(0), r.get(1));
  }
}
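// The helper above relies on a Spark test fixture (a JavaSparkContext "jsc" and an SQLContext
// "jsql") and on JavaBean-style test-data classes that are not shown in this section. The sketch
// below is an illustrative guess at that setup, assuming JUnit 4 lifecycle annotations and Spark's
// bean-based createDataFrame(); the field layout, app name, and setup details are assumptions,
// not the suite's actual definitions.
private transient JavaSparkContext jsc;
private transient SQLContext jsql;

@Before
public void setUp() {
  // Local-mode context; the application name is arbitrary.
  jsc = new JavaSparkContext("local", "JavaLuceneAnalyzerSuite");
  jsql = new SQLContext(jsc);
}

@After
public void tearDown() {
  jsc.stop();
  jsc = null;
  jsql = null;
}

/** Hypothetical bean: a single-valued "rawText" input column paired with the expected tokens. */
public static class TokenizerTestData implements java.io.Serializable {
  private String rawText;
  private String[] wantedTokens;

  public TokenizerTestData(String rawText, String[] wantedTokens) {
    this.rawText = rawText;
    this.wantedTokens = wantedTokens;
  }
  public String getRawText() { return rawText; }
  public void setRawText(String rawText) { this.rawText = rawText; }
  public String[] getWantedTokens() { return wantedTokens; }
  public void setWantedTokens(String[] wantedTokens) { this.wantedTokens = wantedTokens; }
}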
@Test
public void testMissingValues() {
  LuceneAnalyzer analyzer = new LuceneAnalyzer()
      .setInputCol("rawText")
      .setOutputCol("tokens");
  // Null and empty raw text values produce no tokens
  assertExpectedTokens(analyzer, Arrays.asList(new TokenizerTestData(null, new String[] {})));
  assertExpectedTokens(analyzer, Arrays.asList(new TokenizerTestData("", new String[] {})));
  // Null and empty entries in a multi-valued column are skipped
  assertExpectedTokens(
      analyzer,
      Collections.singletonList(
          new MV_TokenizerTestData(
              new String[] {null, "Harold's not around.", null, "The dog's nose KNOWS!", ""},
              new String[] {"harold's", "not", "around", "the", "dog's", "nose", "knows"})));
  // Null and empty values are likewise skipped across multiple input columns
  analyzer.setInputCols(new String[] {"rawText1", "rawText2", "rawText3"});
  assertExpectedTokens(
      analyzer,
      Collections.singletonList(
          new SV_SV_SV_TokenizerTestData(
              "", "The dog's nose KNOWS!", null,
              new String[] {"the", "dog's", "nose", "knows"})));
}
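// The SV_/MV_ prefixes in the test-data class names appear to encode, per input column, whether
// that column holds a single String (SV) or a String array (MV); the constructor calls above bear
// this out. The actual classes are defined elsewhere in the suite; purely as an illustration, the
// single multi-valued-column variant might look like this:
/** Hypothetical bean: a multi-valued "rawText" input column paired with the expected tokens. */
public static class MV_TokenizerTestData implements java.io.Serializable {
  private String[] rawText;
  private String[] wantedTokens;

  public MV_TokenizerTestData(String[] rawText, String[] wantedTokens) {
    this.rawText = rawText;
    this.wantedTokens = wantedTokens;
  }
  public String[] getRawText() { return rawText; }
  public void setRawText(String[] rawText) { this.rawText = rawText; }
  public String[] getWantedTokens() { return wantedTokens; }
  public void setWantedTokens(String[] wantedTokens) { this.wantedTokens = wantedTokens; }
}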
@Test
public void testPrefixTokensWithInputCol() {
  String[] rawText1 = new String[] {"Harold's NOT around.", "Anymore, I mean."};
  String[] tokens1 = new String[] {"harold's", "not", "around", "anymore", "i", "mean"};
  String[] rawText2 = new String[] {"The dog's nose KNOWS!", "Good, fine, great..."};
  String[] tokens2 = new String[] {"the", "dog's", "nose", "knows", "good", "fine", "great"};
  List<String> tokenList = new ArrayList<>();
  List<String> prefixedTokenList = new ArrayList<>();
  for (String token : tokens1) {
    tokenList.add(token);
    prefixedTokenList.add("rawText1=" + token);
  }
  for (String token : tokens2) {
    tokenList.add(token);
    prefixedTokenList.add("rawText2=" + token);
  }
  String[] tokens = tokenList.toArray(new String[tokenList.size()]);
  String[] prefixedTokens = prefixedTokenList.toArray(new String[prefixedTokenList.size()]);

  // First transform without token prefixes
  LuceneAnalyzer analyzer = new LuceneAnalyzer()
      .setInputCols(new String[] {"rawText1", "rawText2"})
      .setOutputCol("tokens");
  assertExpectedTokens(
      analyzer,
      Collections.singletonList(new MV_MV_TokenizerTestData(rawText1, rawText2, tokens)));

  // Then transform with token prefixes
  analyzer.setPrefixTokensWithInputCol(true);
  assertExpectedTokens(
      analyzer,
      Collections.singletonList(new MV_MV_TokenizerTestData(rawText1, rawText2, prefixedTokens)));
}
@Test
public void testStandardTokenizer() {
  LuceneAnalyzer analyzer1 = new LuceneAnalyzer()
      .setInputCol("rawText")
      .setOutputCol("tokens"); // Default analysis schema: StandardTokenizer + LowerCaseFilter
  assertExpectedTokens(
      analyzer1,
      Arrays.asList(
          new TokenizerTestData(
              "Test for tokenization.", new String[] {"test", "for", "tokenization"}),
          new TokenizerTestData("Te,st. punct", new String[] {"te", "st", "punct"})));

  // Custom schema: StandardTokenizer with maxTokenLength=10 and no LowerCaseFilter
  String analysisSchema1 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      " 'name': 'StdTok_max10',\n" +
      " 'tokenizer': {\n" +
      " 'type': 'standard',\n" +
      " 'maxTokenLength': '10'\n" +
      " }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      " 'regex': '.+',\n" +
      " 'analyzer': 'StdTok_max10'\n" +
      "}]}\n");
  analyzer1.setAnalysisSchema(analysisSchema1);
  assertExpectedTokens(
      analyzer1,
      Arrays.asList(
          new TokenizerTestData(
              "我是中国人。 1234 Tests ",
              new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}),
          new TokenizerTestData(
              "some-dashed-phrase", new String[] {"some", "dashed", "phrase"})));

  // Custom schema: StandardTokenizer with maxTokenLength=3, pinned to Lucene match version 4.10.4
  String analysisSchema2 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'defaultLuceneMatchVersion': '4.10.4',\n" +
      "'analyzers': [{\n" +
      " 'name': 'StdTok_max3',\n" +
      " 'tokenizer': {\n" +
      " 'type': 'standard',\n" +
      " 'maxTokenLength': '3'\n" +
      " }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      " 'regex': '.+',\n" +
      " 'analyzer': 'StdTok_max3'\n" +
      "}]}\n");
  LuceneAnalyzer analyzer2 = new LuceneAnalyzer()
      .setAnalysisSchema(analysisSchema2)
      .setInputCol("rawText")
      .setOutputCol("tokens");
  assertExpectedTokens(
      analyzer2,
      Arrays.asList(
          new TokenizerTestData(
              "Test for tokenization.",
              new String[] {"Tes", "t", "for", "tok", "eni", "zat", "ion"}),
          new TokenizerTestData("Te,st. punct", new String[] {"Te", "st", "pun", "ct"})));
}
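// The analysis schemas above are written with single quotes for readability and passed through a
// json(...) helper that is not shown in this section. A minimal sketch, assuming the helper simply
// swaps single quotes for double quotes to produce valid JSON (the suite's actual implementation
// may differ, for example by also handling escaping):
private static String json(String singleQuoted) {
  return singleQuoted.replace('\'', '"');
}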
@Test
public void testMultipleInputCols() {
  LuceneAnalyzer analyzer1 = new LuceneAnalyzer()
      .setInputCols(new String[] {"rawText1", "rawText2"})
      .setOutputCol("tokens");
  assertExpectedTokens(
      analyzer1,
      Collections.singletonList(
          new SV_SV_TokenizerTestData(
              "Harold's not around.",
              "The dog's nose KNOWS!",
              new String[] {"harold's", "not", "around", "the", "dog's", "nose", "knows"})));

  String analysisSchema = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      " 'name': 'std_tok_lower',\n" +
      " 'tokenizer': { 'type': 'standard' },\n" +
      " 'filters':[{ 'type': 'lowercase' }]\n" +
      " }, {\n" +
      " 'name': 'std_tok',\n" +
      " 'tokenizer': { 'type': 'standard' }\n" +
      " }, {\n" +
      " 'name': 'htmlstrip_std_tok_lower',\n" +
      " 'charFilters': [{ 'type': 'htmlstrip' }],\n" +
      " 'tokenizer': { 'type': 'standard' },\n" +
      " 'filters': [{ 'type': 'lowercase' }]\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      " 'name': 'rawText1',\n" +
      " 'analyzer': 'std_tok_lower'\n" +
      " }, {\n" +
      " 'name': 'rawText2',\n" +
      " 'analyzer': 'std_tok'\n" +
      " }, {\n" +
      " 'regex': '.+',\n" +
      " 'analyzer': 'htmlstrip_std_tok_lower'\n" +
      "}]}\n");
  LuceneAnalyzer analyzer2 = new LuceneAnalyzer()
      .setAnalysisSchema(analysisSchema)
      .setInputCols(new String[] {"rawText1", "rawText2"})
      .setOutputCol("tokens");
  assertExpectedTokens(
      analyzer2,
      Collections.singletonList(
          new SV_SV_TokenizerTestData(
              "Harold's NOT around.",
              "The dog's nose KNOWS!",
              new String[] {"harold's", "not", "around", "The", "dog's", "nose", "KNOWS"})));
  assertExpectedTokens(
      analyzer2,
      Collections.singletonList(
          new SV_MV_TokenizerTestData(
              "Harold's NOT around.",
              new String[] {"The dog's nose KNOWS!", "Good, fine, great..."},
              new String[] {
                  "harold's", "not", "around", "The", "dog's", "nose", "KNOWS",
                  "Good", "fine", "great"
              })));
  assertExpectedTokens(
      analyzer2,
      Collections.singletonList(
          new MV_MV_TokenizerTestData(
              new String[] {"Harold's NOT around.", "Anymore, I mean."},
              new String[] {"The dog's nose KNOWS!", "Good, fine, great..."},
              new String[] {
                  "harold's", "not", "around", "anymore", "i", "mean",
                  "The", "dog's", "nose", "KNOWS", "Good", "fine", "great"
              })));
  analyzer2.setInputCols(new String[] {"rawText1", "rawText2", "rawText3"});
  assertExpectedTokens(
      analyzer2,
      Collections.singletonList(
          new SV_SV_SV_TokenizerTestData(
              "Harold's NOT around.",
              "The dog's nose KNOWS!",
              "<html><body>Content</body></html>",
              new String[] {
                  "harold's", "not", "around", "The", "dog's", "nose", "KNOWS", "content"
              })));
}
@Test
@SuppressWarnings("unchecked")
public void testCharFilters() {
  String analysisSchema1 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      " 'name': 'strip_alpha_std_tok',\n" +
      " 'charFilters': [{\n" +
      " 'type': 'patternreplace',\n" +
      " 'pattern': '[A-Za-z]+',\n" +
      " 'replacement': ''\n" +
      " }],\n" +
      " 'tokenizer': {\n" +
      " 'type': 'standard'\n" +
      " }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      " 'regex': '.+',\n" +
      " 'analyzer': 'strip_alpha_std_tok'\n" +
      "}]}\n");
  LuceneAnalyzer analyzer = new LuceneAnalyzer()
      .setAnalysisSchema(analysisSchema1)
      .setInputCol("rawText")
      .setOutputCol("tokens");
  assertExpectedTokens(
      analyzer,
      Arrays.asList(
          new TokenizerTestData("Test for 9983, tokenization.", new String[] {"9983"}),
          new TokenizerTestData("Te,st. punct", new String[] {})));

  String analysisSchema2 = json(
      "{\n" +
      "'schemaType': 'LuceneAnalyzerSchema.v1',\n" +
      "'analyzers': [{\n" +
      " 'name': 'htmlstrip_drop_removeme_std_tok',\n" +
      " 'charFilters': [{\n" +
      " 'type': 'htmlstrip'\n" +
      " }, {\n" +
      " 'type': 'patternreplace',\n" +
      " 'pattern': 'removeme',\n" +
      " 'replacement': ''\n" +
      " }],\n" +
      " 'tokenizer': {\n" +
      " 'type': 'standard'\n" +
      " }\n" +
      "}],\n" +
      "'inputColumns': [{\n" +
      " 'name': 'rawText',\n" +
      " 'analyzer': 'htmlstrip_drop_removeme_std_tok'\n" +
      "}]}\n");
  analyzer.setAnalysisSchema(analysisSchema2);
  assertExpectedTokens(
      analyzer,
      Collections.singletonList(
          new TokenizerTestData(
              "<html><body>remove<b>me</b> but leave<div>the rest.</div></body></html>",
              new String[] {"but", "leave", "the", "rest"})));
}