  private <T> void assertExpectedTokens(LuceneAnalyzer analyzer, List<T> testData) {
    JavaRDD<T> rdd = jsc.parallelize(testData);
    // Transform the test data and pull back the expected and actual token columns.
    Row[] pairs =
        analyzer
            .transform(jsql.createDataFrame(rdd, testData.get(0).getClass()))
            .select("wantedTokens", "tokens")
            .collect();
    for (Row r : pairs) {
      Assert.assertEquals(r.get(0), r.get(1)); // expected ("wantedTokens") vs. actual ("tokens")
    }
  }
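
  // The *_TokenizerTestData types used below are assumed to be simple JavaBeans
  // defined elsewhere in this suite; createDataFrame() builds the schema by
  // reflecting on their getters. A minimal sketch (not the actual definition):
  //
  //   public static class TokenizerTestData implements Serializable {
  //     private final String rawText;
  //     private final String[] wantedTokens;
  //     public TokenizerTestData(String rawText, String[] wantedTokens) {
  //       this.rawText = rawText;
  //       this.wantedTokens = wantedTokens;
  //     }
  //     public String getRawText() { return rawText; }
  //     public String[] getWantedTokens() { return wantedTokens; }
  //   }

  // Null and empty input values should produce no tokens; in multi-valued
  // columns, null and empty entries are skipped and the remaining values are
  // analyzed.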
  @Test
  public void testMissingValues() {
    LuceneAnalyzer analyzer = new LuceneAnalyzer().setInputCol("rawText").setOutputCol("tokens");
    assertExpectedTokens(analyzer, Arrays.asList(new TokenizerTestData(null, new String[] {})));
    assertExpectedTokens(analyzer, Arrays.asList(new TokenizerTestData("", new String[] {})));
    assertExpectedTokens(
        analyzer,
        Collections.singletonList(
            new MV_TokenizerTestData(
                new String[] {null, "Harold's not around.", null, "The dog's nose KNOWS!", ""},
                new String[] {"harold's", "not", "around", "the", "dog's", "nose", "knows"})));

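    // With multiple input columns, a null or empty column value is skipped and
    // the remaining columns are still analyzed.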
    analyzer.setInputCols(new String[] {"rawText1", "rawText2", "rawText3"});
    assertExpectedTokens(
        analyzer,
        Collections.singletonList(
            new SV_SV_SV_TokenizerTestData(
                "",
                "The dog's nose KNOWS!",
                null,
                new String[] {"the", "dog's", "nose", "knows"})));
  }
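
  // Verifies setPrefixTokensWithInputCol: when enabled, each output token is
  // prefixed with the name of the column it came from, e.g. "rawText1=harold's".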
  @Test
  public void testPrefixTokensWithInputCol() {
    String[] rawText1 = new String[] {"Harold's NOT around.", "Anymore, I mean."};
    String[] tokens1 = new String[] {"harold's", "not", "around", "anymore", "i", "mean"};

    String[] rawText2 = new String[] {"The dog's nose KNOWS!", "Good, fine, great..."};
    String[] tokens2 = new String[] {"the", "dog's", "nose", "knows", "good", "fine", "great"};

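    // Build the expected token lists: one plain, one with each token prefixed
    // by its source column name.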
    List<String> tokenList = new ArrayList<>();
    List<String> prefixedTokenList = new ArrayList<>();
    for (String token : tokens1) {
      tokenList.add(token);
      prefixedTokenList.add("rawText1=" + token);
    }
    for (String token : tokens2) {
      tokenList.add(token);
      prefixedTokenList.add("rawText2=" + token);
    }
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    String[] prefixedTokens = prefixedTokenList.toArray(new String[prefixedTokenList.size()]);

    // First transform without token prefixes
    LuceneAnalyzer analyzer =
        new LuceneAnalyzer()
            .setInputCols(new String[] {"rawText1", "rawText2"})
            .setOutputCol("tokens");
    assertExpectedTokens(
        analyzer,
        Collections.singletonList(new MV_MV_TokenizerTestData(rawText1, rawText2, tokens)));

    // Then transform with token prefixes
    analyzer.setPrefixTokensWithInputCol(true);
    assertExpectedTokens(
        analyzer,
        Collections.singletonList(new MV_MV_TokenizerTestData(rawText1, rawText2, prefixedTokens)));
  }
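
  // Exercises the default analysis schema (StandardTokenizer + LowerCaseFilter)
  // and two custom JSON schemas that cap the token length and, having no
  // lowercase filter, preserve the original case.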
  @Test
  public void testStandardTokenizer() {
    LuceneAnalyzer analyzer1 =
        new LuceneAnalyzer()
            .setInputCol("rawText")
            .setOutputCol("tokens"); // Default analysis schema: StandardTokenizer + LowerCaseFilter

    assertExpectedTokens(
        analyzer1,
        Arrays.asList(
            new TokenizerTestData(
                "Test for tokenization.", new String[] {"test", "for", "tokenization"}),
            new TokenizerTestData("Te,st. punct", new String[] {"te", "st", "punct"})));

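    // Custom schema: standard tokenizer with maxTokenLength=10 and no filters,
    // so case is preserved and over-long tokens are split at 10 characters.
    // (json() is presumably a test helper, defined elsewhere, that turns the
    // single-quoted string into valid JSON.)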
    String analysisSchema1 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'StdTok_max10',\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard',\n"
                + "    'maxTokenLength': '10'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'StdTok_max10'\n"
                + "}]}\n");
    analyzer1.setAnalysisSchema(analysisSchema1);
    assertExpectedTokens(
        analyzer1,
        Arrays.asList(
            new TokenizerTestData(
                "我是中国人。 1234 Tests ", new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}),
            new TokenizerTestData(
                "some-dashed-phrase", new String[] {"some", "dashed", "phrase"})));

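    // Same tokenizer, but with maxTokenLength=3 and an explicit
    // defaultLuceneMatchVersion.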
    String analysisSchema2 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'defaultLuceneMatchVersion': '4.10.4',\n"
                + "'analyzers': [{\n"
                + "  'name': 'StdTok_max3',\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard',\n"
                + "    'maxTokenLength': '3'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'StdTok_max3'\n"
                + "}]}\n");
    LuceneAnalyzer analyzer2 =
        new LuceneAnalyzer()
            .setAnalysisSchema(analysisSchema2)
            .setInputCol("rawText")
            .setOutputCol("tokens");
    assertExpectedTokens(
        analyzer2,
        Arrays.asList(
            new TokenizerTestData(
                "Test for tokenization.",
                new String[] {"Tes", "t", "for", "tok", "eni", "zat", "ion"}),
            new TokenizerTestData("Te,st.  punct", new String[] {"Te", "st", "pun", "ct"})));
  }
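
  // Multiple input columns feed a single output column: tokens from each column
  // are concatenated in column order, and the analysis schema can bind a
  // different analyzer to each column by name or by regex.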
  @Test
  public void testMultipleInputCols() {
    LuceneAnalyzer analyzer1 =
        new LuceneAnalyzer()
            .setInputCols(new String[] {"rawText1", "rawText2"})
            .setOutputCol("tokens");
    assertExpectedTokens(
        analyzer1,
        Collections.singletonList(
            new SV_SV_TokenizerTestData(
                "Harold's not around.",
                "The dog's nose KNOWS!",
                new String[] {"harold's", "not", "around", "the", "dog's", "nose", "knows"})));

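    // Per-column analyzers: rawText1 is lowercased, rawText2 is not, and any
    // other column (regex '.+') is HTML-stripped and lowercased.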
    String analysisSchema =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "    'name': 'std_tok_lower',\n"
                + "    'tokenizer': { 'type': 'standard' },\n"
                + "    'filters':[{ 'type': 'lowercase' }]\n"
                + "  }, {\n"
                + "    'name': 'std_tok',\n"
                + "    'tokenizer': { 'type': 'standard' }\n"
                + "  }, {\n"
                + "    'name': 'htmlstrip_std_tok_lower',\n"
                + "    'charFilters': [{ 'type': 'htmlstrip' }],\n"
                + "    'tokenizer': { 'type': 'standard' },\n"
                + "    'filters': [{ 'type': 'lowercase' }]\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "    'name': 'rawText1',\n"
                + "    'analyzer': 'std_tok_lower'\n"
                + "  }, {\n"
                + "    'name': 'rawText2',\n"
                + "    'analyzer': 'std_tok'\n"
                + "  }, {\n"
                + "    'regex': '.+',\n"
                + "    'analyzer': 'htmlstrip_std_tok_lower'\n"
                + "}]}\n");
    LuceneAnalyzer analyzer2 =
        new LuceneAnalyzer()
            .setAnalysisSchema(analysisSchema)
            .setInputCols(new String[] {"rawText1", "rawText2"})
            .setOutputCol("tokens");
    assertExpectedTokens(
        analyzer2,
        Collections.singletonList(
            new SV_SV_TokenizerTestData(
                "Harold's NOT around.",
                "The dog's nose KNOWS!",
                new String[] {"harold's", "not", "around", "The", "dog's", "nose", "KNOWS"})));

    assertExpectedTokens(
        analyzer2,
        Collections.singletonList(
            new SV_MV_TokenizerTestData(
                "Harold's NOT around.",
                new String[] {"The dog's nose KNOWS!", "Good, fine, great..."},
                new String[] {
                  "harold's",
                  "not",
                  "around",
                  "The",
                  "dog's",
                  "nose",
                  "KNOWS",
                  "Good",
                  "fine",
                  "great"
                })));

    assertExpectedTokens(
        analyzer2,
        Collections.singletonList(
            new MV_MV_TokenizerTestData(
                new String[] {"Harold's NOT around.", "Anymore, I mean."},
                new String[] {"The dog's nose KNOWS!", "Good, fine, great..."},
                new String[] {
                  "harold's",
                  "not",
                  "around",
                  "anymore",
                  "i",
                  "mean",
                  "The",
                  "dog's",
                  "nose",
                  "KNOWS",
                  "Good",
                  "fine",
                  "great"
                })));

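    // rawText3 has no named rule in the schema, so it falls through to the
    // '.+' regex rule (HTML strip + lowercase).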
    analyzer2.setInputCols(new String[] {"rawText1", "rawText2", "rawText3"});
    assertExpectedTokens(
        analyzer2,
        Collections.singletonList(
            new SV_SV_SV_TokenizerTestData(
                "Harold's NOT around.",
                "The dog's nose KNOWS!",
                "<html><body>Content</body></html>",
                new String[] {
                  "harold's", "not", "around", "The", "dog's", "nose", "KNOWS", "content"
                })));
  }
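
  // Char filters run on the raw text before the tokenizer sees it.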
  @Test
  @SuppressWarnings("unchecked")
  public void testCharFilters() {
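    // Schema 1: a patternreplace char filter deletes every alphabetic sequence
    // before tokenizing, so only numeric tokens survive.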
    String analysisSchema1 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'strip_alpha_std_tok',\n"
                + "  'charFilters': [{\n"
                + "    'type': 'patternreplace',\n"
                + "    'pattern': '[A-Za-z]+',\n"
                + "    'replacement': ''\n"
                + "  }],\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'strip_alpha_std_tok'\n"
                + "}]}\n");
    LuceneAnalyzer analyzer =
        new LuceneAnalyzer()
            .setAnalysisSchema(analysisSchema1)
            .setInputCol("rawText")
            .setOutputCol("tokens");

    assertExpectedTokens(
        analyzer,
        Arrays.asList(
            new TokenizerTestData("Test for 9983, tokenization.", new String[] {"9983"}),
            new TokenizerTestData("Te,st. punct", new String[] {})));

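    // Schema 2: two char filters in sequence, htmlstrip to remove markup and
    // patternreplace to delete the literal string "removeme", before the
    // standard tokenizer runs.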
    String analysisSchema2 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'htmlstrip_drop_removeme_std_tok',\n"
                + "  'charFilters': [{\n"
                + "      'type': 'htmlstrip'\n"
                + "    }, {\n"
                + "      'type': 'patternreplace',\n"
                + "      'pattern': 'removeme',\n"
                + "      'replacement': ''\n"
                + "  }],\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'name': 'rawText',\n"
                + "  'analyzer': 'htmlstrip_drop_removeme_std_tok'\n"
                + "}]}\n");
    analyzer.setAnalysisSchema(analysisSchema2);

    assertExpectedTokens(
        analyzer,
        Collections.singletonList(
            new TokenizerTestData(
                "<html><body>remove<b>me</b> but leave<div>the&nbsp;rest.</div></body></html>",
                new String[] {"but", "leave", "the", "rest"})));
  }