Example #1
  @Test
  public void testStandardTokenizer() {
    LuceneAnalyzer analyzer1 =
        new LuceneAnalyzer()
            .setInputCol("rawText")
            .setOutputCol("tokens"); // Default analysis schema: StandardTokenizer + LowerCaseFilter

    assertExpectedTokens(
        analyzer1,
        Arrays.asList(
            new TokenizerTestData(
                "Test for tokenization.", new String[] {"test", "for", "tokenization"}),
            new TokenizerTestData("Te,st. punct", new String[] {"te", "st", "punct"})));

    assertExpectedTokens(
        analyzer1,
        Arrays.asList(
            new TokenizerTestData(
                "Test for tokenization.", new String[] {"test", "for", "tokenization"}),
            new TokenizerTestData("Te,st. punct", new String[] {"te", "st", "punct"})));

    String analysisSchema1 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'StdTok_max10',\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard',\n"
                + "    'maxTokenLength': '10'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'StdTok_max10'\n"
                + "}]}\n");
    analyzer1.setAnalysisSchema(analysisSchema1);
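    // With maxTokenLength=10 and no token filters, case is preserved ("Tests")
    // and the standard tokenizer emits one token per CJK character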
    assertExpectedTokens(
        analyzer1,
        Arrays.asList(
            new TokenizerTestData(
                "我是中国人。 1234 Tests ", new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}),
            new TokenizerTestData(
                "some-dashed-phrase", new String[] {"some", "dashed", "phrase"})));

    String analysisSchema2 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'defaultLuceneMatchVersion': '4.10.4',\n"
                + "'analyzers': [{\n"
                + "  'name': 'StdTok_max3',\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard',\n"
                + "    'maxTokenLength': '3'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'StdTok_max3'\n"
                + "}]}\n");
    LuceneAnalyzer analyzer2 =
        new LuceneAnalyzer()
            .setAnalysisSchema(analysisSchema2)
            .setInputCol("rawText")
            .setOutputCol("tokens");
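    // maxTokenLength=3 splits longer tokens into 3-character chunks: "Test" -> "Tes", "t"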
    assertExpectedTokens(
        analyzer2,
        Arrays.asList(
            new TokenizerTestData(
                "Test for tokenization.",
                new String[] {"Tes", "t", "for", "tok", "eni", "zat", "ion"}),
            new TokenizerTestData("Te,st.  punct", new String[] {"Te", "st", "pun", "ct"})));
  }
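
These tests rely on an assertExpectedTokens helper and a TokenizerTestData holder that are not shown. A minimal sketch consistent with how they are used above (the analyze method on LuceneAnalyzer is an assumption; the real test class may drive a Spark transform instead):

  // Hypothetical helpers, inferred from usage; not the actual test-class code.
  private static class TokenizerTestData {
    final String rawText;
    final List<String> expectedTokens;

    TokenizerTestData(String rawText, String[] expectedTokens) {
      this.rawText = rawText;
      this.expectedTokens = Arrays.asList(expectedTokens);
    }
  }

  private void assertExpectedTokens(LuceneAnalyzer analyzer, List<TokenizerTestData> testData) {
    for (TokenizerTestData td : testData) {
      // Assumed API: tokenize a single input value with the configured analysis schema.
      List<String> actual = analyzer.analyze(td.rawText);
      assertEquals(td.expectedTokens, actual);
    }
  }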
Example #2
  @Test
  @SuppressWarnings("unchecked")
  public void testCharFilters() {
    String analysisSchema1 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'strip_alpha_std_tok',\n"
                + "  'charFilters': [{\n"
                + "    'type': 'patternreplace',\n"
                + "    'pattern': '[A-Za-z]+',\n"
                + "    'replacement': ''\n"
                + "  }],\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'regex': '.+',\n"
                + "  'analyzer': 'strip_alpha_std_tok'\n"
                + "}]}\n");
    LuceneAnalyzer analyzer =
        new LuceneAnalyzer()
            .setAnalysisSchema(analysisSchema1)
            .setInputCol("rawText")
            .setOutputCol("tokens");
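    // The patternreplace char filter deletes every run of ASCII letters before
    // tokenization, so only the digits survive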

    assertExpectedTokens(
        analyzer,
        Arrays.asList(
            new TokenizerTestData("Test for 9983, tokenization.", new String[] {"9983"}),
            new TokenizerTestData("Te,st. punct", new String[] {})));

    String analysisSchema2 =
        json(
            "{\n"
                + "'schemaType': 'LuceneAnalyzerSchema.v1',\n"
                + "'analyzers': [{\n"
                + "  'name': 'htmlstrip_drop_removeme_std_tok',\n"
                + "  'charFilters': [{\n"
                + "      'type': 'htmlstrip'\n"
                + "    }, {\n"
                + "      'type': 'patternreplace',\n"
                + "      'pattern': 'removeme',\n"
                + "      'replacement': ''\n"
                + "  }],\n"
                + "  'tokenizer': {\n"
                + "    'type': 'standard'\n"
                + "  }\n"
                + "}],\n"
                + "'inputColumns': [{\n"
                + "  'name': 'rawText',\n"
                + "  'analyzer': 'htmlstrip_drop_removeme_std_tok'\n"
                + "}]}\n");
    analyzer.setAnalysisSchema(analysisSchema2);
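    // htmlstrip removes tags and decodes entities (&nbsp; becomes a space), then
    // patternreplace deletes the literal string "removeme"; note this schema matches
    // the input column by exact name rather than by regex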

    assertExpectedTokens(
        analyzer,
        Collections.singletonList(
            new TokenizerTestData(
                "<html><body>remove<b>me</b> but leave<div>the&nbsp;rest.</div></body></html>",
                new String[] {"but", "leave", "the", "rest"})));
  }
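
Both examples pass their schemas through a json(...) helper that is not shown. Given the single-quoted literals above, a plausible sketch (an assumption, mirroring a common Solr-test convention) simply rewrites single quotes to double quotes so the Java string literals stay readable:

  // Assumed helper, not shown in the original: convert single-quoted pseudo-JSON
  // into valid JSON by swapping quote characters.
  private static String json(String singleQuoted) {
    return singleQuoted.replace('\'', '"');
  }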