public void testHugeDoc() throws IOException {
   StringBuilder sb = new StringBuilder();
   char[] whitespace = new char[4094];
   Arrays.fill(whitespace, ' ');
   sb.append(whitespace);
   sb.append("testing 1234");
   String input = sb.toString();
   StandardTokenizer tokenizer = new StandardTokenizer();
   tokenizer.setReader(new StringReader(input));
   BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] {"testing", "1234"});
 }
 @Override
 public Tokenizer create() {
   if (version.onOrAfter(Version.LUCENE_4_7_0)) {
     StandardTokenizer tokenizer = new StandardTokenizer();
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   } else {
     StandardTokenizer40 tokenizer = new StandardTokenizer40();
     tokenizer.setMaxTokenLength(maxTokenLength);
     return tokenizer;
   }
 }
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {

    final StandardTokenizer src = new StandardTokenizer(version, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(version, src);

    // We do not distinguish case
    tok = new LowerCaseFilter(version, tok);
    // We do not process stopwords
    tok = new StopFilter(version, tok, stopwords);
    // We do not distinguish accents
    tok = new ASCIIFoldingFilter(tok);

    return new TokenStreamComponents(src, tok);
  }
 @Override
 protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
   final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
   src.setMaxTokenLength(maxTokenLength);
   TokenStream tok = new StandardFilter(matchVersion, src);
   // unipd: add ElisionFilter to handle apostrophes (elided articles)
   tok = new ElisionFilter(matchVersion, tok, DEFAULT_ARTICLES);
   tok = new LowerCaseFilter(matchVersion, tok);
   tok = new StopFilter(matchVersion, tok, stopwords);
   // rasta: add ASCIIFoldingFilter to enable accent-insensitive search
   tok = new ASCIIFoldingFilter(tok);
   return new TokenStreamComponents(src, tok) {
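      // Re-apply the analyzer's current maxTokenLength each time a new reader is set.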
     @Override
     protected boolean reset(final Reader reader) throws IOException {
       src.setMaxTokenLength(PhaidraAnalyzer.this.maxTokenLength);
       return super.reset(reader);
     }
   };
 }
 @Override
 public Tokenizer create(Reader reader) {
   StandardTokenizer tokenizer = new StandardTokenizer(version, reader);
   tokenizer.setMaxTokenLength(maxTokenLength);
   return tokenizer;
 }
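Usage sketch (not part of the snippets above): each of these factories and analyzers ultimately hands the caller a TokenStream that is consumed with the standard reset -> incrementToken -> end -> close cycle. The self-contained example below assumes a recent Lucene release where StandardTokenizer has the no-argument constructor used earlier in this collection; the sample text, the 255 max token length, and the class name TokenizeExample are illustrative only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenizeExample {
  public static void main(String[] args) throws IOException {
    // Build the tokenizer the same way the factories above do.
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(255); // the knob the factories expose as maxTokenLength
    tokenizer.setReader(new StringReader("testing 1234 héllo wörld"));

    // Attributes expose the current token's text and character offsets.
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);

    // Standard consumption cycle: reset, incrementToken loop, end, close.
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString() + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
    }
    tokenizer.end();
    tokenizer.close();
  }
}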
  // LUCENE-5897: slow tokenization of strings of the form
  // (\p{WB:ExtendNumLet}[\p{WB:Format}\p{WB:Extend}]*)+
  @Slow
  public void testLargePartiallyMatchingToken() throws Exception {
    // TODO: get these lists of chars matching a property from ICU4J
    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
    char[] WordBreak_ExtendNumLet_chars =
        "_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f".toCharArray();

    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
    // only the first char in each range
    int[] WordBreak_Format_chars = {
      0xAD, 0x600, 0x61C, 0x6DD, 0x70F, 0x180E, 0x200E, 0x202A, 0x2060, 0x2066, 0xFEFF, 0xFFF9,
      0x110BD, 0x1D173, 0xE0001, 0xE0020
    };

    // http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
    // only the first char in each range
    int[] WordBreak_Extend_chars = {
      0x300, 0x483, 0x591, 0x5bf, 0x5c1, 0x5c4, 0x5c7, 0x610, 0x64b, 0x670, 0x6d6, 0x6df, 0x6e7,
      0x6ea, 0x711, 0x730, 0x7a6, 0x7eb, 0x816, 0x81b, 0x825, 0x829, 0x859, 0x8e4, 0x900, 0x93a,
      0x93e, 0x951, 0x962, 0x981, 0x9bc, 0x9be, 0x9c7, 0x9cb, 0x9d7, 0x9e2, 0xa01, 0xa3c, 0xa3e,
      0xa47, 0xa4b, 0xa51, 0xa70, 0xa75, 0xa81, 0xabc, 0xabe, 0xac7, 0xacb, 0xae2, 0xb01, 0xb3c,
      0xb3e, 0xb47, 0xb4b, 0xb56, 0xb62, 0xb82, 0xbbe, 0xbc6, 0xbca, 0xbd7, 0xc01, 0xc3e, 0xc46,
      0xc4a, 0xc55, 0xc62, 0xc82, 0xcbc, 0xcbe, 0xcc6, 0xcca, 0xcd5, 0xce2, 0xd02, 0xd3e, 0xd46,
      0xd4a, 0xd57, 0xd62, 0xd82, 0xdca, 0xdcf, 0xdd6, 0xdd8, 0xdf2, 0xe31, 0xe34, 0xe47, 0xeb1,
      0xeb4, 0xebb, 0xec8, 0xf18, 0xf35, 0xf37, 0xf39, 0xf3e, 0xf71, 0xf86, 0xf8d, 0xf99, 0xfc6,
      0x102b, 0x1056, 0x105e, 0x1062, 0x1067, 0x1071, 0x1082, 0x108f, 0x109a, 0x135d, 0x1712,
      0x1732, 0x1752, 0x1772, 0x17b4, 0x17dd, 0x180b, 0x18a9, 0x1920, 0x1930, 0x19b0, 0x19c8,
      0x1a17, 0x1a55, 0x1a60, 0x1a7f, 0x1b00, 0x1b34, 0x1b6b, 0x1b80, 0x1ba1, 0x1be6, 0x1c24,
      0x1cd0, 0x1cd4, 0x1ced, 0x1cf2, 0x1dc0, 0x1dfc, 0x200c, 0x20d0, 0x2cef, 0x2d7f, 0x2de0,
      0x302a, 0x3099, 0xa66f, 0xa674, 0xa69f, 0xa6f0, 0xa802, 0xa806, 0xa80b, 0xa823, 0xa880,
      0xa8b4, 0xa8e0, 0xa926, 0xa947, 0xa980, 0xa9b3, 0xaa29, 0xaa43, 0xaa4c, 0xaa7b, 0xaab0,
      0xaab2, 0xaab7, 0xaabe, 0xaac1, 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xfb1e, 0xfe00, 0xfe20,
      0xff9e, 0x101fd, 0x10a01, 0x10a05, 0x10a0C, 0x10a38, 0x10a3F, 0x11000, 0x11001, 0x11038,
      0x11080, 0x11082, 0x110b0, 0x110b3, 0x110b7, 0x110b9, 0x11100, 0x11127, 0x1112c, 0x11180,
      0x11182, 0x111b3, 0x111b6, 0x111bF, 0x116ab, 0x116ac, 0x116b0, 0x116b6, 0x16f51, 0x16f8f,
      0x1d165, 0x1d167, 0x1d16d, 0x1d17b, 0x1d185, 0x1d1aa, 0x1d242, 0xe0100
    };

    StringBuilder builder = new StringBuilder();
    int numChars = TestUtil.nextInt(random(), 100 * 1024, 1024 * 1024);
    for (int i = 0; i < numChars; ) {
      builder.append(
          WordBreak_ExtendNumLet_chars[random().nextInt(WordBreak_ExtendNumLet_chars.length)]);
      ++i;
      if (random().nextBoolean()) {
        int numFormatExtendChars = TestUtil.nextInt(random(), 1, 8);
        for (int j = 0; j < numFormatExtendChars; ++j) {
          int codepoint;
          if (random().nextBoolean()) {
            codepoint = WordBreak_Format_chars[random().nextInt(WordBreak_Format_chars.length)];
          } else {
            codepoint = WordBreak_Extend_chars[random().nextInt(WordBreak_Extend_chars.length)];
          }
          char[] chars = Character.toChars(codepoint);
          builder.append(chars);
          i += chars.length;
        }
      }
    }
    StandardTokenizer ts = new StandardTokenizer();
    ts.setReader(new StringReader(builder.toString()));
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    ts.close();

    int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
    ts.setMaxTokenLength(newBufferSize); // try a different buffer size
    ts.setReader(new StringReader(builder.toString()));
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    ts.close();
  }