Beispiel #1
0
  /**
   * Checks how {@code \w} behaves on input that mixes CJK ideographs with ASCII letters.
   *
   * <p>The assertions below show that, for the matcher returned by {@code RegExpUtils.getMatcher}
   * without extra flags, the ideographs are not treated as word characters (Java's default
   * {@code \w} is {@code [a-zA-Z_0-9]}).
   */
  @Test
  public void wordMatch() {
    // "中国" contains no \w character, so the mandatory \w+ part can never be satisfied.
    Matcher matcher = RegExpUtils.getMatcher("[^\\w]*\\w+", "中国");
    assertFalse(matcher.matches());

    matcher = RegExpUtils.getMatcher("[^\\w]*\\w+", "中国abs中国abs");

    // Each int[] holds {start, end} of one match: "中国abs" twice, back to back.
    List<int[]> l = RegExpUtils.getAllMatchBoundsAsList(matcher);
    assertEquals(2, l.size());
    assertEquals(0, l.get(0)[0]);
    assertEquals(5, l.get(0)[1]);
    assertEquals(5, l.get(1)[0]);
    assertEquals(10, l.get(1)[1]);

    Matcher matcher2 = RegExpUtils.getMatcher(".*\\w+", "中国abs中国abs");
    /*
     * The leftmost match wins and ".*" is greedy, so a single find() covers the
     * whole input instead of producing two shorter matches.
     */
    assertTrue(matcher2.find());
    assertEquals(0, matcher2.start());
    assertEquals(10, matcher2.end());
  }
Beispiel #2
0
 @Test
 public void shouldSupportPOSIXExp() {
   /*
    * Java's regex flavor has no POSIX bracket expressions such as [:lower:];
    * the matches found for each pattern are only printed (200 is passed as the
    * second argument of printFound), nothing is asserted.
    */
   for (String bracketExpression : new String[] {"[:lower:]", "[:alpha:]"}) {
     Matcher found = RegExpUtils.getMatcher(bracketExpression, getAllUnicode());
     RegExpUtils.printFound(found, 200);
   }
 }
Beispiel #3
0
 /**
  * Verifies the {@code \G} anchor, which only lets a match start where the previous match ended
  * (or at the beginning of the input for the first match).
  *
  * <p>For {@code "abcDDde"} the pattern {@code \G[a-z]} therefore yields exactly the three
  * consecutive matches {@code a}, {@code b} and {@code c}; the chain is broken by the first
  * {@code D}, so the trailing {@code d} and {@code e} are never reached.
  */
 @Test
 public void testG() {
   Matcher matcher = RegExpUtils.getMatcher("\\G[a-z]", "abcDDde");
   // Each int[] holds {start, end} of one match.
   List<int[]> result = RegExpUtils.getAllMatchBoundsAsList(matcher);
   assertEquals(3, result.size());
   assertEquals(0, result.get(0)[0]);
   assertEquals(1, result.get(0)[1]);
   assertEquals(1, result.get(1)[0]);
   assertEquals(2, result.get(1)[1]);
   assertEquals(2, result.get(2)[0]);
   assertEquals(3, result.get(2)[1]);
 }
Beispiel #4
0
  /**
   * Exercises case-insensitive matching. The pattern turns it on twice: locally through the
   * {@code (?i:...)} group and globally through the {@code CASE_INSENSITIVE | UNICODE_CASE}
   * flags, so both the lower-case and the upper-case input must match in full.
   */
  @Test
  public void testCaseSensitive() {
    final String lowerCaseInput = "html";
    final String upperCaseInput = "HTML";
    final int caseFlags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    Matcher htmlMatcher = RegExpUtils.getMatcher("(?i:html)", lowerCaseInput, caseFlags);
    assertTrue(htmlMatcher.matches());
    // matches() consumed the input; rewind before printing what find() sees.
    htmlMatcher.reset();
    RegExpUtils.printFound(htmlMatcher, 200);

    // Same compiled pattern, new input.
    htmlMatcher.reset(upperCaseInput);
    assertTrue(htmlMatcher.matches());
    htmlMatcher.reset();
    RegExpUtils.printFound(htmlMatcher, 200);
  }
Beispiel #5
0
  @Test
  public void shouldSupportTheUnicodeProperties() {
    /*
     * Unicode-aware regex flavors expose character properties through
     * \p{quality} (has the property) and \P{quality} (lacks it). The general
     * category "L" means "letter"; here it is written with the "Is" prefix
     * and is expected to be found in a single CJK ideograph.
     */
    assertTrue(RegExpUtils.getMatcher("\\p{IsL}", "中").find());
  }
Beispiel #6
0
  @Test
  public void testDigitalMatcher() {
    final String sample = "12345d";

    // \d inside a character class: one hit per digit.
    assertEquals(5, RegExpUtils.countFound(RegExpUtils.getMatcher("[\\d]", sample)));

    // \D is the complement: only the trailing letter qualifies.
    assertEquals(1, RegExpUtils.countFound(RegExpUtils.getMatcher("\\D", sample)));

    // A fresh matcher is built for the print-out.
    Matcher nonDigits = RegExpUtils.getMatcher("\\D", sample);
    RegExpUtils.printFound(nonDigits, OUTPUT_STRING_WIDTH);
  }
Beispiel #7
0
  /**
   * Counts where the anchors {@code ^}, {@code $}, {@code \A}, {@code \Z} and {@code \z} match in a
   * two-line input that ends with a newline, first in the default mode and then with
   * {@link Pattern#MULTILINE}.
   */
  @Test
  public void testLineModes() {
    String forTest =
        "to be a great man ,you must pay great effort\nAnd you will meet the most difficult things like the bitch\n";

    /*
     * Default mode: ^ matches only at the start of the input.
     */
    long found = RegExpUtils.countFound(RegExpUtils.getMatcher("^", forTest));
    assertEquals(1L, found);
    /*
     * Default mode: $ matches at the very end of the input and also just
     * before the input-ending newline, hence two hits.
     */
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("$", forTest));
    assertEquals(2L, found);

    /*
     * Mode-independent anchors, default mode: \Z behaves like the default $
     * (two hits), \A like the default ^ (one hit), and \z matches only at
     * the very end of the input (one hit).
     */
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\Z", forTest));
    assertEquals(2L, found);
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\A", forTest));
    assertEquals(1L, found);
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\z", forTest));
    assertEquals(1L, found);

    /*
     * MULTILINE (enhanced line-anchor mode): $ additionally matches before
     * every embedded newline, giving three hits for this input.
     */
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("$", forTest, Pattern.MULTILINE));
    assertEquals(3L, found);
    /*
     * \A, \Z and \z are not affected by MULTILINE and keep their default
     * counts.
     */
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\A", forTest, Pattern.MULTILINE));
    assertEquals(1L, found);
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\Z", forTest, Pattern.MULTILINE));
    assertEquals(2L, found);
    found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\z", forTest, Pattern.MULTILINE));
    assertEquals(1L, found);
  }
Beispiel #8
0
  @Test
  public void shouldPrintLanguages() {
    /*
     * Dumps, block by block, which characters of getAllUnicode() fall into
     * each Unicode block pattern listed below. Patterns the running JDK
     * rejects are reported on stdout rather than failing the test.
     *
     * Original author's note (not verified by this test): simplified Chinese
     * characters are matched by \p{InCJKUnifiedIdeographs}, traditional ones
     * by \p{InCJKCompatibilityIdeographs}.
     */
    String[] languages = {
      "\\p{InAlphabeticPresentationForms}",
      "\\p{InArabic}",
      "\\p{InArabicPresentationFormsA}",
      "\\p{InArabicPresentationFormsB}",
      "\\p{InArabicSupplement}",
      "\\p{InArmenian}",
      "\\p{InArrows}",
      "\\p{InBalinese}",
      "\\p{InBasicLatin}",
      "\\p{InBengali}",
      "\\p{InBlockElements}",
      "\\p{InBopomofo}",
      "\\p{InBopomofoExtended}",
      "\\p{InBoxDrawing}",
      "\\p{InBraillePatterns}",
      "\\p{InBuginese}",
      "\\p{InBuhid}",
      "\\p{InCham}",
      "\\p{InCherokee}",
      "\\p{InCJKCompatibility}",
      "\\p{InCJKCompatibilityForms}",
      "\\p{InCJKCompatibilityIdeographs}",
      "\\p{InCJKRadicalsSupplement}",
      "\\p{InCJKStrokes}",
      "\\p{InCJKSymbolsandPunctuation}",
      "\\p{InCJKUnifiedIdeographs}",
      "\\p{InCJKUnifiedIdeographsExtensionA}",
      "\\p{InCombiningDiacriticalMarks}",
      "\\p{InCombiningDiacriticalMarksforSymbols}",
      "\\p{InCombiningDiacriticalMarksSupplement}",
      "\\p{InCombiningHalfMarks}",
      "\\p{InControlPictures}",
      "\\p{InCoptic}",
      "\\p{InCurrencySymbols}",
      "\\p{InCyrillic}",
      "\\p{InCyrillicExtendedA}",
      "\\p{InCyrillicExtendedB}",
      "\\p{InCyrillicSupplement}",
      "\\p{InDevanagari}",
      "\\p{InDingbats}",
      "\\p{InEnclosedAlphanumerics}",
      "\\p{InEnclosedCJKLettersandMonths}",
      "\\p{InEthiopic}",
      "\\p{InEthiopicExtended}",
      "\\p{InEthiopicSupplement}",
      "\\p{InGeneralPunctuation}",
      "\\p{InGeometricShapes}",
      "\\p{InGeorgian}",
      "\\p{InGeorgianSupplement}",
      "\\p{InGlagolitic}",
      "\\p{InGreekandCoptic}",
      "\\p{InGreekExtended}",
      "\\p{InGujarati}",
      "\\p{InGurmukhi}",
      "\\p{InHalfwidthandFullwidthForms}",
      "\\p{InHangulCompatibilityJamo}",
      "\\p{InHangulJamo}",
      "\\p{InHangulSyllables}",
      "\\p{InHanunoo}",
      "\\p{InHebrew}",
      "\\p{InHighPrivateUseSurrogates}",
      "\\p{InHighSurrogates}",
      "\\p{InHiragana}",
      "\\p{InIdeographicDescriptionCharacters}",
      "\\p{InIPAExtensions}",
      "\\p{InKanbun}",
      "\\p{InKangxiRadicals}",
      "\\p{InKannada}",
      "\\p{InKatakana}",
      "\\p{InKatakanaPhoneticExtensions}",
      "\\p{InKayahLi}",
      "\\p{InKhmer}",
      "\\p{InKhmerSymbols}",
      "\\p{InLao}",
      "\\p{InLatin1Supplement}",
      "\\p{InLatinExtendedA}",
      "\\p{InLatinExtendedAdditional}",
      "\\p{InLatinExtendedB}",
      "\\p{InLatinExtendedC}",
      "\\p{InLatinExtendedD}",
      "\\p{InLepcha}",
      "\\p{InLetterlikeSymbols}",
      "\\p{InLimbu}",
      "\\p{InLowSurrogates}",
      "\\p{InMalayalam}",
      "\\p{InMathematicalOperators}",
      "\\p{InMiscellaneousMathematicalSymbolsA}",
      "\\p{InMiscellaneousMathematicalSymbolsB}",
      "\\p{InMiscellaneousSymbols}",
      "\\p{InMiscellaneousSymbolsandArrows}",
      "\\p{InMiscellaneousTechnical}",
      "\\p{InModifierToneLetters}",
      "\\p{InMongolian}",
      "\\p{InMyanmar}",
      "\\p{InNewTaiLue}",
      "\\p{InNKo}",
      "\\p{InNumberForms}",
      "\\p{InOgham}",
      "\\p{InOlChiki}",
      "\\p{InOpticalCharacterRecognition}",
      "\\p{InOriya}",
      "\\p{InPhagspa}",
      "\\p{InPhoneticExtensions}",
      "\\p{InPhoneticExtensionsSupplement}",
      "\\p{InPrivateUseArea}",
      "\\p{InRejang}",
      "\\p{InRunic}",
      "\\p{InSaurashtra}",
      "\\p{InSinhala}",
      "\\p{InSmallFormVariants}",
      "\\p{InSpacingModifierLetters}",
      "\\p{InSpecials}",
      "\\p{InSundanese}",
      "\\p{InSuperscriptsandSubscripts}",
      "\\p{InSupplementalArrowsA}",
      "\\p{InSupplementalArrowsB}",
      "\\p{InSupplementalMathematicalOperators}",
      "\\p{InSupplementalPunctuation}",
      "\\p{InSylotiNagri}",
      "\\p{InSyriac}",
      "\\p{InTagalog}",
      "\\p{InTagbanwa}",
      "\\p{InTaiLe}",
      "\\p{InTamil}",
      "\\p{InTelugu}",
      "\\p{InThaana}",
      "\\p{InThai}",
      "\\p{InTibetan}",
      "\\p{InTifinagh}",
      "\\p{InUnifiedCanadianAboriginalSyllabics}",
      "\\p{InVai}",
      "\\p{InVariationSelectors}",
      "\\p{InVerticalForms}",
      "\\p{InYijingHexagramSymbols}",
      "\\p{InYiRadicals}",
      "\\p{InYiSyllables}"
    };

    for (String blockPattern : languages) {
      try {
        Matcher blockMatcher = RegExpUtils.getMatcher(blockPattern, getAllUnicode());
        RegExpUtils.printFound(blockMatcher, 200);
      } catch (Exception unsupported) {
        // Best effort: a block name this JDK does not know is only logged.
        System.out.println("!!!not supported:" + blockPattern);
      }
    }
  }
Beispiel #9
0
  @Test
  public void testMatchProperties() {
    /*
     * Prints, one general category at a time, the characters of
     * getAllUnicode() that each Unicode property pattern finds. The only
     * assertion is that the composite shorthand for cased letters is rejected.
     */

    // Cased-letter categories, printed before the unsupported composite check.
    String[] casedLetterProperties = {
      "\\p{Ll}", // Lowercase_Letter: lowercase letters.
      "\\p{Lu}", // Uppercase_Letter: uppercase letters.
      "\\p{Lt}" // Titlecase_Letter: letters that appear at the start of a word.
    };
    for (String property : casedLetterProperties) {
      Matcher propertyMatcher = RegExpUtils.getMatcher(property, getAllUnicode());
      RegExpUtils.printFound(propertyMatcher, OUTPUT_STRING_WIDTH);
    }

    try {
      // Composite shorthand for Ll + Lu + Lt in other flavors; expected to be rejected here.
      RegExpUtils.getMatcher("\\p{L&}", getAllUnicode());
      fail("The java not support this pattern");
    } catch (Exception expected) {
      // Intentionally empty: the exception is the outcome under test.
    }

    String[] remainingProperties = {
      // Top-level general categories.
      "\\p{L}", // Letter: things considered letters.
      "\\p{M}", // Mark: accents, enclosing boxes etc. that combine with a base character.
      "\\p{Z}", // Separator: characters that separate things but have no visual form.
      "\\p{S}", // Symbol: various Dingbats and symbols.
      "\\p{N}", // Number: any kind of numeric character.
      "\\p{P}", // Punctuation: punctuation characters.
      "\\p{C}", // Other: catch-all for everything else.
      // Letter sub-categories without case.
      "\\p{Lm}", // Modifier_Letter: a small set of letter-like special-use characters.
      "\\p{Lo}", // Other_Letter: caseless, non-modifier letters (Hebrew, Arabic, ...).
      // Mark sub-categories.
      "\\p{Mn}", // Non_Spacing_Mark: accents, umlauts, some vowel signs, tone marks.
      "\\p{Mc}", // Spacing_Combining_Mark: modifiers that take up space of their own.
      "\\p{Me}", // Enclosing_Mark: circles, squares, diamonds, "keycaps".
      // Separator sub-categories.
      "\\p{Zs}", // Space_Separator: space, non-break space, fixed-width spaces.
      "\\p{Zl}", // Line_Separator: the LINE SEPARATOR character (U+2028).
      "\\p{Zp}", // Paragraph_Separator: the PARAGRAPH SEPARATOR character (U+2029).
      // Symbol sub-categories.
      "\\p{Sm}", // Math_Symbol: +, ÷, fraction slash, ...
      "\\p{Sc}", // Currency_Symbol: $, ¢, ¥, €, ...
      "\\p{Sk}", // Modifier_Symbol: stand-alone versions of combining characters.
      "\\p{So}", // Other_Symbol: Dingbats, box drawing, Braille patterns, ...
      // Number sub-categories.
      "\\p{Nd}", // Decimal_Digit_Number: zero through nine in various scripts.
      "\\p{Nl}", // Letter_Number: mostly Roman numerals.
      "\\p{No}", // Other_Number: superscript/symbol numbers and other non-digit numbers.
      // Punctuation sub-categories.
      "\\p{Pd}", // Dash_Punctuation: hyphens and dashes of all sorts.
      "\\p{Ps}", // Open_Punctuation: characters like (, ︽, 《, ...
      "\\p{Pe}", // Close_Punctuation: characters like ), 》, ...
      "\\p{Pi}", // Initial_Punctuation: opening quotation marks.
      "\\p{Pf}", // Final_Punctuation: closing quotation marks.
      "\\p{Pc}", // Connector_Punctuation: e.g. the underscore.
      "\\p{Po}", // Other_Punctuation: !, &, ., :, ...
      // "Other" sub-categories.
      "\\p{Cc}", // Control: ASCII and Latin-1 control characters (TAB, LF, CR, ...).
      "\\p{Cf}", // Format: non-visible formatting characters (zero width joiner, ...).
      "\\p{Co}", // Private_Use: code points allocated for private use.
      "\\p{Cn}" // Unassigned: code points that have no character assigned.
    };
    for (String property : remainingProperties) {
      Matcher propertyMatcher = RegExpUtils.getMatcher(property, getAllUnicode());
      RegExpUtils.printFound(propertyMatcher, OUTPUT_STRING_WIDTH);
    }
  }
Beispiel #10
0
  @Test
  public void defaultMatchModeShouldBeMultiLine() {
    // For the one-character input "\n", "$" is expected to be found twice:
    // once before the input-ending newline and once at the very end.
    assertEquals(2L, RegExpUtils.countFound(RegExpUtils.getMatcher("$", "\n")));
  }
Beispiel #11
0
 @Test
 public void checkThaiCharacters() {
   // Print-only check: dumps every character of getAllUnicode() found by the Thai block pattern.
   RegExpUtils.printFound(
       RegExpUtils.getMatcher("[\\p{InThai}]", getAllUnicode()), OUTPUT_STRING_WIDTH);
 }