Example #1
0
 /**
  * Feeds POSIX-style bracket expressions to the matcher and prints whatever is found.
  *
  * <p>Java's regex engine does not support the POSIX {@code [:name:]} feature, so this test makes
  * no assertions; it only prints the matches so the actual behaviour can be inspected.
  * NOTE(review): presumably these parse as plain character classes of the literal characters —
  * confirm against the {@code java.util.regex.Pattern} documentation.
  */
 @Test
 public void shouldSupportPOSIXExp() {
   final String[] posixExpressions = {"[:lower:]", "[:alpha:]"};
   for (String posixExpression : posixExpressions) {
     Matcher found = RegExpUtils.getMatcher(posixExpression, getAllUnicode());
     RegExpUtils.printFound(found, 200);
   }
 }
Example #2
0
  /**
   * Verifies that {@code (?i:html)} matches both {@code "html"} and {@code "HTML"}.
   *
   * <p>Background: where a flavor supports partial application of a mode, {@code (?i)} turns
   * case-insensitive matching on and {@code (?-i)} turns it off; some flavors also accept the span
   * forms {@code (?i:⋯)} and {@code (?-i:⋯)}, which apply only to the enclosed subexpression.
   *
   * <p>This test supplies both the inline span and the {@code CASE_INSENSITIVE | UNICODE_CASE}
   * flags. After each full-match assertion the matcher is reset so the printing helper starts
   * from the beginning of the input.
   */
  @Test
  public void testCaseSensitive() {
    final String lowerCaseInput = "html";
    final String upperCaseInput = "HTML";
    final int caseInsensitiveFlags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    Matcher htmlMatcher =
        RegExpUtils.getMatcher("(?i:html)", lowerCaseInput, caseInsensitiveFlags);

    // Lower-case input: must match as a whole, then rewind and print.
    assertTrue(htmlMatcher.matches());
    RegExpUtils.printFound(htmlMatcher.reset(), 200);

    // Upper-case input: swap the input on the same matcher and repeat.
    assertTrue(htmlMatcher.reset(upperCaseInput).matches());
    RegExpUtils.printFound(htmlMatcher.reset(), 200);
  }
Example #3
0
  /**
   * Checks the digit shorthand {@code \d} (inside a character class) and its negation {@code \D}
   * against the input {@code "12345d"}: five digit matches and one non-digit match are expected.
   * The non-digit matches are printed at the end.
   */
  @Test
  public void testDigitalMatcher() {
    final String sample = "12345d";

    // Five digits in the sample.
    assertEquals(5, RegExpUtils.countFound(RegExpUtils.getMatcher("[\\d]", sample)));

    // Only the trailing 'd' is a non-digit.
    assertEquals(1, RegExpUtils.countFound(RegExpUtils.getMatcher("\\D", sample)));

    // A fresh matcher is built for printing, since the counting one was already consumed.
    RegExpUtils.printFound(RegExpUtils.getMatcher("\\D", sample), OUTPUT_STRING_WIDTH);
  }
Example #4
0
  /**
   * Walks a list of Unicode block properties ({@code \p{In...}}) and prints the characters each
   * one finds in the text returned by {@code getAllUnicode()}.
   *
   * <p>Any block whose lookup or printing throws is reported on standard output as not supported,
   * and the walk continues with the next block. No assertions are made.
   *
   * <p>Original author's note: simplified Chinese characters are matched by
   * {@code \p{InCJKUnifiedIdeographs}} and traditional Chinese characters by
   * {@code \p{InCJKCompatibilityIdeographs}}. NOTE(review): not verified by this test — confirm.
   */
  @Test
  public void shouldPrintLanguages() {
    final String[] unicodeBlocks = {
      "\\p{InAlphabeticPresentationForms}",
      "\\p{InArabic}",
      "\\p{InArabicPresentationFormsA}",
      "\\p{InArabicPresentationFormsB}",
      "\\p{InArabicSupplement}",
      "\\p{InArmenian}",
      "\\p{InArrows}",
      "\\p{InBalinese}",
      "\\p{InBasicLatin}",
      "\\p{InBengali}",
      "\\p{InBlockElements}",
      "\\p{InBopomofo}",
      "\\p{InBopomofoExtended}",
      "\\p{InBoxDrawing}",
      "\\p{InBraillePatterns}",
      "\\p{InBuginese}",
      "\\p{InBuhid}",
      "\\p{InCham}",
      "\\p{InCherokee}",
      "\\p{InCJKCompatibility}",
      "\\p{InCJKCompatibilityForms}",
      "\\p{InCJKCompatibilityIdeographs}",
      "\\p{InCJKRadicalsSupplement}",
      "\\p{InCJKStrokes}",
      "\\p{InCJKSymbolsandPunctuation}",
      "\\p{InCJKUnifiedIdeographs}",
      "\\p{InCJKUnifiedIdeographsExtensionA}",
      "\\p{InCombiningDiacriticalMarks}",
      "\\p{InCombiningDiacriticalMarksforSymbols}",
      "\\p{InCombiningDiacriticalMarksSupplement}",
      "\\p{InCombiningHalfMarks}",
      "\\p{InControlPictures}",
      "\\p{InCoptic}",
      "\\p{InCurrencySymbols}",
      "\\p{InCyrillic}",
      "\\p{InCyrillicExtendedA}",
      "\\p{InCyrillicExtendedB}",
      "\\p{InCyrillicSupplement}",
      "\\p{InDevanagari}",
      "\\p{InDingbats}",
      "\\p{InEnclosedAlphanumerics}",
      "\\p{InEnclosedCJKLettersandMonths}",
      "\\p{InEthiopic}",
      "\\p{InEthiopicExtended}",
      "\\p{InEthiopicSupplement}",
      "\\p{InGeneralPunctuation}",
      "\\p{InGeometricShapes}",
      "\\p{InGeorgian}",
      "\\p{InGeorgianSupplement}",
      "\\p{InGlagolitic}",
      "\\p{InGreekandCoptic}",
      "\\p{InGreekExtended}",
      "\\p{InGujarati}",
      "\\p{InGurmukhi}",
      "\\p{InHalfwidthandFullwidthForms}",
      "\\p{InHangulCompatibilityJamo}",
      "\\p{InHangulJamo}",
      "\\p{InHangulSyllables}",
      "\\p{InHanunoo}",
      "\\p{InHebrew}",
      "\\p{InHighPrivateUseSurrogates}",
      "\\p{InHighSurrogates}",
      "\\p{InHiragana}",
      "\\p{InIdeographicDescriptionCharacters}",
      "\\p{InIPAExtensions}",
      "\\p{InKanbun}",
      "\\p{InKangxiRadicals}",
      "\\p{InKannada}",
      "\\p{InKatakana}",
      "\\p{InKatakanaPhoneticExtensions}",
      "\\p{InKayahLi}",
      "\\p{InKhmer}",
      "\\p{InKhmerSymbols}",
      "\\p{InLao}",
      "\\p{InLatin1Supplement}",
      "\\p{InLatinExtendedA}",
      "\\p{InLatinExtendedAdditional}",
      "\\p{InLatinExtendedB}",
      "\\p{InLatinExtendedC}",
      "\\p{InLatinExtendedD}",
      "\\p{InLepcha}",
      "\\p{InLetterlikeSymbols}",
      "\\p{InLimbu}",
      "\\p{InLowSurrogates}",
      "\\p{InMalayalam}",
      "\\p{InMathematicalOperators}",
      "\\p{InMiscellaneousMathematicalSymbolsA}",
      "\\p{InMiscellaneousMathematicalSymbolsB}",
      "\\p{InMiscellaneousSymbols}",
      "\\p{InMiscellaneousSymbolsandArrows}",
      "\\p{InMiscellaneousTechnical}",
      "\\p{InModifierToneLetters}",
      "\\p{InMongolian}",
      "\\p{InMyanmar}",
      "\\p{InNewTaiLue}",
      "\\p{InNKo}",
      "\\p{InNumberForms}",
      "\\p{InOgham}",
      "\\p{InOlChiki}",
      "\\p{InOpticalCharacterRecognition}",
      "\\p{InOriya}",
      "\\p{InPhagspa}",
      "\\p{InPhoneticExtensions}",
      "\\p{InPhoneticExtensionsSupplement}",
      "\\p{InPrivateUseArea}",
      "\\p{InRejang}",
      "\\p{InRunic}",
      "\\p{InSaurashtra}",
      "\\p{InSinhala}",
      "\\p{InSmallFormVariants}",
      "\\p{InSpacingModifierLetters}",
      "\\p{InSpecials}",
      "\\p{InSundanese}",
      "\\p{InSuperscriptsandSubscripts}",
      "\\p{InSupplementalArrowsA}",
      "\\p{InSupplementalArrowsB}",
      "\\p{InSupplementalMathematicalOperators}",
      "\\p{InSupplementalPunctuation}",
      "\\p{InSylotiNagri}",
      "\\p{InSyriac}",
      "\\p{InTagalog}",
      "\\p{InTagbanwa}",
      "\\p{InTaiLe}",
      "\\p{InTamil}",
      "\\p{InTelugu}",
      "\\p{InThaana}",
      "\\p{InThai}",
      "\\p{InTibetan}",
      "\\p{InTifinagh}",
      "\\p{InUnifiedCanadianAboriginalSyllabics}",
      "\\p{InVai}",
      "\\p{InVariationSelectors}",
      "\\p{InVerticalForms}",
      "\\p{InYijingHexagramSymbols}",
      "\\p{InYiRadicals}",
      "\\p{InYiSyllables}"
    };

    for (String blockPattern : unicodeBlocks) {
      try {
        Matcher blockMatcher = RegExpUtils.getMatcher(blockPattern, getAllUnicode());
        RegExpUtils.printFound(blockMatcher, 200);
      } catch (Exception unsupported) {
        // Best-effort survey: report the block and carry on with the next one.
        System.out.println("!!!not supported:" + blockPattern);
      }
    }
  }
Example #5
0
  /**
   * Prints, for each Unicode general-category property, the characters it finds in the text
   * returned by {@code getAllUnicode()}.
   *
   * <p>The cased-letter categories are printed first, then the composite shorthand
   * {@code \p{L&}} is requested and the test fails unless that request throws, and finally the
   * remaining categories are printed in order. Apart from that single check, no assertions are
   * made.
   */
  @Test
  public void testMatchProperties() {
    final String[] casedLetterCategories = {
      // \p{Lowercase_Letter} Lowercase letters.
      "\\p{Ll}",
      // \p{Uppercase_Letter} Uppercase letters.
      "\\p{Lu}",
      // \p{Titlecase_Letter} Letters that appear at the start of a word (e.g., the character ǅ
      // is the title case of the lowercase ǆ and of the uppercase Ǆ).
      "\\p{Lt}"
    };

    final String[] remainingCategories = {
      // \p{Letter} Things considered letters.
      "\\p{L}",
      // \p{Mark} Various characters that are not meant to appear by themselves, but with other
      // base characters (accent marks, enclosing boxes, ...).
      "\\p{M}",
      // \p{Separator} Characters that separate things, but have no visual representation
      // (various kinds of spaces ...).
      "\\p{Z}",
      // \p{Symbol} Various types of Dingbats and symbols.
      "\\p{S}",
      // \p{Number} Any kind of numeric character.
      "\\p{N}",
      // \p{Punctuation} Punctuation characters.
      "\\p{P}",
      // \p{Other} Catch-all for everything else (rarely used for normal characters).
      "\\p{C}",

      // \p{Modifier_Letter} A small set of letter-like special-use characters.
      "\\p{Lm}",
      // \p{Other_Letter} Letters that have no case, and aren't modifiers, including letters from
      // Hebrew, Arabic, Bengali, Tibetan, Japanese, ...
      "\\p{Lo}",
      // \p{Non_Spacing_Mark} "Characters" that modify other characters, such as accents,
      // umlauts, certain "vowel signs," and tone marks.
      "\\p{Mn}",
      // \p{Spacing_Combining_Mark} Modification characters that take up space of their own
      // (mostly "vowel signs" in languages that have them, including Bengali, Gujarati, Tamil,
      // Telugu, Kannada, Malayalam, Sinhala, Myanmar, and Khmer).
      "\\p{Mc}",
      // \p{Enclosing_Mark} A small set of marks that can enclose other characters, such as
      // circles, squares, diamonds, and "keycaps."
      "\\p{Me}",
      // \p{Space_Separator} Various kinds of spacing characters, such as a normal space,
      // non-break space, and various spaces of specific widths.
      "\\p{Zs}",
      // \p{Line_Separator} The LINE SEPARATOR character (U+2028).
      "\\p{Zl}",
      // \p{Paragraph_Separator} The PARAGRAPH SEPARATOR character (U+2029).
      "\\p{Zp}",
      // \p{Math_Symbol} +, ÷, a fraction slash (⁄), ...
      "\\p{Sm}",
      // \p{Currency_Symbol} $, ¢, ¥, €, ...
      "\\p{Sc}",
      // \p{Modifier_Symbol} Mostly versions of the combining characters, but as full-fledged
      // characters in their own right.
      "\\p{Sk}",
      // \p{Other_Symbol} Various Dingbats, box-drawing symbols, Braille patterns, non-letter
      // Chinese characters, ...
      "\\p{So}",
      // \p{Decimal_Digit_Number} Zero through nine, in various scripts (not including Chinese,
      // Japanese, and Korean).
      "\\p{Nd}",
      // \p{Letter_Number} Mostly Roman numerals.
      "\\p{Nl}",
      // \p{Other_Number} Numbers as superscripts or symbols; characters representing numbers
      // that aren't digits (Chinese, Japanese, and Korean not included).
      "\\p{No}",
      // \p{Dash_Punctuation} Hyphens and dashes of all sorts.
      "\\p{Pd}",
      // \p{Open_Punctuation} Characters like (, ︽, and 《, ...
      "\\p{Ps}",
      // \p{Close_Punctuation} Characters like ), ︾, and 》, ...
      "\\p{Pe}",
      // \p{Initial_Punctuation} Characters like «, “, ‹, ...
      "\\p{Pi}",
      // \p{Final_Punctuation} Characters like », ”, ›, ...
      "\\p{Pf}",
      // \p{Connector_Punctuation} A few punctuation characters with special linguistic meaning,
      // such as an underscore.
      "\\p{Pc}",
      // \p{Other_Punctuation} Catch-all for other punctuation: !, &, ., :, ;, ...
      "\\p{Po}",
      // \p{Control} The ASCII and Latin-1 control characters (TAB, LF, CR, ...)
      "\\p{Cc}",
      // \p{Format} Non-visible characters intended to indicate some basic formatting (zero
      // width joiner, activate Arabic form shaping, ...)
      "\\p{Cf}",
      // \p{Private_Use} Code points allocated for private use (company logos, etc.).
      "\\p{Co}",
      // \p{Unassigned} Code points that have no characters assigned.
      "\\p{Cn}"
    };

    for (String category : casedLetterCategories) {
      Matcher categoryMatcher = RegExpUtils.getMatcher(category, getAllUnicode());
      RegExpUtils.printFound(categoryMatcher, OUTPUT_STRING_WIDTH);
    }

    try {
      // \p{L&} is a composite shorthand matching all \p{Ll}, \p{Lu}, and \p{Lt} characters in
      // flavors that have it; this test expects the request to be rejected here.
      RegExpUtils.getMatcher("\\p{L&}", getAllUnicode());
      fail("The java not support this pattern");
    } catch (Exception ignored) {
      // Expected: the shorthand is not supported, so there is nothing to print.
    }

    for (String category : remainingCategories) {
      Matcher categoryMatcher = RegExpUtils.getMatcher(category, getAllUnicode());
      RegExpUtils.printFound(categoryMatcher, OUTPUT_STRING_WIDTH);
    }
  }
Example #6
0
 /**
  * Prints the characters that the Thai Unicode block property, wrapped in a character class,
  * finds in the text returned by {@code getAllUnicode()}. No assertions are made.
  */
 @Test
 public void checkThaiCharacters() {
   final String thaiBlockClass = "[\\p{InThai}]";
   RegExpUtils.printFound(
       RegExpUtils.getMatcher(thaiBlockClass, getAllUnicode()), OUTPUT_STRING_WIDTH);
 }