/**
 * Prints whatever the POSIX-style bracket expressions {@code [:lower:]} and
 * {@code [:alpha:]} find in the text returned by {@code getAllUnicode()}.
 *
 * <p>Java's regex flavor does not support POSIX bracket expressions, so these patterns
 * are not expected to behave like "all lowercase letters" / "all alphabetic characters".
 * The test makes no assertion; it only prints the matches for inspection.
 */
@Test
public void shouldSupportPOSIXExp() {
    Matcher lowerMatcher = RegExpUtils.getMatcher("[:lower:]", getAllUnicode());
    RegExpUtils.printFound(lowerMatcher, 200);

    Matcher alphaMatcher = RegExpUtils.getMatcher("[:alpha:]", getAllUnicode());
    RegExpUtils.printFound(alphaMatcher, 200);
}
/**
 * Exercises the mode-modified span {@code (?i:html)} against a lowercase and an
 * uppercase input.
 *
 * <p>Where a flavor supports partial application of a mode, {@code (?i)} turns
 * case-insensitive matching on and {@code (?-i)} turns it off. Some flavors also accept
 * {@code (?i:...)} and {@code (?-i:...)}, which switch the mode on or off only for the
 * enclosed subexpression.
 *
 * <p>Both {@code "html"} and {@code "HTML"} must match in full. After each check the
 * matcher is reset and its matches are printed.
 *
 * <p>NOTE(review): the matcher is also requested with {@code Pattern.CASE_INSENSITIVE},
 * so this test presumably cannot distinguish the inline modifier from the global flag;
 * confirm whether that is intended.
 */
@Test
public void testCaseSensitive() {
    final String lowerInput = "html";
    final String upperInput = "HTML";
    final int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    Matcher htmlMatcher = RegExpUtils.getMatcher("(?i:html)", lowerInput, flags);

    // Lowercase input: whole-input match, then rewind so printing starts from the beginning.
    assertTrue(htmlMatcher.matches());
    htmlMatcher.reset();
    RegExpUtils.printFound(htmlMatcher, 200);

    // Same matcher, re-targeted at the uppercase input.
    htmlMatcher.reset(upperInput);
    assertTrue(htmlMatcher.matches());
    htmlMatcher.reset();
    RegExpUtils.printFound(htmlMatcher, 200);
}
/**
 * Checks the digit shorthand and its negation against the input {@code "12345d"}:
 * {@code [\d]} is expected to be found five times and {@code \D} once. The non-digit
 * matches are then printed.
 */
@Test
public void testDigitalMatcher() {
    final String input = "12345d";

    Matcher digits = RegExpUtils.getMatcher("[\\d]", input);
    assertEquals(5, RegExpUtils.countFound(digits));

    Matcher nonDigits = RegExpUtils.getMatcher("\\D", input);
    assertEquals(1, RegExpUtils.countFound(nonDigits));

    // A fresh matcher is built for printing rather than reusing the counted one
    // (presumably because countFound advances the matcher; verify against RegExpUtils).
    Matcher nonDigitsToPrint = RegExpUtils.getMatcher("\\D", input);
    RegExpUtils.printFound(nonDigitsToPrint, OUTPUT_STRING_WIDTH);
}
/**
 * Runs every Unicode block pattern listed below against the text returned by
 * {@code getAllUnicode()} and prints the matches for each one.
 *
 * <p>If building the matcher or printing throws an exception, the pattern is reported on
 * standard output as not supported and the loop moves on to the next pattern. The test
 * makes no assertion.
 *
 * <p>Original author's note (not verified here): simplified Chinese characters are found
 * with {@code \p{InCJKUnifiedIdeographs}}, traditional Chinese characters with
 * {@code \p{InCJKCompatibilityIdeographs}}.
 */
@Test
public void shouldPrintLanguages() {
    final String[] blockPatterns = {
        "\\p{InAlphabeticPresentationForms}",
        "\\p{InArabic}",
        "\\p{InArabicPresentationFormsA}",
        "\\p{InArabicPresentationFormsB}",
        "\\p{InArabicSupplement}",
        "\\p{InArmenian}",
        "\\p{InArrows}",
        "\\p{InBalinese}",
        "\\p{InBasicLatin}",
        "\\p{InBengali}",
        "\\p{InBlockElements}",
        "\\p{InBopomofo}",
        "\\p{InBopomofoExtended}",
        "\\p{InBoxDrawing}",
        "\\p{InBraillePatterns}",
        "\\p{InBuginese}",
        "\\p{InBuhid}",
        "\\p{InCham}",
        "\\p{InCherokee}",
        "\\p{InCJKCompatibility}",
        "\\p{InCJKCompatibilityForms}",
        "\\p{InCJKCompatibilityIdeographs}",
        "\\p{InCJKRadicalsSupplement}",
        "\\p{InCJKStrokes}",
        "\\p{InCJKSymbolsandPunctuation}",
        "\\p{InCJKUnifiedIdeographs}",
        "\\p{InCJKUnifiedIdeographsExtensionA}",
        "\\p{InCombiningDiacriticalMarks}",
        "\\p{InCombiningDiacriticalMarksforSymbols}",
        "\\p{InCombiningDiacriticalMarksSupplement}",
        "\\p{InCombiningHalfMarks}",
        "\\p{InControlPictures}",
        "\\p{InCoptic}",
        "\\p{InCurrencySymbols}",
        "\\p{InCyrillic}",
        "\\p{InCyrillicExtendedA}",
        "\\p{InCyrillicExtendedB}",
        "\\p{InCyrillicSupplement}",
        "\\p{InDevanagari}",
        "\\p{InDingbats}",
        "\\p{InEnclosedAlphanumerics}",
        "\\p{InEnclosedCJKLettersandMonths}",
        "\\p{InEthiopic}",
        "\\p{InEthiopicExtended}",
        "\\p{InEthiopicSupplement}",
        "\\p{InGeneralPunctuation}",
        "\\p{InGeometricShapes}",
        "\\p{InGeorgian}",
        "\\p{InGeorgianSupplement}",
        "\\p{InGlagolitic}",
        "\\p{InGreekandCoptic}",
        "\\p{InGreekExtended}",
        "\\p{InGujarati}",
        "\\p{InGurmukhi}",
        "\\p{InHalfwidthandFullwidthForms}",
        "\\p{InHangulCompatibilityJamo}",
        "\\p{InHangulJamo}",
        "\\p{InHangulSyllables}",
        "\\p{InHanunoo}",
        "\\p{InHebrew}",
        "\\p{InHighPrivateUseSurrogates}",
        "\\p{InHighSurrogates}",
        "\\p{InHiragana}",
        "\\p{InIdeographicDescriptionCharacters}",
        "\\p{InIPAExtensions}",
        "\\p{InKanbun}",
        "\\p{InKangxiRadicals}",
        "\\p{InKannada}",
        "\\p{InKatakana}",
        "\\p{InKatakanaPhoneticExtensions}",
        "\\p{InKayahLi}",
        "\\p{InKhmer}",
        "\\p{InKhmerSymbols}",
        "\\p{InLao}",
        "\\p{InLatin1Supplement}",
        "\\p{InLatinExtendedA}",
        "\\p{InLatinExtendedAdditional}",
        "\\p{InLatinExtendedB}",
        "\\p{InLatinExtendedC}",
        "\\p{InLatinExtendedD}",
        "\\p{InLepcha}",
        "\\p{InLetterlikeSymbols}",
        "\\p{InLimbu}",
        "\\p{InLowSurrogates}",
        "\\p{InMalayalam}",
        "\\p{InMathematicalOperators}",
        "\\p{InMiscellaneousMathematicalSymbolsA}",
        "\\p{InMiscellaneousMathematicalSymbolsB}",
        "\\p{InMiscellaneousSymbols}",
        "\\p{InMiscellaneousSymbolsandArrows}",
        "\\p{InMiscellaneousTechnical}",
        "\\p{InModifierToneLetters}",
        "\\p{InMongolian}",
        "\\p{InMyanmar}",
        "\\p{InNewTaiLue}",
        "\\p{InNKo}",
        "\\p{InNumberForms}",
        "\\p{InOgham}",
        "\\p{InOlChiki}",
        "\\p{InOpticalCharacterRecognition}",
        "\\p{InOriya}",
        "\\p{InPhagspa}",
        "\\p{InPhoneticExtensions}",
        "\\p{InPhoneticExtensionsSupplement}",
        "\\p{InPrivateUseArea}",
        "\\p{InRejang}",
        "\\p{InRunic}",
        "\\p{InSaurashtra}",
        "\\p{InSinhala}",
        "\\p{InSmallFormVariants}",
        "\\p{InSpacingModifierLetters}",
        "\\p{InSpecials}",
        "\\p{InSundanese}",
        "\\p{InSuperscriptsandSubscripts}",
        "\\p{InSupplementalArrowsA}",
        "\\p{InSupplementalArrowsB}",
        "\\p{InSupplementalMathematicalOperators}",
        "\\p{InSupplementalPunctuation}",
        "\\p{InSylotiNagri}",
        "\\p{InSyriac}",
        "\\p{InTagalog}",
        "\\p{InTagbanwa}",
        "\\p{InTaiLe}",
        "\\p{InTamil}",
        "\\p{InTelugu}",
        "\\p{InThaana}",
        "\\p{InThai}",
        "\\p{InTibetan}",
        "\\p{InTifinagh}",
        "\\p{InUnifiedCanadianAboriginalSyllabics}",
        "\\p{InVai}",
        "\\p{InVariationSelectors}",
        "\\p{InVerticalForms}",
        "\\p{InYijingHexagramSymbols}",
        "\\p{InYiRadicals}",
        "\\p{InYiSyllables}"
    };

    for (int i = 0; i < blockPatterns.length; i++) {
        final String blockPattern = blockPatterns[i];
        try {
            Matcher blockMatcher = RegExpUtils.getMatcher(blockPattern, getAllUnicode());
            RegExpUtils.printFound(blockMatcher, 200);
        } catch (Exception unsupported) {
            // Best-effort survey: report the pattern and continue with the next block.
            System.out.println("!!!not supported:" + blockPattern);
        }
    }
}
/**
 * Prints, for each Unicode general-category property below, the matches found in the
 * text returned by {@code getAllUnicode()}, and checks that the composite shorthand
 * {@code \p{L&}} is rejected.
 *
 * <p>The categories are processed in a fixed order: the three cased-letter categories,
 * then the {@code \p{L&}} probe, then all remaining categories. Only the probe can fail
 * the test; every other step just prints.
 */
@Test
public void testMatchProperties() {
    final String[] casedLetterProperties = {
        "\\p{Ll}", // Lowercase_Letter: lowercase letters.
        "\\p{Lu}", // Uppercase_Letter: uppercase letters.
        "\\p{Lt}"  // Titlecase_Letter: letters used at the start of a word (e.g. U+01C5 is
                   // the title case of lowercase U+01C6 and of uppercase U+01C4).
    };
    for (String property : casedLetterProperties) {
        Matcher matcher = RegExpUtils.getMatcher(property, getAllUnicode());
        RegExpUtils.printFound(matcher, OUTPUT_STRING_WIDTH);
    }

    // \p{L&} is a composite shorthand (in flavors that have it) for \p{Ll}, \p{Lu} and
    // \p{Lt}. Building a matcher for it is expected to throw.
    try {
        RegExpUtils.getMatcher("\\p{L&}", getAllUnicode());
        fail("The java not support this pattern");
    } catch (Exception expected) {
        // Intentionally ignored: an exception here is the outcome this test wants.
        // Assuming fail(...) is JUnit's, it signals with an AssertionError, which is not
        // an Exception and therefore still propagates past this handler.
    }

    final String[] remainingProperties = {
        "\\p{L}",  // Letter: things considered letters.
        "\\p{M}",  // Mark: characters not meant to appear by themselves, but with other
                   // base characters (accent marks, enclosing boxes, ...).
        "\\p{Z}",  // Separator: characters that separate things but have no visual
                   // representation (various kinds of spaces, ...).
        "\\p{S}",  // Symbol: various types of dingbats and symbols.
        "\\p{N}",  // Number: any kind of numeric character.
        "\\p{P}",  // Punctuation: punctuation characters.
        "\\p{C}",  // Other: catch-all for everything else (rarely used for normal
                   // characters).
        "\\p{Lm}", // Modifier_Letter: a small set of letter-like special-use characters.
        "\\p{Lo}", // Other_Letter: letters that have no case and are not modifiers,
                   // including letters from Hebrew, Arabic, Bengali, Tibetan, Japanese, ...
        "\\p{Mn}", // Non_Spacing_Mark: "characters" that modify other characters, such as
                   // accents, umlauts, certain vowel signs and tone marks.
        "\\p{Mc}", // Spacing_Combining_Mark: modification characters that take up space of
                   // their own (mostly vowel signs in languages that have them, including
                   // Bengali, Gujarati, Tamil, Telugu, Kannada, Malayalam, Sinhala,
                   // Myanmar and Khmer).
        "\\p{Me}", // Enclosing_Mark: a small set of marks that can enclose other
                   // characters, such as circles, squares, diamonds and "keycaps".
        "\\p{Zs}", // Space_Separator: spacing characters such as a normal space, a
                   // non-break space and spaces of specific widths.
        "\\p{Zl}", // Line_Separator: the LINE SEPARATOR character (U+2028).
        "\\p{Zp}", // Paragraph_Separator: the PARAGRAPH SEPARATOR character (U+2029).
        "\\p{Sm}", // Math_Symbol: plus sign, division sign, fraction slash, ...
        "\\p{Sc}", // Currency_Symbol: dollar, cent, yen, euro, ...
        "\\p{Sk}", // Modifier_Symbol: mostly versions of the combining characters, but as
                   // full-fledged characters in their own right.
        "\\p{So}", // Other_Symbol: various dingbats, box-drawing symbols, Braille
                   // patterns, non-letter Chinese characters, ...
        "\\p{Nd}", // Decimal_Digit_Number: zero through nine in various scripts (not
                   // including Chinese, Japanese and Korean).
        "\\p{Nl}", // Letter_Number: mostly Roman numerals.
        "\\p{No}", // Other_Number: numbers as superscripts or symbols; characters
                   // representing numbers that are not digits (Chinese, Japanese and
                   // Korean not included).
        "\\p{Pd}", // Dash_Punctuation: hyphens and dashes of all sorts.
        "\\p{Ps}", // Open_Punctuation: opening brackets such as a left parenthesis or a
                   // left double angle bracket.
        "\\p{Pe}", // Close_Punctuation: closing brackets such as a right parenthesis or a
                   // right double angle bracket.
        "\\p{Pi}", // Initial_Punctuation: opening quotation marks and similar.
        "\\p{Pf}", // Final_Punctuation: closing quotation marks and similar.
        "\\p{Pc}", // Connector_Punctuation: a few punctuation characters with special
                   // linguistic meaning, such as an underscore.
        "\\p{Po}", // Other_Punctuation: catch-all for other punctuation: !, &, ., :, ...
        "\\p{Cc}", // Control: the ASCII and Latin-1 control characters (TAB, LF, CR, ...).
        "\\p{Cf}", // Format: non-visible characters intended to indicate some basic
                   // formatting (zero width joiner, activate Arabic form shaping, ...).
        "\\p{Co}", // Private_Use: code points allocated for private use (company logos,
                   // etc.).
        "\\p{Cn}"  // Unassigned: code points that have no characters assigned.
    };
    for (String property : remainingProperties) {
        Matcher matcher = RegExpUtils.getMatcher(property, getAllUnicode());
        RegExpUtils.printFound(matcher, OUTPUT_STRING_WIDTH);
    }
}
/**
 * Prints the matches of the Thai Unicode block, written as the character class
 * {@code [\p{InThai}]}, in the text returned by {@code getAllUnicode()}. The test makes
 * no assertion.
 */
@Test
public void checkThaiCharacters() {
    final String thaiBlockClass = "[\\p{InThai}]";
    Matcher thaiMatcher = RegExpUtils.getMatcher(thaiBlockClass, getAllUnicode());
    RegExpUtils.printFound(thaiMatcher, OUTPUT_STRING_WIDTH);
}