@Test public void wordMatch() { // The "/w" just support[a-zA-Z0-9],do not support any unicode // characters. Matcher matcher = RegExpUtils.getMatcher("[^\\w]*\\w+", "中国"); assertFalse(matcher.matches()); matcher = RegExpUtils.getMatcher("[^\\w]*\\w+", "中国abs中国abs"); List<int[]> l = RegExpUtils.getAllMatchBoundsAsList(matcher); assertEquals(l.size(), 2); assertEquals(0, ((l.get(0)))[0]); assertEquals(5, ((l.get(0)))[1]); assertEquals(5, ((l.get(1)))[0]); assertEquals(10, ((l.get(1)))[1]); Matcher matcher2 = RegExpUtils.getMatcher(".*\\w+", "中国abs中国abs"); /* * The matcher will treat the left most as the highest priority. */ assertTrue(matcher2.find()); assertEquals(0, matcher2.start()); assertEquals(10, matcher2.end()); }
/**
 * Prints whatever the POSIX-style expressions {@code [:lower:]} and {@code [:alpha:]} find in
 * the full Unicode sample. Nothing is asserted; the output is for manual inspection.
 */
@Test
public void shouldSupportPOSIXExp() {
    /*
     * The java regexp does not support POSIX bracket expressions.
     * NOTE(review): java.util.regex presumably reads each of these as an ordinary character
     * class of the listed characters - confirm against the printed output.
     */
    for (String posixExpression : new String[] {"[:lower:]", "[:alpha:]"}) {
        Matcher found = RegExpUtils.getMatcher(posixExpression, getAllUnicode());
        RegExpUtils.printFound(found, 200);
    }
}
/**
 * If you wish one search to pick up where the last one left off you can use the "\G" pattern
 * element. If the string hasn't been searched before, then "\G" matches the beginning of the
 * String.
 *
 * <p>Illustration (written against a generic {@code Regex} API, not java.util.regex):
 *
 * <pre>
 * Regex r = new Regex("\\Gfoo");
 * String x = "foofoo foo";
 * System.out.println(r.search(x));
 * System.out.println(r.search(x));
 * System.out.println(r.search(x));
 * // Prints true, true, false.
 * </pre>
 *
 * <p>The test below expects exactly three consecutive one-character matches ("a", "b", "c")
 * in "abcDDde": the first upper-case letter breaks the chain, so the later "de" is not found.
 */
@Test
public void testG() {
    Matcher matcher = RegExpUtils.getMatcher("\\G[a-z]", "abcDDde");
    List<int[]> result = RegExpUtils.getAllMatchBoundsAsList(matcher);

    // Expected value first, actual second, so failure messages read correctly.
    assertEquals(3, result.size());
    // Each entry is read as {start, end} of one match.
    assertEquals(0, result.get(0)[0]);
    assertEquals(1, result.get(0)[1]);
    assertEquals(1, result.get(1)[0]);
    assertEquals(2, result.get(1)[1]);
    assertEquals(2, result.get(2)[0]);
    assertEquals(3, result.get(2)[1]);
}
/**
 * Where a flavor supports it, a mode can be applied to only part of a pattern: (?i) switches
 * case-insensitive matching on and (?-i) switches it off again. Some flavors additionally
 * accept (?i:⋯) and (?-i:⋯), which scope the mode to the enclosed subexpression.
 *
 * <p>This test matches the scoped form against both a lower-case and an upper-case input and
 * prints what was found each time.
 */
@Test
public void testCaseSensitive() {
    final String lowerCaseInput = "html";
    final String upperCaseInput = "HTML";
    final int caseInsensitiveFlags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    Matcher scopedMatcher =
            RegExpUtils.getMatcher("(?i:html)", lowerCaseInput, caseInsensitiveFlags);

    // Lower-case input: must match as a whole; rewind before printing the matches.
    assertTrue(scopedMatcher.matches());
    scopedMatcher.reset();
    RegExpUtils.printFound(scopedMatcher, 200);

    // Same matcher, re-targeted at the upper-case input.
    scopedMatcher.reset(upperCaseInput);
    assertTrue(scopedMatcher.matches());
    scopedMatcher.reset();
    RegExpUtils.printFound(scopedMatcher, 200);
}
/**
 * Confirms that a Unicode general-property test is available by looking for a "letter" in a
 * single CJK character.
 */
@Test
public void shouldSupportTheUnicodeProperties() {
    /*
     * Support for Unicode qualities differs between regex flavors, but many Unicode-enabled
     * programs can test at least some of them with \p{quality} (characters that have the
     * quality) and \P{quality} (characters that lack it). The basic example is \p{L}, where
     * 'L' means "letter" (as opposed to number, punctuation, accents, etc.). 'L' is a general
     * property (also called a category); other qualities can be tested with \p{⋯} and \P{⋯}
     * too, but the general properties are the most commonly supported ones.
     */
    assertTrue(RegExpUtils.getMatcher("\\p{IsL}", "中").find());
}
/**
 * Counts digit and non-digit matches in "12345d", then prints the non-digit matches.
 */
@Test
public void testDigitalMatcher() {
    final String input = "12345d";

    // Five digits are expected for the digit class.
    Matcher digits = RegExpUtils.getMatcher("[\\d]", input);
    assertEquals(5, RegExpUtils.countFound(digits));

    // Only the trailing 'd' is a non-digit.
    Matcher nonDigits = RegExpUtils.getMatcher("\\D", input);
    assertEquals(1, RegExpUtils.countFound(nonDigits));

    // A fresh matcher is built for printing instead of reusing the counted one.
    Matcher nonDigitsForPrinting = RegExpUtils.getMatcher("\\D", input);
    RegExpUtils.printFound(nonDigitsForPrinting, OUTPUT_STRING_WIDTH);
}
@Test public void testLineModes() { /* * Has enhanced line-anchor mode */ String forTest = "to be a great man ,you must pay great effort\nAnd you will meet the most difficult things like the bitch\n"; /* * ^ matches at start of string */ long found = RegExpUtils.countFound(RegExpUtils.getMatcher("^", forTest)); assertEquals(1L, found); /* * $ matches at end of string,and matches before string-ending newline. */ found = RegExpUtils.countFound(RegExpUtils.getMatcher("$", forTest)); assertEquals(2L, found); found = RegExpUtils.countFound(RegExpUtils.getMatcher("$", forTest)); assertEquals(2L, found); /* * In enhanced line-anchor mode ... ^ matches at start of string * * ^ matches after any newline $ matches at end of string $ matches * before any newline \A always matches like normal ^ \Z always matches * like normal $ \z always matches only at end of string */ found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\Z", forTest)); assertEquals(2L, found); found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\A", forTest)); assertEquals(1L, found); found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\z", forTest)); assertEquals(1L, found); // multiline mode found = RegExpUtils.countFound(RegExpUtils.getMatcher("$", forTest, Pattern.MULTILINE)); assertEquals(3L, found); found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\A", forTest, Pattern.MULTILINE)); assertEquals(1L, found); /* * Always like the normal '$' */ found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\Z", forTest, Pattern.MULTILINE)); assertEquals(2L, found); /* * Always matches end of String. */ found = RegExpUtils.countFound(RegExpUtils.getMatcher("\\z", forTest, Pattern.MULTILINE)); assertEquals(1L, found); }
/**
 * Prints, for every listed Unicode block property, what it finds in the full Unicode sample.
 * A property that cannot be used is reported on standard output instead of failing the test.
 */
@Test
public void shouldPrintLanguages() {
    /*
     * NOTE(review): the original author's note says simplified Chinese characters are matched
     * by \p{InCJKUnifiedIdeographs} and traditional ones by \p{InCJKCompatibilityIdeographs};
     * that split looks doubtful - confirm against the printed output.
     */
    String[] blockProperties = {
        "\\p{InAlphabeticPresentationForms}",
        "\\p{InArabic}",
        "\\p{InArabicPresentationFormsA}",
        "\\p{InArabicPresentationFormsB}",
        "\\p{InArabicSupplement}",
        "\\p{InArmenian}",
        "\\p{InArrows}",
        "\\p{InBalinese}",
        "\\p{InBasicLatin}",
        "\\p{InBengali}",
        "\\p{InBlockElements}",
        "\\p{InBopomofo}",
        "\\p{InBopomofoExtended}",
        "\\p{InBoxDrawing}",
        "\\p{InBraillePatterns}",
        "\\p{InBuginese}",
        "\\p{InBuhid}",
        "\\p{InCham}",
        "\\p{InCherokee}",
        "\\p{InCJKCompatibility}",
        "\\p{InCJKCompatibilityForms}",
        "\\p{InCJKCompatibilityIdeographs}",
        "\\p{InCJKRadicalsSupplement}",
        "\\p{InCJKStrokes}",
        "\\p{InCJKSymbolsandPunctuation}",
        "\\p{InCJKUnifiedIdeographs}",
        "\\p{InCJKUnifiedIdeographsExtensionA}",
        "\\p{InCombiningDiacriticalMarks}",
        "\\p{InCombiningDiacriticalMarksforSymbols}",
        "\\p{InCombiningDiacriticalMarksSupplement}",
        "\\p{InCombiningHalfMarks}",
        "\\p{InControlPictures}",
        "\\p{InCoptic}",
        "\\p{InCurrencySymbols}",
        "\\p{InCyrillic}",
        "\\p{InCyrillicExtendedA}",
        "\\p{InCyrillicExtendedB}",
        "\\p{InCyrillicSupplement}",
        "\\p{InDevanagari}",
        "\\p{InDingbats}",
        "\\p{InEnclosedAlphanumerics}",
        "\\p{InEnclosedCJKLettersandMonths}",
        "\\p{InEthiopic}",
        "\\p{InEthiopicExtended}",
        "\\p{InEthiopicSupplement}",
        "\\p{InGeneralPunctuation}",
        "\\p{InGeometricShapes}",
        "\\p{InGeorgian}",
        "\\p{InGeorgianSupplement}",
        "\\p{InGlagolitic}",
        "\\p{InGreekandCoptic}",
        "\\p{InGreekExtended}",
        "\\p{InGujarati}",
        "\\p{InGurmukhi}",
        "\\p{InHalfwidthandFullwidthForms}",
        "\\p{InHangulCompatibilityJamo}",
        "\\p{InHangulJamo}",
        "\\p{InHangulSyllables}",
        "\\p{InHanunoo}",
        "\\p{InHebrew}",
        "\\p{InHighPrivateUseSurrogates}",
        "\\p{InHighSurrogates}",
        "\\p{InHiragana}",
        "\\p{InIdeographicDescriptionCharacters}",
        "\\p{InIPAExtensions}",
        "\\p{InKanbun}",
        "\\p{InKangxiRadicals}",
        "\\p{InKannada}",
        "\\p{InKatakana}",
        "\\p{InKatakanaPhoneticExtensions}",
        "\\p{InKayahLi}",
        "\\p{InKhmer}",
        "\\p{InKhmerSymbols}",
        "\\p{InLao}",
        "\\p{InLatin1Supplement}",
        "\\p{InLatinExtendedA}",
        "\\p{InLatinExtendedAdditional}",
        "\\p{InLatinExtendedB}",
        "\\p{InLatinExtendedC}",
        "\\p{InLatinExtendedD}",
        "\\p{InLepcha}",
        "\\p{InLetterlikeSymbols}",
        "\\p{InLimbu}",
        "\\p{InLowSurrogates}",
        "\\p{InMalayalam}",
        "\\p{InMathematicalOperators}",
        "\\p{InMiscellaneousMathematicalSymbolsA}",
        "\\p{InMiscellaneousMathematicalSymbolsB}",
        "\\p{InMiscellaneousSymbols}",
        "\\p{InMiscellaneousSymbolsandArrows}",
        "\\p{InMiscellaneousTechnical}",
        "\\p{InModifierToneLetters}",
        "\\p{InMongolian}",
        "\\p{InMyanmar}",
        "\\p{InNewTaiLue}",
        "\\p{InNKo}",
        "\\p{InNumberForms}",
        "\\p{InOgham}",
        "\\p{InOlChiki}",
        "\\p{InOpticalCharacterRecognition}",
        "\\p{InOriya}",
        "\\p{InPhagspa}",
        "\\p{InPhoneticExtensions}",
        "\\p{InPhoneticExtensionsSupplement}",
        "\\p{InPrivateUseArea}",
        "\\p{InRejang}",
        "\\p{InRunic}",
        "\\p{InSaurashtra}",
        "\\p{InSinhala}",
        "\\p{InSmallFormVariants}",
        "\\p{InSpacingModifierLetters}",
        "\\p{InSpecials}",
        "\\p{InSundanese}",
        "\\p{InSuperscriptsandSubscripts}",
        "\\p{InSupplementalArrowsA}",
        "\\p{InSupplementalArrowsB}",
        "\\p{InSupplementalMathematicalOperators}",
        "\\p{InSupplementalPunctuation}",
        "\\p{InSylotiNagri}",
        "\\p{InSyriac}",
        "\\p{InTagalog}",
        "\\p{InTagbanwa}",
        "\\p{InTaiLe}",
        "\\p{InTamil}",
        "\\p{InTelugu}",
        "\\p{InThaana}",
        "\\p{InThai}",
        "\\p{InTibetan}",
        "\\p{InTifinagh}",
        "\\p{InUnifiedCanadianAboriginalSyllabics}",
        "\\p{InVai}",
        "\\p{InVariationSelectors}",
        "\\p{InVerticalForms}",
        "\\p{InYijingHexagramSymbols}",
        "\\p{InYiRadicals}",
        "\\p{InYiSyllables}"
    };

    for (String blockProperty : blockProperties) {
        try {
            Matcher blockMatcher = RegExpUtils.getMatcher(blockProperty, getAllUnicode());
            RegExpUtils.printFound(blockMatcher, 200);
        } catch (Exception e) {
            // Best effort: report the property and carry on with the next one.
            System.out.println("!!!not supported:" + blockProperty);
        }
    }
}
/**
 * Prints, for each Unicode general-category property, what it finds in the full Unicode
 * sample, and checks along the way that the composite shorthand \p{L&} is rejected.
 * The properties are processed in a fixed order; apart from the \p{L&} check nothing is
 * asserted.
 */
@Test
public void testMatchProperties() {
    // Cased-letter categories, printed before the \p{L&} probe.
    String[] casedLetterProperties = {
        // \p{Lowercase_Letter}: lowercase letters.
        "\\p{Ll}",
        // \p{Uppercase_Letter}: uppercase letters.
        "\\p{Lu}",
        // \p{Titlecase_Letter}: letters that appear at the start of a word (for example a
        // digraph character such as 'ǅ', the title case of 'ǆ' and 'Ǆ').
        "\\p{Lt}"
    };
    for (String property : casedLetterProperties) {
        Matcher found = RegExpUtils.getMatcher(property, getAllUnicode());
        RegExpUtils.printFound(found, OUTPUT_STRING_WIDTH);
    }

    try {
        // \p{L&} is a composite shorthand matching all \p{Ll}, \p{Lu}, and \p{Lt}
        // characters; building a matcher for it is expected to throw.
        RegExpUtils.getMatcher("\\p{L&}", getAllUnicode());
        fail("The java not support this pattern");
    } catch (Exception expected) {
        // Rejection is the expected outcome. fail(...) is presumably JUnit's, which raises
        // an AssertionError (an Error, not an Exception), so this catch does not hide it.
    }

    String[] remainingProperties = {
        // \p{Letter}: things considered letters.
        "\\p{L}",
        // \p{Mark}: characters not meant to appear by themselves, but with other base
        // characters (accent marks, enclosing boxes, ...).
        "\\p{M}",
        // \p{Separator}: characters that separate things but have no visual representation
        // (various kinds of spaces ...).
        "\\p{Z}",
        // \p{Symbol}: various types of Dingbats and symbols.
        "\\p{S}",
        // \p{Number}: any kind of numeric character.
        "\\p{N}",
        // \p{Punctuation}: punctuation characters.
        "\\p{P}",
        // \p{Other}: catch-all for everything else (rarely used for normal characters).
        "\\p{C}",
        // \p{Modifier_Letter}: a small set of letter-like special-use characters.
        "\\p{Lm}",
        // \p{Other_Letter}: letters that have no case and aren't modifiers, including letters
        // from Hebrew, Arabic, Bengali, Tibetan, Japanese, ...
        "\\p{Lo}",
        // \p{Non_Spacing_Mark}: "characters" that modify other characters, such as accents,
        // umlauts, certain "vowel signs," and tone marks.
        "\\p{Mn}",
        // \p{Spacing_Combining_Mark}: modification characters that take up space of their own
        // (mostly "vowel signs" in languages that have them, including Bengali, Gujarati,
        // Tamil, Telugu, Kannada, Malayalam, Sinhala, Myanmar, and Khmer).
        "\\p{Mc}",
        // \p{Enclosing_Mark}: a small set of marks that can enclose other characters, such as
        // circles, squares, diamonds, and "keycaps."
        "\\p{Me}",
        // \p{Space_Separator}: various kinds of spacing characters, such as a normal space,
        // non-break space, and various spaces of specific widths.
        "\\p{Zs}",
        // \p{Line_Separator}: the LINE SEPARATOR character (U+2028).
        "\\p{Zl}",
        // \p{Paragraph_Separator}: the PARAGRAPH SEPARATOR character (U+2029).
        "\\p{Zp}",
        // \p{Math_Symbol}: +, ÷, a fraction slash, ...
        "\\p{Sm}",
        // \p{Currency_Symbol}: $, ¢, ¥, €, ...
        "\\p{Sc}",
        // \p{Modifier_Symbol}: mostly versions of the combining characters, but as
        // full-fledged characters in their own right.
        "\\p{Sk}",
        // \p{Other_Symbol}: various Dingbats, box-drawing symbols, Braille patterns,
        // non-letter Chinese characters.
        "\\p{So}",
        // \p{Decimal_Digit_Number}: zero through nine, in various scripts (not including
        // Chinese, Japanese, and Korean).
        "\\p{Nd}",
        // \p{Letter_Number}: mostly Roman numerals.
        "\\p{Nl}",
        // \p{Other_Number}: numbers as superscripts or symbols; characters representing
        // numbers that aren't digits (Chinese, Japanese, and Korean not included).
        "\\p{No}",
        // \p{Dash_Punctuation}: hyphens and dashes of all sorts.
        "\\p{Pd}",
        // \p{Open_Punctuation}: characters like (, ︽, and 《, ...
        "\\p{Ps}",
        // \p{Close_Punctuation}: characters like ), ︾, and 》, ...
        "\\p{Pe}",
        // \p{Initial_Punctuation}: opening quotation marks such as « and “, ...
        "\\p{Pi}",
        // \p{Final_Punctuation}: closing quotation marks such as » and ”, ...
        "\\p{Pf}",
        // \p{Connector_Punctuation}: a few punctuation characters with special linguistic
        // meaning, such as an underscore.
        "\\p{Pc}",
        // \p{Other_Punctuation}: catch-all for other punctuation: !, &, ., :, ...
        "\\p{Po}",
        // \p{Control}: the ASCII and Latin-1 control characters (TAB, LF, CR, ...).
        "\\p{Cc}",
        // \p{Format}: non-visible characters intended to indicate some basic formatting
        // (zero width joiner, activate Arabic form shaping, ...).
        "\\p{Cf}",
        // \p{Private_Use}: code points allocated for private use (company logos, etc.).
        "\\p{Co}",
        // \p{Unassigned}: code points that have no characters assigned.
        "\\p{Cn}"
    };
    for (String property : remainingProperties) {
        Matcher found = RegExpUtils.getMatcher(property, getAllUnicode());
        RegExpUtils.printFound(found, OUTPUT_STRING_WIDTH);
    }
}
/**
 * Expects "$" to be found twice in an input consisting of a single newline.
 *
 * <p>NOTE(review): despite the method name, a count of two does not by itself show that the
 * default mode is multi-line - elsewhere in this file the default-mode "$" is documented as
 * matching both before a string-ending newline and at the end of the string. Confirm what
 * this test is meant to prove.
 */
@Test
public void defaultMatchModeShouldBeMultiLine() {
    final long expectedMatches = 2L;
    long actualMatches = RegExpUtils.countFound(RegExpUtils.getMatcher("$", "\n"));
    assertEquals(expectedMatches, actualMatches);
}
/**
 * Prints whatever the Thai block property finds in the full Unicode sample; nothing is
 * asserted, the output is for manual inspection.
 */
@Test
public void checkThaiCharacters() {
    RegExpUtils.printFound(
            RegExpUtils.getMatcher("[\\p{InThai}]", getAllUnicode()), OUTPUT_STRING_WIDTH);
}