/**
 * Drains {@code stream} and checks it against the expected output.
 *
 * <p>For each expected token, in order, the stream must advance, expose that exact term string,
 * and be typed {@code TokenType.HASHTAG} if and only if the matching flag is {@code true}. Once
 * all expected tokens are consumed the stream must report that it is exhausted.
 *
 * @param tokens expected term strings, in emission order
 * @param isHashtag expected hashtag flag for the token at the same index of {@code tokens}
 */
private void verify(List<String> tokens, boolean[] isHashtag) {
  int position = 0;
  for (String expectedTerm : tokens) {
    assertTrue(stream.incrementToken());
    assertEquals(expectedTerm, termAttr.getTermString());
    assertEquals(isHashtag[position++], TokenType.HASHTAG.equals(typeAttr.getType()));
  }
  // Nothing may be left in the stream after the expected tokens.
  assertFalse(stream.incrementToken());
}
@Before public void setup() { stream = new HashtagTokenCombiner( // This toknizes text into alphabet-only, number-only tokens // and '#', '_'. new RegexExtractor.Builder() .setRegexPattern(Pattern.compile("([0-9]+|[a-zA-Z]+|\\p{InKatakana}+|#|_)")) .build()); termAttr = stream.getAttribute(CharSequenceTermAttribute.class); typeAttr = stream.getAttribute(TokenTypeAttribute.class); }
/** Several '#'-prefixed words in a row are each expected as one token typed as a hashtag. */
@Test
public void testMultipleHashtags() {
  stream.reset("#this #is #a #hashtag");

  List<String> expectedTokens = ImmutableList.of("#this", "#is", "#a", "#hashtag");
  boolean[] expectedHashtagFlags = {true, true, true, true};
  verify(expectedTokens, expectedHashtagFlags);
}
/**
 * A '#' that directly follows a word ("not#hashtag") is expected to stay a separate token, and
 * none of the emitted tokens may be typed as a hashtag.
 */
@Test
public void testNotHashtag() {
  stream.reset("this is also not#hashtag");

  List<String> expectedTokens = ImmutableList.of("this", "is", "also", "not", "#", "hashtag");
  boolean[] expectedHashtagFlags = {false, false, false, false, false, false};
  verify(expectedTokens, expectedHashtagFlags);
}
/** A '#' followed by a Katakana word is expected as a single token typed as a hashtag. */
@Test
public void testJAHashtag() {
  stream.reset("this is now #ハッシュタグ");

  List<String> expectedTokens = ImmutableList.of("this", "is", "now", "#ハッシュタグ");
  boolean[] expectedHashtagFlags = {false, false, false, true};
  verify(expectedTokens, expectedHashtagFlags);
}
/**
 * Plain words followed by one '#'-prefixed word: only the last token is expected to be typed as
 * a hashtag.
 */
@Test
public void testSingleHashtag() {
  stream.reset("this is a #hashtag");

  List<String> expectedTokens = ImmutableList.of("this", "is", "a", "#hashtag");
  boolean[] expectedHashtagFlags = {false, false, false, true};
  verify(expectedTokens, expectedHashtagFlags);
}
/**
 * A hashtag made of letter runs, underscores and digits ("#hash_tag_123") is expected to come
 * out as one single token typed as a hashtag.
 */
@Test
public void testHashtagWithMultipleTokens() {
  stream.reset("#hash_tag_123");

  List<String> expectedTokens = ImmutableList.of("#hash_tag_123");
  boolean[] expectedHashtagFlags = {true};
  verify(expectedTokens, expectedHashtagFlags);
}