Пример #1
0
 /**
  * Drains {@code stream} and asserts that it yields exactly the expected tokens, in order, and
  * then reports exhaustion.
  *
  * @param tokens expected term strings, in the order the stream should produce them
  * @param isHashtag parallel to {@code tokens}; {@code isHashtag[i]} states whether the type of
  *     token {@code i} is expected to equal {@code TokenType.HASHTAG}
  */
 private void verify(List<String> tokens, boolean[] isHashtag) {
   // Fail fast on a mismatched fixture: a shorter flag array would otherwise surface as an
   // ArrayIndexOutOfBoundsException mid-loop, and a longer one would be silently ignored.
   assertEquals(tokens.size(), isHashtag.length);
   for (int i = 0; i < tokens.size(); i++) {
     assertTrue(stream.incrementToken());
     assertEquals(tokens.get(i), termAttr.getTermString());
     assertEquals(isHashtag[i], TokenType.HASHTAG.equals(typeAttr.getType()));
   }
   // The stream must not produce anything beyond the expected tokens.
   assertFalse(stream.incrementToken());
 }
Пример #2
0
 /** Builds the stream under test and looks up the attributes that the assertions read. */
 @Before
 public void setup() {
   // The underlying extractor tokenizes text into number-only, ASCII-letter-only and
   // Katakana-only runs, plus the single characters '#' and '_'.
   Pattern tokenPattern = Pattern.compile("([0-9]+|[a-zA-Z]+|\\p{InKatakana}+|#|_)");
   stream =
       new HashtagTokenCombiner(
           new RegexExtractor.Builder().setRegexPattern(tokenPattern).build());
   termAttr = stream.getAttribute(CharSequenceTermAttribute.class);
   typeAttr = stream.getAttribute(TokenTypeAttribute.class);
 }
Пример #3
0
 /** Expects every '#'-prefixed word in the input to come out as its own hashtag token. */
 @Test
 public void testMultipleHashtags() {
   ImmutableList<String> expectedTerms = ImmutableList.of("#this", "#is", "#a", "#hashtag");
   boolean[] expectedHashtagFlags = {true, true, true, true};

   stream.reset("#this #is #a #hashtag");
   verify(expectedTerms, expectedHashtagFlags);
 }
Пример #4
0
 /**
  * Expects a '#' that directly follows a word ("not#hashtag") to stay a separate token, with no
  * token in the output typed as a hashtag.
  */
 @Test
 public void testNotHashtag() {
   ImmutableList<String> expectedTerms =
       ImmutableList.of("this", "is", "also", "not", "#", "hashtag");
   boolean[] expectedHashtagFlags = {false, false, false, false, false, false};

   stream.reset("this is also not#hashtag");
   verify(expectedTerms, expectedHashtagFlags);
 }
Пример #5
0
 /** Expects a '#' followed by Katakana text to come out as a single hashtag token. */
 @Test
 public void testJAHashtag() {
   ImmutableList<String> expectedTerms = ImmutableList.of("this", "is", "now", "#ハッシュタグ");
   boolean[] expectedHashtagFlags = {false, false, false, true};

   stream.reset("this is now #ハッシュタグ");
   verify(expectedTerms, expectedHashtagFlags);
 }
Пример #6
0
  /** Expects only the trailing '#'-prefixed word to be typed as a hashtag. */
  @Test
  public void testSingleHashtag() {
    ImmutableList<String> expectedTerms = ImmutableList.of("this", "is", "a", "#hashtag");
    boolean[] expectedHashtagFlags = {false, false, false, true};

    stream.reset("this is a #hashtag");
    verify(expectedTerms, expectedHashtagFlags);
  }
Пример #7
0
 /**
  * Expects a hashtag made of letters, underscores and digits ("#hash_tag_123") to come out as
  * one hashtag token rather than several pieces.
  */
 @Test
 public void testHashtagWithMultipleTokens() {
   ImmutableList<String> expectedTerms = ImmutableList.of("#hash_tag_123");
   boolean[] expectedHashtagFlags = {true};

   stream.reset("#hash_tag_123");
   verify(expectedTerms, expectedHashtagFlags);
 }