Example #1
0
  /**
   * Rebuilds a sentence iterator from the rule source of the default sentence instance and runs
   * the shared iterator checks against a fixed list of expected sentence fragments.
   *
   * <p>A construction failure is reported through errln(); the iterator then stays null and is
   * handed to generalIteratorTest() as-is.
   */
  public void TestDefaultRuleBasedSentenceIteration() {
    logln("Testing the RBBI for sentence iteration using default rules");
    RuleBasedBreakIterator prebuilt = (RuleBasedBreakIterator) BreakIterator.getSentenceInstance();

    // The string form of the prebuilt iterator is used as the rule source for a fresh one.
    String ruleSource = prebuilt.toString();
    RuleBasedBreakIterator rebuilt = null;
    try {
      rebuilt = new RuleBasedBreakIterator(ruleSource);
    } catch (IllegalArgumentException iae) {
      errln(
          "ERROR: failed construction in TestDefaultRuleBasedSentenceIteration()" + iae.toString());
    }

    // Expected fragments, in order; their concatenation is the text under test.
    String[] expectedSentences = {
      "(This is it.) ",
      "Testing the sentence iterator. ",
      "\"This isn\'t it.\" ",
      "Hi! ",
      "This is a simple sample sentence. ",
      "(This is it.) ",
      "This is a simple sample sentence. ",
      "\"This isn\'t it.\" ",
      "Hi! ",
      "This is a simple sample sentence. ",
      "It does not have to make any sense as you can see. ",
      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
      "Che la dritta via aveo smarrita. ",
    };
    List<String> sentdata = new ArrayList<String>();
    for (String sentence : expectedSentences) {
      sentdata.add(sentence);
    }

    generalIteratorTest(rebuilt, sentdata);
  }
Example #2
0
  /**
   * Walks the iterator backwards from last() with previous(), collecting the text fragments
   * between consecutive boundaries in document order.
   *
   * <p>Reports an error if last() is not the text length, if previous() does not move strictly
   * backward, or if DONE is returned before offset 0 was reached.
   *
   * @param rbbi iterator already positioned on {@code text}
   * @param text the text the iterator was set to
   * @return the fragments found, first fragment first
   */
  private List<String> _testLastAndPrevious(RuleBasedBreakIterator rbbi, String text) {
    int pos = rbbi.last();
    if (pos != text.length()) {
      errln("last() returned " + pos + " instead of " + text.length());
    }

    List<String> fragments = new ArrayList<String>();
    int prevPos = pos;
    while (pos != RuleBasedBreakIterator.DONE) {
      pos = rbbi.previous();
      if (pos == RuleBasedBreakIterator.DONE) {
        // Running out of boundaries is only legitimate once the start was reached.
        if (prevPos != 0) {
          errln("previous() returned DONE prematurely: offset was " + prevPos + " instead of 0");
        }
      } else {
        if (pos >= prevPos) {
          errln(
              "previous() failed to move backward: previous() on position "
                  + prevPos
                  + " yielded "
                  + pos);
        }
        // Prepend so the result ends up in forward order.
        fragments.add(0, text.substring(pos, prevPos));
      }
      prevPos = pos;
    }
    return fragments;
  }
Example #3
0
  /**
   * Walks the iterator forwards from first() with next(), collecting the text fragments between
   * consecutive boundaries.
   *
   * <p>Reports an error if first() is not 0, if next() does not move strictly forward, or if DONE
   * is returned before the end of the text was reached.
   *
   * @param rbbi iterator already positioned on {@code text}
   * @param text the text the iterator was set to
   * @return the fragments found, in iteration order
   */
  private List<String> _testFirstAndNext(RuleBasedBreakIterator rbbi, String text) {
    int pos = rbbi.first();
    if (pos != 0) {
      errln("first() returned " + pos + " instead of 0");
    }

    List<String> fragments = new ArrayList<String>();
    int prevPos = pos;
    while (pos != RuleBasedBreakIterator.DONE) {
      pos = rbbi.next();
      if (pos == RuleBasedBreakIterator.DONE) {
        // Running out of boundaries is only legitimate at the very end of the text.
        if (prevPos != text.length()) {
          errln(
              "next() returned DONE prematurely: offset was "
                  + prevPos
                  + " instead of "
                  + text.length());
        }
      } else {
        if (pos <= prevPos) {
          errln("next() failed to move forward: next() on position " + prevPos + " yielded " + pos);
        }
        fragments.add(text.substring(prevPos, pos));
      }
      prevPos = pos;
    }
    return fragments;
  }
Example #4
0
 /**
  * Re-points the working iterator at the full range of the current text, resets the working
  * offset to zero and returns the first boundary reported by the rules iterator.
  *
  * @return the first boundary shifted by the working offset, or {@code BreakIterator.DONE} if
  *     the rules iterator reports DONE
  */
 @Override
 public int first() {
   working.setText(text.getText(), text.getStart(), text.getLength());
   rules.setText(working);
   workingOffset = 0;

   final int boundary = rules.first();
   if (boundary == BreakIterator.DONE) {
     return BreakIterator.DONE;
   }
   return workingOffset + boundary;
 }
Example #5
0
 /*
  * Tests the method public int following(int offset) for an offset that lies before the
  * beginning of the text.
  */
 public void TestFollowing() {
   RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");

   // Covers the branch "else if (offset < fText.getBeginIndex())".
   rbbi.setText("dummy");
   int boundary = rbbi.following(-1);
   if (boundary == 0) {
     return;
   }
   errln(
       "RuleBasedBreakIterator.following(-1) was suppose to return "
           + "0 when the object has a fText of dummy.");
 }
Example #6
0
 /**
  * Produces an independent copy of this iterator. The two rule iterators are always cloned; the
  * text, working and verification buffers are cloned only when they are non-null.
  *
  * @return The clone.
  */
 @Override
 public LaoBreakIterator clone() {
   LaoBreakIterator copy = (LaoBreakIterator) super.clone();
   copy.rules = (RuleBasedBreakIterator) rules.clone();
   copy.verify = (RuleBasedBreakIterator) verify.clone();
   if (text != null) {
     copy.text = text.clone();
   }
   if (working != null) {
     copy.working = working.clone();
   }
   if (verifyText != null) {
     copy.verifyText = verifyText.clone();
   }
   return copy;
 }
Example #7
0
 /*
  * Tests the method public int last() when no text is attached to the iterator.
  */
 public void TestLast() {
   RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");

   // Covers the branch "if (fText == null)".
   rbbi.setText((CharacterIterator) null);
   int boundary = rbbi.last();
   if (boundary == BreakIterator.DONE) {
     return;
   }
   errln(
       "RuleBasedBreakIterator.last() was suppose to return "
           + "BreakIterator.DONE when the object has a null fText.");
 }
Example #8
0
 /*
  * Tests the method public Object clone() on an iterator whose text was set to null: the clone
  * must also report a null text, and cloning must not throw.
  */
 public void TestClone() {
   RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");
   try {
     rbbi.setText((CharacterIterator) null);
     RuleBasedBreakIterator copy = (RuleBasedBreakIterator) rbbi.clone();
     if (copy.getText() != null) {
       errln(
           "RuleBasedBreakIterator.clone() was suppose to return "
               + "the same object because fText is set to null.");
     }
   } catch (Exception e) {
     errln("RuleBasedBreakIterator.clone() was not suppose to return " + "an exception.");
   }
 }
Example #9
0
  /**
   * Rebuilds a word iterator from the rule source of the default word instance and runs the
   * shared iterator checks against a fixed list of expected word-break fragments.
   *
   * <p>A construction failure is reported through errln(); the iterator then stays null and is
   * handed to generalIteratorTest() as-is.
   */
  public void TestDefaultRuleBasedWordIteration() {
    logln("Testing the RBBI for word iteration using default rules");
    RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) BreakIterator.getWordInstance();
    // fetch the rules used to create the above RuleBasedBreakIterator
    String defaultRules = rbbi.toString();

    RuleBasedBreakIterator wordIterDefault = null;
    try {
      wordIterDefault = new RuleBasedBreakIterator(defaultRules);
    } catch (IllegalArgumentException iae) {
      errln(
          "ERROR: failed construction in TestDefaultRuleBasedWordIteration() -- custom rules"
              + iae.toString());
    }

    // Expected fragments, in order; generalIteratorTest() concatenates them to form the text
    // under test, so each entry is one expected segment between two boundaries.
    List<String> worddata = new ArrayList<String>();
    worddata.add("Write");
    worddata.add(" ");
    worddata.add("wordrules");
    worddata.add(".");
    worddata.add(" ");
    // Disabled entry (its trailing space above/below is still part of the data):
    // worddata.add("alpha-beta-gamma");
    worddata.add(" ");
    worddata.add("\u092f\u0939");
    worddata.add(" ");
    // halfNA is declared outside this method.
    worddata.add("\u0939\u093f" + halfNA + "\u0926\u0940");
    worddata.add(" ");
    worddata.add("\u0939\u0948");
    // Disabled entry:
    // worddata.add("\u0964"); //danda followed by a space
    worddata.add(" ");
    worddata.add("\u0905\u093e\u092a");
    worddata.add(" ");
    worddata.add("\u0938\u093f\u0916\u094b\u0917\u0947");
    worddata.add("?");
    worddata.add(" ");
    worddata.add("\r");
    worddata.add("It's");
    worddata.add(" ");
    // Disabled entry:
    // worddata.add("$30.10");
    worddata.add(" ");
    worddata.add(" ");
    worddata.add("Badges");
    worddata.add("?");
    worddata.add(" ");
    worddata.add("BADGES");
    worddata.add("!");
    worddata.add("1000,233,456.000");
    worddata.add(" ");

    generalIteratorTest(wordIterDefault, worddata);
  }
Example #10
0
  /**
   * Checks that jumping with next(n) from first() (forward pass, n counting up from 0) or from
   * last() (backward pass, n counting down from 0) lands on the same offsets that single-stepping
   * a second iterator with next() / previous() produces.
   *
   * @param iterator iterator that is single-stepped; a clone of it performs the next(n) jumps
   * @param testText not read by this method
   */
  private void doMultipleSelectionTest(RuleBasedBreakIterator iterator, String testText) {
    logln("Multiple selection test...");
    RuleBasedBreakIterator jumper = (RuleBasedBreakIterator) iterator.clone();

    // Forward pass: n = 0, 1, 2, ... relative to first().
    int expected = iterator.first();
    int steps = 0;
    do {
      jumper.first();
      int actual = jumper.next(steps);
      reportMultipleSelectionStep(steps, actual, expected);

      if (expected != RuleBasedBreakIterator.DONE) {
        steps++;
        expected = iterator.next();
      }
    } while (expected != RuleBasedBreakIterator.DONE);

    // now do it backwards: n = 0, -1, -2, ... relative to last().
    expected = iterator.last();
    steps = 0;
    do {
      jumper.last();
      int actual = jumper.next(steps);
      reportMultipleSelectionStep(steps, actual, expected);

      if (expected != RuleBasedBreakIterator.DONE) {
        steps--;
        expected = iterator.previous();
      }
    } while (expected != RuleBasedBreakIterator.DONE);
  }

  /** Logs one next(n) result and reports an error when it differs from the single-step offset. */
  private void reportMultipleSelectionStep(int step, int viaNextN, int viaSingleSteps) {
    logln("next(" + step + ") -> " + viaNextN);
    if (viaSingleSteps != viaNextN) {
      errln(
          "next(n) and next() not returning consistent results: for step "
              + step
              + ", next(n) returned "
              + viaNextN
              + " and next() had "
              + viaSingleSteps);
    }
  }
Example #11
0
  /**
   * Prepares the lexer for a new text. Empty text cancels the lexer. Otherwise the break iterator
   * for {@code language} is taken from the {@code iterators} map, or built on first use by
   * extending the default word-instance rules with every applicable, non-empty-pattern lexer
   * rule, and then stored in the map. Finally the iterator is pointed at the text and
   * {@code start}/{@code end} are set to its first boundary.
   *
   * @param text text to tokenize
   * @param language locale used to select and cache the iterator
   * @param tokens not read by this method
   */
  @Override
  public void lexer_open(String text, LocaleId language, Tokens tokens) {
    if (Util.isEmpty(text)) {
      cancel();
      return;
    }
    this.text = text;

    if (!iterators.containsKey(language)) {
      // First request for this language: start from the stock word rules.
      iterator =
          (RuleBasedBreakIterator)
              BreakIterator.getWordInstance(ULocale.createCanonical(language.toString()));
      String combinedRules = iterator.toString();

      // Fold each applicable rule that carries a pattern into the rule source.
      for (LexerRule rule : getRules()) {
        boolean patternMissing = Util.isEmpty(rule.getPattern());
        if (!checkRule(rule, language) || patternMissing) {
          continue;
        }
        combinedRules =
            formatRule(
                combinedRules,
                rule.getName(),
                rule.getDescription(),
                rule.getPattern(),
                rule.getLexemId());
      }

      // Rebuild the iterator from the combined rules and keep it for later calls.
      iterator = new RuleBasedBreakIterator(combinedRules);
      iterators.put(language, iterator);
    } else {
      iterator = iterators.get(language);
    }

    if (iterator == null) {
      return;
    }
    iterator.setText(text);

    // Position at the beginning of the text; the first lexem starts here.
    start = iterator.first();
    end = start;
  }
Example #12
0
  /**
   * Advances the iterator by one boundary and returns the lexem spanning the previous boundary
   * and the new one, tagged with the iterator's rule status.
   *
   * @return the next lexem, or {@code null} when the iterator is exhausted or did not advance
   *     past {@code start}
   */
  @Override
  public Lexem lexer_next() {
    end = iterator.next();
    if (end == BreakIterator.DONE || start >= end) {
      return null;
    }

    final int ruleStatus = iterator.getRuleStatus();
    final Lexem result = new Lexem(ruleStatus, text.substring(start, end), start, end);

    // The end of this lexem is the start of the next one.
    start = end;
    return result;
  }
Example #13
0
 /*
  * Tests the method public int current() both without text (DONE expected) and right after
  * text has been set (0 expected).
  */
 public void TestCurrent() {
   RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");

   // Covers both outcomes of "(fText != null) ? fText.getIndex() : BreakIterator.DONE".
   rbbi.setText((CharacterIterator) null);
   int withoutText = rbbi.current();
   if (withoutText != BreakIterator.DONE) {
     errln(
         "RuleBasedBreakIterator.current() was suppose to return "
             + "BreakIterator.DONE when the object has a fText of null.");
   }

   rbbi.setText("dummy");
   int withText = rbbi.current();
   if (withText != 0) {
     errln(
         "RuleBasedBreakIterator.current() was suppose to return "
             + "0 when the object has a fText of dummy.");
   }
 }
Example #14
0
  /**
   * Rebuilds a line iterator from the rule source of the default line instance and runs the
   * shared iterator checks against a fixed list of expected line-break fragments.
   *
   * <p>A construction failure is reported through errln(); the iterator then stays null and is
   * handed to generalIteratorTest() as-is.
   */
  public void TestDefaultRuleBasedLineIteration() {
    logln("Testing the RBBI for line iteration using default rules");
    RuleBasedBreakIterator prebuilt =
        (RuleBasedBreakIterator) RuleBasedBreakIterator.getLineInstance();

    // The string form of the prebuilt iterator is used as the rule source for a fresh one.
    String ruleSource = prebuilt.toString();
    RuleBasedBreakIterator rebuilt = null;
    try {
      rebuilt = new RuleBasedBreakIterator(ruleSource);
    } catch (IllegalArgumentException iae) {
      errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());
    }

    // Expected fragments, in order; their concatenation is the text under test.
    String[] expectedLines = {
      // hyphenated and parenthesised words
      "Multi-",
      "Level ",
      "example ",
      "of ",
      "a ",
      "semi-",
      "idiotic ",
      "non-",
      "sensical ",
      "(non-",
      "important) ",
      "sentence. ",
      // assorted trailing whitespace and separators
      "Hi  ",
      "Hello ",
      "How\n",
      "are\r",
      "you" + kLineSeparator,
      "fine.\t",
      "good.  ",
      // CR, LF and CRLF combinations
      "Now\r",
      "is\n",
      "the\r\n",
      "time\n",
      "\r",
      "for\r",
      "\r",
      "all",
    };
    List<String> linedata = new ArrayList<String>();
    for (String line : expectedLines) {
      linedata.add(line);
    }

    generalIteratorTest(rebuilt, linedata);
  }
Example #15
0
  /*
   * Tests the method public int preceding(int offset)
   *
   * Two cases are covered: an iterator with no text, where BreakIterator.DONE is expected, and
   * an offset in front of the text, where the first boundary (0) is expected.
   */
  public void TestPreceding() {
    RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");
    // Tests when "if (fText == null || offset > fText.getEndIndex())" is true
    rbbi.setText((CharacterIterator) null);
    if (rbbi.preceding(-1) != BreakIterator.DONE) {
      // The message names the value this branch actually checks for (DONE, not 0).
      errln(
          "RuleBasedBreakIterator.preceding(-1) was suppose to return "
              + "BreakIterator.DONE when the object has a fText of null.");
    }

    // Tests when "else if (offset < fText.getBeginIndex())" is true
    rbbi.setText("dummy");
    if (rbbi.preceding(-1) != 0) {
      errln(
          "RuleBasedBreakIterator.preceding(-1) was suppose to return "
              + "0 when the object has a fText of dummy.");
    }
  }
Example #16
0
 /**
  * Attaches new text to this iterator. Only {@code CharArrayIterator} input is accepted. The
  * backing range is passed through ccReorder(), then the working iterator and the rules
  * iterator are pointed at it and the working offset is reset to zero.
  *
  * @param text the text to iterate over; must be a {@code CharArrayIterator}
  * @throws UnsupportedOperationException if {@code text} is any other CharacterIterator
  */
 @Override
 public void setText(CharacterIterator text) {
   if (!(text instanceof CharArrayIterator)) {
     throw new UnsupportedOperationException("unsupported CharacterIterator");
   }
   CharArrayIterator source = (CharArrayIterator) text;
   this.text = source;

   ccReorder(source.getText(), source.getStart(), source.getLength());
   working.setText(source.getText(), source.getStart(), source.getLength());
   rules.setText(working);
   workingOffset = 0;
 }
Example #17
0
  /**
   * Advances to the next boundary, optionally pulling it back by one character.
   *
   * <p>The rules iterator proposes the next boundary. A second rules.next() call then looks one
   * boundary further ahead. If that lookahead exists, its rule status is 0, the character at the
   * working iterator's current position is in laoSet, and verifyPushBack() accepts the shorter
   * split, the boundary is moved back by one: the working window is restarted at that position
   * and {@code next - 1} is returned. Otherwise the lookahead is undone and the originally
   * proposed boundary is returned.
   *
   * @return the next boundary in text coordinates, or {@code BreakIterator.DONE} if the rules
   *     iterator has no further boundary
   */
  @Override
  public int next() {
    // Remember where we are before advancing; verifyPushBack() needs the span start.
    int current = current();
    int next = rules.next();
    if (next == BreakIterator.DONE) return next;
    else next += workingOffset; // translate from working-window to text coordinates

    // Character at the working iterator's position after the advance.
    char c = working.current();
    int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
      // NOTE(review): after this adjustment `following` is not read again in this method.
      following += workingOffset;
      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
        // Push the boundary back by one character and restart the working window there.
        workingOffset = next - 1;
        working.setText(
            text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
        return next - 1;
      }
      rules.previous(); // undo the lookahead
    }

    return next;
  }
Example #18
0
 /**
  * Calls isBoundary() for every offset from 0 through the text length and compares the answer
  * with the expected boundary table.
  *
  * @param rbbi iterator already set to {@code text}
  * @param text the text under test
  * @param boundaries expected boundary offsets in ascending order; scanning starts at index 1,
  *     so index 0 is never consulted
  */
 private void _testIsBoundary(RuleBasedBreakIterator rbbi, String text, int[] boundaries) {
   logln("testIsBoundary():");
   int nextExpected = 1;
   for (int offset = 0; offset <= text.length(); offset++) {
     boolean actual = rbbi.isBoundary(offset);
     logln("rbbi.isBoundary(" + offset + ") -> " + actual);

     if (offset != boundaries[nextExpected]) {
       // Not an expected boundary: the iterator must agree.
       if (actual) {
         errln("Wrong result from isBoundary() for " + offset + ": expected false, got true");
       }
       continue;
     }

     if (!actual) {
       errln("Wrong result from isBoundary() for " + offset + ": expected true, got false");
     }
     ++nextExpected;
   }
 }
Example #19
0
  /*
   * Tests the method public int first(), both without text (DONE expected) and on "abc", where
   * first() must be 0 and the following next() must be 1.
   */
  public void TestFirst() {
    RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");

    // Covers the branch "if (fText == null)".
    rbbi.setText((CharacterIterator) null);
    int firstWithoutText = rbbi.first();
    assertEquals("RuleBasedBreakIterator.first()", BreakIterator.DONE, firstWithoutText);

    rbbi.setText("abc");
    int firstBoundary = rbbi.first();
    assertEquals("RuleBasedBreakIterator.first()", 0, firstBoundary);
    int secondBoundary = rbbi.next();
    assertEquals("RuleBasedBreakIterator.next()", 1, secondBoundary);
  }
Example #20
0
  /*
   * Tests the method public boolean equals(Object that): null text on one side, null text on
   * both sides, and comparison against objects of unrelated types.
   */
  public void TestEquals() {
    RuleBasedBreakIterator rbbi = new RuleBasedBreakIterator(".;");
    RuleBasedBreakIterator rbbi1 = new RuleBasedBreakIterator(".;");

    // TODO: Tests when "if (fRData != other.fRData && (fRData == null || other.fRData == null))" is
    // true

    // Covers "if (fText == null || other.fText == null)": only the left side has null text.
    rbbi.setText((CharacterIterator) null);
    boolean equalWithOneNullText = rbbi.equals(rbbi1);
    if (equalWithOneNullText) {
      errln(
          "RuleBasedBreakIterator.equals(Object) was not suppose to return "
              + "true when the other object has a null fText.");
    }

    // Covers "if (fText == null && other.fText == null)": both sides have null text.
    rbbi1.setText((CharacterIterator) null);
    boolean equalWithBothNullText = rbbi.equals(rbbi1);
    if (!equalWithBothNullText) {
      errln(
          "RuleBasedBreakIterator.equals(Object) was not suppose to return "
              + "false when both objects has a null fText.");
    }

    // Comparison with unrelated types must simply yield false.
    boolean equalToInteger = rbbi.equals(0);
    if (equalToInteger) {
      errln(
          "RuleBasedBreakIterator.equals(Object) was suppose to return "
              + "false when comparing to integer 0.");
    }
    boolean equalToDouble = rbbi.equals(0.0);
    if (equalToDouble) {
      errln(
          "RuleBasedBreakIterator.equals(Object) was suppose to return "
              + "false when comparing to float 0.0.");
    }
    boolean equalToString = rbbi.equals("0");
    if (equalToString) {
      errln(
          "RuleBasedBreakIterator.equals(Object) was suppose to return "
              + "false when comparing to string '0'.");
    }
  }
Example #21
0
 /**
  * Calls preceding() for every offset from 0 through the text length and compares the answer
  * with the expected boundary table.
  *
  * @param rbbi iterator already set to {@code text}
  * @param text the text under test
  * @param boundaries expected values; the entry at the running index is the expected result,
  *     and the index advances once the offset reaches the following entry
  */
 private void _testPreceding(RuleBasedBreakIterator rbbi, String text, int[] boundaries) {
   logln("testPreceding():");
   int idx = 0;
   for (int offset = 0; offset <= text.length(); offset++) {
     int actual = rbbi.preceding(offset);
     logln("rbbi.preceding(" + offset + ") -> " + actual);

     int expected = boundaries[idx];
     if (actual != expected) {
       errln(
           "Wrong result from preceding() for "
               + offset
               + ": expected "
               + expected
               + ", got "
               + actual);
     }

     // Once this offset is itself a boundary, it becomes the expected answer for later offsets.
     if (offset == boundaries[idx + 1]) {
       ++idx;
     }
   }
 }
Example #22
0
  /**
   * Decides whether the boundary at {@code next} may be moved back by one character.
   *
   * <p>Two checks are run with the separate {@code verify} iterator: first, the span from
   * {@code current} shortened by one character must be consumed as a single segment with a
   * non-zero rule status; second, the text starting at {@code next - 1} must yield a boundary
   * whose rule status is also non-zero.
   *
   * <p>NOTE(review): presumably rule status 0 marks a segment the rules do not recognise as a
   * valid syllable - confirm against the rule file.
   *
   * @param current start of the span being examined, relative to the start of the text
   * @param next the proposed boundary, relative to the start of the text
   * @return {@code true} if both checks pass
   */
  private boolean verifyPushBack(int current, int next) {
    // Length of [current, next) with its last character removed.
    int shortenedSyllable = next - current - 1;

    // Check 1: the shortened span must be exactly one segment with a non-zero status.
    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
    verify.setText(verifyText);
    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) return false;

    // Check 2: re-scan from the pushed-back position (next - 1) through the end of the text.
    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);

    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
  }
Example #23
0
  /**
   * Runs the full battery of iterator checks for one expected segmentation.
   *
   * <p>The expected fragments are concatenated to form the text under test. Forward and backward
   * iteration are compared with each other, and, if they agree, with the expected fragments.
   * A boundary table is then derived from the fragment lengths and used for the following(),
   * preceding() and isBoundary() checks, and finally the next(n) consistency test is run.
   *
   * @param rbbi iterator under test; if null an error is reported and the test is skipped
   * @param expectedResult expected fragments in document order
   */
  private void generalIteratorTest(RuleBasedBreakIterator rbbi, List<String> expectedResult) {
    StringBuilder joined = new StringBuilder();
    for (String fragment : expectedResult) {
      joined.append(fragment);
    }
    String text = joined.toString();

    if (rbbi == null) {
      errln("null iterator, test skipped.");
      return;
    }

    rbbi.setText(text);

    List<String> forward = _testFirstAndNext(rbbi, text);
    List<String> backward = _testLastAndPrevious(rbbi, text);

    logln("comparing forward and backward...");
    int errorsBefore = getErrorCount();
    compareFragmentLists("forward iteration", "backward iteration", forward, backward);
    // Only compare against the expectation when both directions agree with each other.
    if (getErrorCount() == errorsBefore) {
      logln("comparing expected and actual...");
      compareFragmentLists("expected result", "actual result", expectedResult, forward);
    }

    // Boundary table: DONE, 0, cumulative fragment ends..., DONE.
    int fragmentCount = expectedResult.size();
    int[] boundaries = new int[fragmentCount + 3];
    boundaries[0] = RuleBasedBreakIterator.DONE;
    int runningOffset = 0;
    boundaries[1] = runningOffset;
    for (int i = 0; i < fragmentCount; i++) {
      runningOffset += expectedResult.get(i).length();
      boundaries[i + 2] = runningOffset;
    }
    boundaries[boundaries.length - 1] = RuleBasedBreakIterator.DONE;

    _testFollowing(rbbi, text, boundaries);
    _testPreceding(rbbi, text, boundaries);
    _testIsBoundary(rbbi, text, boundaries);

    doMultipleSelectionTest(rbbi, text);
  }
Example #24
0
 /**
  * Reports the current position of the rules iterator, shifted by the working offset.
  *
  * @return the shifted position, or {@code BreakIterator.DONE} if the rules iterator reports
  *     DONE
  */
 @Override
 public int current() {
   final int position = rules.current();
   if (position == BreakIterator.DONE) {
     return BreakIterator.DONE;
   }
   return workingOffset + position;
 }
Example #25
0
 /**
  * Creates a new iterator, performing the backtracking verification across the provided <code>
  * rules</code>.
  */
 public LaoBreakIterator(RuleBasedBreakIterator rules) {
   this.rules = (RuleBasedBreakIterator) rules.clone();
   this.verify = (RuleBasedBreakIterator) rules.clone();
 }
Example #26
0
  // Tests character iteration with the default rules.
  // Builds a new iterator from the source rules of the default (prebuilt) character iterator
  // and checks it against a fixed list of expected fragments. A construction failure is
  // reported through errln(); the iterator then stays null and is passed on as-is.
  public void TestDefaultRuleBasedCharacterIteration() {
    RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator) BreakIterator.getCharacterInstance();
    logln("Testing the RBBI for character iteration by using default rules");

    // fetch the rules used to create the above RuleBasedBreakIterator
    String defaultRules = rbbi.toString();

    RuleBasedBreakIterator charIterDefault = null;
    try {
      charIterDefault = new RuleBasedBreakIterator(defaultRules);
    } catch (IllegalArgumentException iae) {
      errln(
          "ERROR: failed construction in TestDefaultRuleBasedCharacterIteration()"
              + iae.toString());
    }

    // Expected fragments, in order; generalIteratorTest() concatenates them to form the text
    // under test, so each entry is one expected segment between two boundaries.
    List<String> chardata = new ArrayList<String>();
    chardata.add("H");
    chardata.add("e");
    chardata.add("l");
    chardata.add("l");
    chardata.add("o");
    chardata.add("e\u0301"); // e + combining acute
    chardata.add("&");
    chardata.add("e\u0303"); // e + combining tilde
    // Devanagari characters for Hindi support
    chardata.add("\u0906"); // Devanagari AA
    // Disabled entry:
    // chardata.add("\u093e\u0901"); // Devanagari vowel sign AA + candrabindu
    chardata.add("\u0916\u0947"); // Devanagari KHA + vowel sign E
    chardata.add("\u0938\u0941\u0902"); // Devanagari SA + vowel sign U + anusvara (bindu)
    chardata.add("\u0926"); // Devanagari consonant DA
    chardata.add("\u0930"); // Devanagari consonant RA
    // Disabled entry:
    // chardata.add("\u0939\u094c"); // Devanagari HA + vowel sign AU
    chardata.add("\u0964"); // Devanagari danda
    // end Hindi characters
    chardata.add("A\u0302"); // A + combining circumflex
    chardata.add("i\u0301"); // i + combining acute
    // conjoining jamo...
    chardata.add("\u1109\u1161\u11bc");
    chardata.add("\u1112\u1161\u11bc");
    chardata.add("\n");
    chardata.add("\r\n"); // keep CRLF sequences together
    chardata.add("S\u0300"); // S + combining grave
    chardata.add("i\u0301"); // i + combining acute
    chardata.add("!");

    // What follows is a string of Korean characters (I found it in the Yellow Pages
    // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
    // it correctly), first as precomposed syllables, and then as conjoining jamo.
    // Both sequences should be semantically identical and break the same way.
    // precomposed syllables...
    chardata.add("\uc0c1");
    chardata.add("\ud56d");
    chardata.add(" ");
    chardata.add("\ud55c");
    chardata.add("\uc778");
    chardata.add(" ");
    chardata.add("\uc5f0");
    chardata.add("\ud569");
    chardata.add(" ");
    chardata.add("\uc7a5");
    chardata.add("\ub85c");
    chardata.add("\uad50");
    chardata.add("\ud68c");
    chardata.add(" ");
    // conjoining jamo...
    chardata.add("\u1109\u1161\u11bc");
    chardata.add("\u1112\u1161\u11bc");
    chardata.add(" ");
    chardata.add("\u1112\u1161\u11ab");
    chardata.add("\u110b\u1175\u11ab");
    chardata.add(" ");
    chardata.add("\u110b\u1167\u11ab");
    chardata.add("\u1112\u1161\u11b8");
    chardata.add(" ");
    chardata.add("\u110c\u1161\u11bc");
    chardata.add("\u1105\u1169");
    chardata.add("\u1100\u116d");
    chardata.add("\u1112\u116c");

    generalIteratorTest(charIterDefault, chardata);
  }