예제 #1
0
  /**
   * Verifies that {@code bi} splits {@code in} into exactly the segments listed in {@code out},
   * first walking forwards with {@code next()} and then backwards with {@code previous()}.
   *
   * @param name checkpoint label reported to the harness (prefixed with "backwards " for the
   *     second pass)
   * @param in text handed to the iterator
   * @param out expected segments, in text order
   * @param bi iterator under test; its text is replaced by {@code in}
   * @param harness receiver of every checkpoint and comparison
   */
  public void check(String name, String in, String[] out, BreakIterator bi, TestHarness harness) {
    // ---- forward pass ----
    harness.checkPoint(name);
    bi.setText(in);

    int segmentStart = bi.current();
    harness.check(segmentStart, 0);

    int matched = 0;
    for (int boundary = bi.next(); boundary != BreakIterator.DONE; boundary = bi.next()) {
      harness.check(in.substring(segmentStart, boundary), out[matched]);
      matched++;
      segmentStart = boundary;
    }

    // Every expected segment must have been produced.
    harness.check(matched, out.length);

    // ---- backward pass ----
    harness.checkPoint("backwards " + name);
    bi.last();
    int remaining = out.length - 1;
    int segmentEnd = bi.current();
    harness.check(segmentEnd, in.length());

    for (int boundary = bi.previous(); boundary != BreakIterator.DONE; boundary = bi.previous()) {
      harness.check(in.substring(boundary, segmentEnd), out[remaining]);
      remaining--;
      segmentEnd = boundary;
    }

    // After consuming all expected segments the index has stepped past the first one.
    harness.check(remaining, -1);
  }
예제 #2
0
  /**
   * Splits a chunk of text into sentences and each sentence into words.
   *
   * <p>{@code arguments[0]} is the text. The optional {@code arguments[1]} (language) and
   * {@code arguments[2]} (country) select the {@link Locale} used for both the sentence and the
   * word {@link BreakIterator}; when no non-null language is given the JVM default locale is used.
   *
   * @param arguments one to three deferred arguments: text, optional language, optional country
   * @return {@code null} when the text argument is null; otherwise one list per sentence, each
   *     holding the words of that sentence whose first code point is a letter or digit
   * @throws HiveException as declared by the overridden method; nothing in this body throws it
   *     directly
   */
  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    assert (arguments.length >= 1 && arguments.length <= 3);
    if (arguments[0].get() == null) {
      return null;
    }

    // if there is more than 1 argument specified, a different natural language
    // locale is being specified
    Locale locale;
    if (arguments.length > 1 && arguments[1].get() != null) {
      Text language = (Text) converters[1].convert(arguments[1].get());
      Text country = null;
      if (arguments.length > 2 && arguments[2].get() != null) {
        country = (Text) converters[2].convert(arguments[2].get());
      }
      // Case-fold with Locale.ROOT: locale codes are ASCII identifiers, and folding them with the
      // JVM default locale corrupts them under e.g. a Turkish default ("IT" -> dotless "ıt").
      String languageCode = language.toString().toLowerCase(Locale.ROOT);
      if (country != null) {
        locale = new Locale(languageCode, country.toString().toUpperCase(Locale.ROOT));
      } else {
        locale = new Locale(languageCode);
      }
    } else {
      locale = Locale.getDefault();
    }

    // get the input and prepare the output
    Text chunk = (Text) converters[0].convert(arguments[0].get());
    String text = chunk.toString();
    ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();

    // Parse out sentences using Java's text-handling API
    BreakIterator bi = BreakIterator.getSentenceInstance(locale);
    bi.setText(text);
    // The word iterator depends only on the locale, so build it once and re-target it per sentence.
    BreakIterator wi = BreakIterator.getWordInstance(locale);

    int idx = 0;
    for (int sentenceEnd = bi.next(); sentenceEnd != BreakIterator.DONE; sentenceEnd = bi.next()) {
      String sentence = text.substring(idx, sentenceEnd);
      idx = sentenceEnd;
      ArrayList<Text> sent_array = new ArrayList<Text>();
      result.add(sent_array);

      // Parse out words in the sentence
      wi.setText(sentence);
      int widx = 0;
      for (int wordEnd = wi.next(); wordEnd != BreakIterator.DONE; wordEnd = wi.next()) {
        String word = sentence.substring(widx, wordEnd);
        widx = wordEnd;
        // Inspect the first code point rather than the first char: a word that starts with a
        // supplementary-plane letter begins with a surrogate, which is never a letter or digit.
        if (Character.isLetterOrDigit(word.codePointAt(0))) {
          sent_array.add(new Text(word));
        }
      }
    }

    return result;
  }
  /**
   * Builds the fallback summary used when a document produced no hits. The default implementation
   * emits up to {@code maxPassages} passages, one per consecutive segment reported by {@code bi},
   * each with a {@code NaN} score; subclasses can override to customize.
   *
   * @param fieldName field being summarized (not consulted by this implementation)
   * @param bi break iterator that must still be at its initial position
   * @param maxPassages upper bound on the number of passages returned
   * @return the passages collected, possibly an empty array
   */
  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
    // BreakIterator should be un-next'd:
    final List<Passage> summary = new ArrayList<>();
    int start = bi.current();
    assert start == 0;

    // The size test comes first so the iterator is not advanced once the quota is reached.
    for (int end;
        summary.size() < maxPassages && (end = bi.next()) != BreakIterator.DONE;
        start = end) {
      final Passage segment = new Passage();
      segment.score = Float.NaN;
      segment.startOffset = start;
      segment.endOffset = end;
      summary.add(segment);
    }

    return summary.toArray(new Passage[0]);
  }
 /**
  * Returns the next word in the text and advances the tokenizer state.
  *
  * <p>On every call after the first, the word window is shifted to the previously located next
  * word, the following word start is looked up, and {@code startsSentance} is refreshed from the
  * sentence iterator. On the first call the positions already held in the fields are used as-is.
  *
  * @return the word text, or {@code null} if the document rejects the requested range
  */
 public String nextWord() {
   if (!first) {
     // Slide the window onto the word found during the previous call.
     currentWordPos = nextWordPos;
     currentWordEnd = getNextWordEnd(text, currentWordPos);
     nextWordPos = getNextWordStart(text, currentWordEnd + 1);

     final int sentenceBoundary = sentanceIterator.current();
     startsSentance = (sentenceBoundary == currentWordPos);
     // Advance the sentence iterator only when this word is not a sentence start and it ends
     // beyond the iterator's current boundary.
     if (!startsSentance && currentWordEnd > sentenceBoundary) {
       sentanceIterator.next();
     }
   }

   // The nextWordPos has already been populated
   String extracted;
   try {
     extracted = document.getText(currentWordPos, currentWordEnd - currentWordPos);
   } catch (BadLocationException ex) {
     // The range is not valid for the document: stop tokenizing and report no word.
     extracted = null;
     moreTokens = false;
   }

   wordCount++;
   first = false;
   if (nextWordPos == -1) {
     moreTokens = false;
   }
   return extracted;
 }
예제 #5
0
  /**
   * Cross-checks a word {@link BreakIterator} against a {@code MirroredBreakIterator} built from
   * the same text: {@code following}, {@code preceding} and {@code isBoundary} are compared for
   * every in-range offset, and the exception-checking helpers are run for offsets below zero and
   * above the text length.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    BreakIterator bi = BreakIterator.getWordInstance();
    bi.setText(text);
    MirroredBreakIterator mirror = new MirroredBreakIterator(bi);

    final int first = bi.first();
    if (first != 0) {
      throw new RuntimeException("first != 0: " + first);
    }
    // The call is kept from the original sequence; its return value is not used.
    bi.last();

    // Continue with a fresh iterator over the same text.
    bi = BreakIterator.getWordInstance();
    bi.setText(text);
    final int len = text.length();

    /*
     * following(int)
     */
    for (int offset = 0; offset <= len; offset++) {
      if (offset == len) {
        check(bi.following(offset), DONE);
      }
      int fromIterator = bi.following(offset);
      int fromMirror = mirror.following(offset);
      check(fromIterator, fromMirror);
      check(bi.current(), mirror.current());
    }
    // Offsets -len .. -1
    for (int distance = len; distance > 0; distance--) {
      int offset = -distance;
      checkFollowingException(bi, offset);
      checkFollowingException(mirror, offset);
      check(bi.current(), mirror.current());
    }
    // Offsets len + 1 .. 2 * len - 1
    for (int offset = len + 1; offset < len + len; offset++) {
      checkFollowingException(bi, offset);
      checkFollowingException(mirror, offset);
      check(bi.current(), mirror.current());
    }

    /*
     * preceding(int)
     */
    for (int offset = len; offset >= 0; offset--) {
      if (offset == 0) {
        check(bi.preceding(offset), DONE);
      }
      int fromIterator = bi.preceding(offset);
      int fromMirror = mirror.preceding(offset);
      check(fromIterator, fromMirror);
      check(bi.current(), mirror.current());
    }
    for (int distance = len; distance > 0; distance--) {
      int offset = -distance;
      checkPrecedingException(bi, offset);
      checkPrecedingException(mirror, offset);
      check(bi.current(), mirror.current());
    }
    for (int offset = len + 1; offset < len + len; offset++) {
      checkPrecedingException(bi, offset);
      checkPrecedingException(mirror, offset);
      check(bi.current(), mirror.current());
    }

    /*
     * isBoundary(int)
     */
    for (int offset = 0; offset <= len; offset++) {
      boolean fromIterator = bi.isBoundary(offset);
      boolean fromMirror = mirror.isBoundary(offset);
      check(fromIterator, fromMirror);
      check(bi.current(), mirror.current());
    }
    for (int distance = len; distance > 0; distance--) {
      int offset = -distance;
      checkIsBoundaryException(bi, offset);
      checkIsBoundaryException(mirror, offset);
    }
    for (int offset = len + 1; offset < len + len; offset++) {
      checkIsBoundaryException(bi, offset);
      checkIsBoundaryException(mirror, offset);
    }
  }