@Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    assert (arguments.length >= 1 && arguments.length <= 3);
    if (arguments[0].get() == null) {
      return null;
    }

    // if there is more than 1 argument specified, a different natural language
    // locale is being specified
    Locale locale = null;
    if (arguments.length > 1 && arguments[1].get() != null) {
      Text language = (Text) converters[1].convert(arguments[1].get());
      Text country = null;
      if (arguments.length > 2 && arguments[2].get() != null) {
        country = (Text) converters[2].convert(arguments[2].get());
      }
      if (country != null) {
        locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase());
      } else {
        locale = new Locale(language.toString().toLowerCase());
      }
    } else {
      locale = Locale.getDefault();
    }

    // get the input and prepare the output
    Text chunk = (Text) converters[0].convert(arguments[0].get());
    String text = chunk.toString();
    ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();

    // Parse out sentences using Java's text-handling API
    BreakIterator bi = BreakIterator.getSentenceInstance(locale);
    bi.setText(text);
    int idx = 0;
    while (bi.next() != BreakIterator.DONE) {
      String sentence = text.substring(idx, bi.current());
      idx = bi.current();
      result.add(new ArrayList<Text>());

      // Parse out words in the sentence
      BreakIterator wi = BreakIterator.getWordInstance(locale);
      wi.setText(sentence);
      int widx = 0;
      ArrayList<Text> sent_array = result.get(result.size() - 1);
      while (wi.next() != BreakIterator.DONE) {
        String word = sentence.substring(widx, wi.current());
        widx = wi.current();
        if (Character.isLetterOrDigit(word.charAt(0))) {
          sent_array.add(new Text(word));
        }
      }
    }

    return result;
  }
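At its core, the UDF above is two nested BreakIterator loops: sentences on the outside, words on the inside. A minimal standalone sketch of the same splitting logic, without the Hive ObjectInspector/Text plumbing (the class name and sample text are illustrative):

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

public class SentenceWordSplitSketch {
  public static void main(String[] args) {
    String text = "Hello there, people! The water is cold today.";
    Locale locale = new Locale("en", "US");

    List<List<String>> result = new ArrayList<>();

    // Outer loop: sentence boundaries
    BreakIterator sentences = BreakIterator.getSentenceInstance(locale);
    sentences.setText(text);
    int sentenceStart = 0;
    while (sentences.next() != BreakIterator.DONE) {
      String sentence = text.substring(sentenceStart, sentences.current());
      sentenceStart = sentences.current();

      // Inner loop: word boundaries within the current sentence
      List<String> words = new ArrayList<>();
      BreakIterator wordBreaks = BreakIterator.getWordInstance(locale);
      wordBreaks.setText(sentence);
      int wordStart = 0;
      while (wordBreaks.next() != BreakIterator.DONE) {
        String token = sentence.substring(wordStart, wordBreaks.current());
        wordStart = wordBreaks.current();
        // Keep only tokens that look like words; drop whitespace and punctuation
        if (Character.isLetterOrDigit(token.charAt(0))) {
          words.add(token);
        }
      }
      result.add(words);
    }
    // Expected: [[Hello, there, people], [The, water, is, cold, today]]
    System.out.println(result);
  }
}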
  static void parseWordDataQ1(String inputText) {

    Locale currentLocale = new Locale("en", "US");
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(currentLocale);

    ArrayList<Integer> alist = findSentenceBoundaries(inputText, sentenceIterator);

    BreakIterator wordIterator = BreakIterator.getWordInstance(currentLocale);

    HashMap<String, WordTuple> wordList = findWordsInSentences(inputText, wordIterator, alist);
    printProcessedData(alist, wordList);
  }
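The helpers called above (findSentenceBoundaries, findWordsInSentences, printProcessedData, WordTuple) are not part of this excerpt. A sketch of what findSentenceBoundaries might look like, assuming it simply collects the offsets reported by the sentence iterator (the signature and return type are assumptions):

import java.text.BreakIterator;
import java.util.ArrayList;

public class SentenceBoundarySketch {
  // Assumed helper: collect every boundary offset, including the start of the text.
  static ArrayList<Integer> findSentenceBoundaries(String text, BreakIterator sentenceIterator) {
    ArrayList<Integer> boundaries = new ArrayList<>();
    sentenceIterator.setText(text);
    for (int b = sentenceIterator.first(); b != BreakIterator.DONE; b = sentenceIterator.next()) {
      boundaries.add(b);
    }
    return boundaries;
  }
}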
  public void testSentenceBoundary() throws Exception {
    StringBuilder text = new StringBuilder(TEXT);
    // we test this with the default locale; it's randomized by LuceneTestCase
    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault());
    BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);

    int start = TEXT.indexOf("any application");
    int expected = TEXT.indexOf("It is a");
    testFindStartOffset(text, start, expected, scanner);

    expected = TEXT.indexOf("Apache Lucene is an open source");
    testFindEndOffset(text, start, expected, scanner);
  }
 /**
  * Extracts sentences from the review. TODO: take the negative lists into account; later check
  * the Stanford document tokenizer.
  */
 private void BreakInLines() {
   // this.Lines = review.split(". ");
   BreakIterator border = BreakIterator.getSentenceInstance(Locale.US);
   border.setText(review);
   //	System.out.println(review);
   int start = border.first();
   // iterate, creating sentences out of all the Strings between the given boundaries
   for (int end = border.next(); end != BreakIterator.DONE; start = end, end = border.next()) {
     // System.out.println(review.substring(start,end));
     Lines.add(review.substring(start, end));
     NumOfSentences++;
   }
   // System.out.println(NumOfSentences);
 }
Example #5
 static String extractShortDescription(String description) {
   if (description == null) {
     return null;
   }
   int dot = description.indexOf(".");
   if (dot != -1) {
     BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.US);
     breakIterator.setText(description);
     String text = description.substring(breakIterator.first(), breakIterator.next()).trim();
     return removeSpaceBetweenLine(text);
   } else {
     String[] lines = description.split(NEW_LINE);
     return lines[0].trim();
   }
 }
Example #6
  @Test
  public void testSentenceDetection() {
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.US);

    sentenceIterator.setText(TEST_STRING);

    int start = sentenceIterator.first();
    int end = -1;

    List<String> sentenceList = new ArrayList<String>();

    while ((end = sentenceIterator.next()) != BreakIterator.DONE) {
      String sentence = TEST_STRING.substring(start, end);
      start = end;
      sentenceList.add(sentence);

      System.out.println("Sentence: " + sentence);
    }
  }
 public DocumentWordTokenizer(Document document) {
   this.document = document;
    // Create a text segment over the entire document
   text = new Segment();
   sentanceIterator = BreakIterator.getSentenceInstance();
   try {
     document.getText(0, document.getLength(), text);
     sentanceIterator.setText(text);
     currentWordPos = getNextWordStart(text, 0);
     // If the current word pos is -1 then the string was all white space
     if (currentWordPos != -1) {
       currentWordEnd = getNextWordEnd(text, currentWordPos);
       nextWordPos = getNextWordStart(text, currentWordEnd);
     } else {
       moreTokens = false;
     }
   } catch (BadLocationException ex) {
     moreTokens = false;
   }
 }
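getNextWordStart and getNextWordEnd are defined elsewhere in this tokenizer class and are not shown here. A rough sketch of the behavior the constructor relies on, scanning a CharacterIterator (which Segment implements) for the next run of letters or digits; the real implementation may differ:

import java.text.CharacterIterator;

final class WordScanSketch {
  // Assumed behavior: index of the next letter/digit at or after 'from', or -1 if none.
  static int getNextWordStart(CharacterIterator text, int from) {
    for (char ch = text.setIndex(from); ch != CharacterIterator.DONE; ch = text.next()) {
      if (Character.isLetterOrDigit(ch)) {
        return text.getIndex();
      }
    }
    return -1;
  }

  // Assumed behavior: index just past the letter/digit run that starts at 'from'.
  static int getNextWordEnd(CharacterIterator text, int from) {
    for (char ch = text.setIndex(from); ch != CharacterIterator.DONE; ch = text.next()) {
      if (!Character.isLetterOrDigit(ch)) {
        return text.getIndex();
      }
    }
    return text.getEndIndex();
  }
}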
  @Override
  protected BoundaryScanner get(String fieldName, SolrParams params) {
    // construct Locale
    String language = params.getFieldParam(fieldName, HighlightParams.BS_LANGUAGE);
    String country = params.getFieldParam(fieldName, HighlightParams.BS_COUNTRY);
    if (country != null && language == null) {
      throw new SolrException(
          ErrorCode.BAD_REQUEST,
          HighlightParams.BS_LANGUAGE
              + " parameter cannot be null when you specify "
              + HighlightParams.BS_COUNTRY);
    }
    Locale locale = null;
    if (language != null) {
      locale = country == null ? new Locale(language) : new Locale(language, country);
    } else {
      locale = Locale.ROOT;
    }

    // construct BreakIterator
    String type =
        params.getFieldParam(fieldName, HighlightParams.BS_TYPE, "WORD").toLowerCase(Locale.ROOT);
    BreakIterator bi = null;
    if (type.equals("character")) {
      bi = BreakIterator.getCharacterInstance(locale);
    } else if (type.equals("word")) {
      bi = BreakIterator.getWordInstance(locale);
    } else if (type.equals("line")) {
      bi = BreakIterator.getLineInstance(locale);
    } else if (type.equals("sentence")) {
      bi = BreakIterator.getSentenceInstance(locale);
    } else
      throw new SolrException(
          ErrorCode.BAD_REQUEST, type + " is invalid for parameter " + HighlightParams.BS_TYPE);

    return new org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner(bi);
  }
 /**
  * Returns the {@link BreakIterator} to use for dividing text into passages. This returns {@link
  * BreakIterator#getSentenceInstance(Locale)} by default; subclasses can override to customize.
  */
 protected BreakIterator getBreakIterator(String field) {
   return BreakIterator.getSentenceInstance(Locale.ROOT);
 }
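Since the javadoc invites subclasses to override this hook, here is a minimal sketch of such an override; PassageSource stands in for the real enclosing highlighter class, which is not part of this excerpt:

import java.text.BreakIterator;
import java.util.Locale;

// Stand-in for the enclosing highlighter class from the excerpt above.
abstract class PassageSource {
  protected BreakIterator getBreakIterator(String field) {
    return BreakIterator.getSentenceInstance(Locale.ROOT);
  }
}

// Hypothetical subclass: divide passages at word boundaries instead of sentences.
class WordPassageSource extends PassageSource {
  @Override
  protected BreakIterator getBreakIterator(String field) {
    return BreakIterator.getWordInstance(Locale.ROOT);
  }
}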
Example #10
  /**
   * Initializes fields comment, inlineTags of the object
   *
   * @param commentText the processed comment text
   */
  private void procComment(String commentText) {
    // initialize inlineTags
    ArrayList<Tag> result = new ArrayList<Tag>();
    String noInlineTags = replaceAtSigns(commentText);

    /*
     * Pattern p = Pattern.compile("\\{\\s*@[^}]*\\}"); // matches inline
     * tags // Pattern p =
     * Pattern.compile("\\{\\s*@([^\\s\\}]+)\\s*([^\\}]*)\\s*}"); // matches
     * inline tags Matcher m = p.matcher(commentText); int start = 0, end =
     * 0; // create an array of tag objects of kind "Text" and "@link"; as
     * explained in the // doclet API, for a comment // This is a {@link Doc
     * commentlabel} example. // create an array of Tag objects: // *
     * tags[0] is a Tag with name "Text" and text consisting of "This is a "
     * // * tags[1] is a SeeTag with name "@link", and label "commentlabel"
     * // * tags[2] is a Tag with name "Text" and text consisting of
     * " example." while (m.find()) { end = m.start(); String linkText =
     * m.group(); // System.out.print("String = \"" +
     * commentText.substring(start, end)); //
     * System.out.println("\"; linkText = \"" + linkText + "\""); //
     * result.add(new X10Tag("Text", commentText.substring(start, end),
     * this)); result.add(X10Tag.processInlineTag(linkText, this)); //int
     * index = commentText.indexOf(linkText); //commentText =
     * commentText.substring(0, index) + commentText.substring(index +
     * linkText.length()); // result.add(new X10SeeTag(true, linkText,
     * this)); // "true" signifies an @link tag, as opposed to an @see tag
     * start = m.end(); }
     */
    if (!commentText.startsWith("@")) { // make sure that there is a
      // beginning paragraph
      // initialize comment
      int blockTagStart = noInlineTags.indexOf("@"); // start of block
      // tags within
      // comment
      blockTagStart = (blockTagStart == -1) ? commentText.length() : blockTagStart;
      this.comment = commentText.substring(0, blockTagStart).trim();
      if (!comment.equals("")) {
        result.addAll(createInlineTags(comment, this));
      }

      // }
      // add constraints, if any
      // String decl = declString();
      // if (decl != null) {
      // result.add(new X10Tag(decl, this));
      // }

      // initialize firstSentenceTags
      BreakIterator b = BreakIterator.getSentenceInstance();
      b.setText(comment);
      int start = 0;
      int end = 0;
      start = b.first();
      end = b.next();
      String firstSentence = ((start <= end) ? comment.substring(start, end).trim() : "");
      // System.out.println("X10Doc.initializeFields(): firstSentence = \""
      // + firstSentence + "\"");
      firstSentenceTags = createInlineTags(firstSentence, this).toArray(new X10Tag[0]);

    } else {
      firstSentenceTags = new X10Tag[0];
    }

    inlineTags = result.toArray(new X10Tag[0]);

    // TODO: creating Tag objects for block tags and storing them in a field
    // of this object
    Pattern blockTagPattern = Pattern.compile("\\s*@[^@]*");
    Matcher blockTagMatcher = blockTagPattern.matcher(noInlineTags);
    while (blockTagMatcher.find()) {
      String tagText = blockTagMatcher.group();
      int start = blockTagMatcher.start();
      processBlockTag(commentText.substring(start, start + tagText.length()));
    }
  }