/**
 * UDF entry point: tokenizes the text in arguments[0] into sentences, and each
 * sentence into words, using {@link java.text.BreakIterator}.
 *
 * Arguments (1 to 3):
 *   0 - the text to tokenize (required; a null value yields a null result)
 *   1 - optional language code selecting the natural-language locale
 *   2 - optional country code refining that locale (only used with arg 1)
 *
 * @return an ArrayList of sentences, each an ArrayList of word Texts,
 *         or null when the input text is null
 */
@Override public Object evaluate(DeferredObject[] arguments) throws HiveException {
  assert (arguments.length >= 1 && arguments.length <= 3);
  if (arguments[0].get() == null) {
    return null;
  }
  // if there is more than 1 argument specified, a different natural language
  // locale is being specified; otherwise fall back to the JVM default locale
  Locale locale = null;
  if (arguments.length > 1 && arguments[1].get() != null) {
    Text language = (Text) converters[1].convert(arguments[1].get());
    Text country = null;
    if (arguments.length > 2 && arguments[2].get() != null) {
      country = (Text) converters[2].convert(arguments[2].get());
    }
    // Locale convention: lowercase language code, uppercase country code.
    if (country != null) {
      locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase());
    } else {
      locale = new Locale(language.toString().toLowerCase());
    }
  } else {
    locale = Locale.getDefault();
  }
  // get the input and prepare the output
  Text chunk = (Text) converters[0].convert(arguments[0].get());
  String text = chunk.toString();
  ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();
  // Parse out sentences using Java's text-handling API. After bi.next()
  // advances, bi.current() is the end boundary of the sentence that began at
  // idx; idx is then moved up to that boundary for the next iteration.
  BreakIterator bi = BreakIterator.getSentenceInstance(locale);
  bi.setText(text);
  int idx = 0;
  while (bi.next() != BreakIterator.DONE) {
    String sentence = text.substring(idx, bi.current());
    idx = bi.current();
    result.add(new ArrayList<Text>());
    // Parse out words in the sentence using the same next()/current() pattern
    BreakIterator wi = BreakIterator.getWordInstance(locale);
    wi.setText(sentence);
    int widx = 0;
    ArrayList<Text> sent_array = result.get(result.size() - 1);
    while (wi.next() != BreakIterator.DONE) {
      String word = sentence.substring(widx, wi.current());
      widx = wi.current();
      // Word boundaries also delimit runs of whitespace and punctuation;
      // keep only tokens that start with a letter or digit.
      if (Character.isLetterOrDigit(word.charAt(0))) {
        sent_array.add(new Text(word));
      }
    }
  }
  return result;
}
/**
 * Splits the input text into sentences and words (en_US rules) and prints
 * the processed boundary/word data.
 */
static void parseWordDataQ1(String inputText) {
  // Both boundary analyses use the same fixed en_US locale.
  Locale locale = new Locale("en", "US");
  BreakIterator sentenceBreaker = BreakIterator.getSentenceInstance(locale);
  BreakIterator wordBreaker = BreakIterator.getWordInstance(locale);
  // Find sentence boundaries first, then group the words by those boundaries.
  ArrayList<Integer> boundaries = findSentenceBoundaries(inputText, sentenceBreaker);
  HashMap<String, WordTuple> words = findWordsInSentences(inputText, wordBreaker, boundaries);
  printProcessedData(boundaries, words);
}
public void testSentenceBoundary() throws Exception { StringBuilder text = new StringBuilder(TEXT); // we test this with default locale, it's randomized by LuceneTestCase BreakIterator bi = BreakIterator.getSentenceInstance(Locale.getDefault()); BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); int start = TEXT.indexOf("any application"); int expected = TEXT.indexOf("It is a"); testFindStartOffset(text, start, expected, scanner); expected = TEXT.indexOf("Apache Lucene is an open source"); testFindEndOffset(text, start, expected, scanner); }
/** * Extract out sentences from the reviews. to take into account the negative lists Later Check * Stanford Document tokenizer */ private void BreakInLines() { // this.Lines = review.split(". "); BreakIterator border = BreakIterator.getSentenceInstance(Locale.US); border.setText(review); // System.out.println(review); int start = border.first(); // iterate, creating sentences out of all the Strings between the given boundaries for (int end = border.next(); end != BreakIterator.DONE; start = end, end = border.next()) { // System.out.println(review.substring(start,end)); Lines.add(review.substring(start, end)); NumOfSentences++; } // System.out.println(NumOfSentences); }
/**
 * Returns a short description: the first sentence when the text contains a
 * period, otherwise its first line. Null in, null out.
 */
static String extractShortDescription(String description) {
  if (description == null) {
    return null;
  }
  if (description.indexOf(".") == -1) {
    // No sentence punctuation at all — fall back to the first line.
    return description.split(NEW_LINE)[0].trim();
  }
  // Let the sentence BreakIterator pick out the first full sentence.
  BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.US);
  sentences.setText(description);
  int first = sentences.first();
  int next = sentences.next();
  String firstSentence = description.substring(first, next).trim();
  return removeSpaceBetweenLine(firstSentence);
}
/**
 * Verifies en_US sentence detection: the BreakIterator must split
 * TEST_STRING into contiguous, in-order sentences.
 */
@Test
public void testSentenceDetection() {
  BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.US);
  sentenceIterator.setText(TEST_STRING);
  int start = sentenceIterator.first();
  int end = -1;
  List<String> sentenceList = new ArrayList<String>();
  while ((end = sentenceIterator.next()) != BreakIterator.DONE) {
    String sentence = TEST_STRING.substring(start, end);
    start = end;
    sentenceList.add(sentence);
    System.out.println("Sentence: " + sentence);
  }
  // Fix: the original test asserted nothing, so it could never fail.
  // Verify the detected sentences are a complete partition of the input.
  StringBuilder rebuilt = new StringBuilder();
  for (String sentence : sentenceList) {
    rebuilt.append(sentence);
  }
  if (!TEST_STRING.contentEquals(rebuilt)) {
    throw new AssertionError("Detected sentences do not reconstruct the input text");
  }
}
/**
 * Builds a tokenizer over the full text of the given Swing document,
 * positioning the word cursor at the first word (if any).
 *
 * @param document the text document to tokenize
 */
public DocumentWordTokenizer(Document document) {
  this.document = document;
  // Create a text segment over the entire document
  text = new Segment();
  sentanceIterator = BreakIterator.getSentenceInstance();
  try {
    document.getText(0, document.getLength(), text);
    sentanceIterator.setText(text);
    currentWordPos = getNextWordStart(text, 0);
    // If the current word pos is -1 then the string was all white space
    if (currentWordPos != -1) {
      currentWordEnd = getNextWordEnd(text, currentWordPos);
      nextWordPos = getNextWordStart(text, currentWordEnd);
    } else {
      moreTokens = false;
    }
  } catch (BadLocationException ex) {
    // Document range was invalid; treat the document as an empty token stream.
    moreTokens = false;
  }
}
@Override protected BoundaryScanner get(String fieldName, SolrParams params) { // construct Locale String language = params.getFieldParam(fieldName, HighlightParams.BS_LANGUAGE); String country = params.getFieldParam(fieldName, HighlightParams.BS_COUNTRY); if (country != null && language == null) { throw new SolrException( ErrorCode.BAD_REQUEST, HighlightParams.BS_LANGUAGE + " parameter cannot be null when you specify " + HighlightParams.BS_COUNTRY); } Locale locale = null; if (language != null) { locale = country == null ? new Locale(language) : new Locale(language, country); } else { locale = Locale.ROOT; } // construct BreakIterator String type = params.getFieldParam(fieldName, HighlightParams.BS_TYPE, "WORD").toLowerCase(Locale.ROOT); BreakIterator bi = null; if (type.equals("character")) { bi = BreakIterator.getCharacterInstance(locale); } else if (type.equals("word")) { bi = BreakIterator.getWordInstance(locale); } else if (type.equals("line")) { bi = BreakIterator.getLineInstance(locale); } else if (type.equals("sentence")) { bi = BreakIterator.getSentenceInstance(locale); } else throw new SolrException( ErrorCode.BAD_REQUEST, type + " is invalid for parameter " + HighlightParams.BS_TYPE); return new org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner(bi); }
/**
 * Returns the {@link BreakIterator} used to divide text into passages.
 * Defaults to a root-locale sentence iterator; subclasses can override to
 * customize per field.
 */
protected BreakIterator getBreakIterator(String field) {
  BreakIterator sentenceBreaker = BreakIterator.getSentenceInstance(Locale.ROOT);
  return sentenceBreaker;
}
/**
 * Initializes the comment, inlineTags, and firstSentenceTags fields of this
 * object from the processed comment text.
 *
 * @param commentText the processed comment text
 */
private void procComment(String commentText) {
    // initialize inlineTags
    ArrayList<Tag> result = new ArrayList<Tag>();
    // Copy of the comment with inline-tag '@' signs masked so the block-tag
    // scan below does not mistake "{@link ...}" for a block tag.
    // NOTE(review): indices computed on noInlineTags are applied to
    // commentText — presumably replaceAtSigns is length-preserving; confirm.
    String noInlineTags = replaceAtSigns(commentText);
    /* (A long commented-out regex-based inline-tag parser originally lived
       here: it matched "\\{\\s*@[^}]*\\}" to build alternating "Text" and
       "@link" Tag objects, and was superseded by createInlineTags /
       X10Tag.processInlineTag.) */
    if (!commentText.startsWith("@")) { // make sure that there is a beginning paragraph
        // initialize comment: everything before the first block tag
        int blockTagStart = noInlineTags.indexOf("@"); // start of block tags within comment
        blockTagStart = (blockTagStart == -1) ? commentText.length() : blockTagStart;
        this.comment = commentText.substring(0, blockTagStart).trim();
        if (!comment.equals("")) {
            result.addAll(createInlineTags(comment, this));
        }
        // (commented-out original code: appended declString() constraints as
        // an extra X10Tag when present)
        // initialize firstSentenceTags using the default-locale sentence breaker
        BreakIterator b = BreakIterator.getSentenceInstance();
        b.setText(comment);
        int start = 0;
        int end = 0;
        start = b.first();
        end = b.next();
        // b.next() may return DONE (-1) for an empty comment, hence the guard.
        String firstSentence = ((start <= end) ? comment.substring(start, end).trim() : "");
        firstSentenceTags = createInlineTags(firstSentence, this).toArray(new X10Tag[0]);
    } else {
        firstSentenceTags = new X10Tag[0];
    }
    inlineTags = result.toArray(new X10Tag[0]);
    // TODO: creating Tag objects for block tags and storing them in a field
    // of this object
    Pattern blockTagPattern = Pattern.compile("\\s*@[^@]*");
    Matcher blockTagMatcher = blockTagPattern.matcher(noInlineTags);
    while (blockTagMatcher.find()) {
        String tagText = blockTagMatcher.group();
        int start = blockTagMatcher.start();
        // Substring from commentText (not noInlineTags) so inline-tag text is intact.
        processBlockTag(commentText.substring(start, start + tagText.length()));
    }
}