public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output like a databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack tuple in order to get the bag {(1,2),(3,4),...} String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 18))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error with ?? .."); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } }
/** Matches an HTML comment, including one that spans several lines. */
private static final Pattern HTML_COMMENT_PATTERN =
    Pattern.compile("<!--(.+?)-->", Pattern.DOTALL);

/** Matches the shortest {@code <...>} run, including one that spans several lines. */
private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<.+?>", Pattern.DOTALL);

/**
 * Reduces an HTML document to the text inside its {@code <body>} element.
 *
 * <p>The first field of {@code input} is read as the HTML string. When both an opening
 * {@code <body ...>} tag and a later {@code </body} are found, only the content between them is
 * kept; otherwise the whole string is used. HTML comments are removed first, then every
 * remaining {@code <...>} run is removed. No entity decoding or whitespace normalisation is
 * done.
 *
 * @param input tuple whose first field is the HTML string
 * @return the stripped text, or an empty string when the tuple or the HTML string is null or
 *     empty
 * @throws IOException if the field cannot be read from the tuple
 */
@Override
public String exec(Tuple input) throws IOException {
  if (input == null || input.isNull()) {
    return "";
  }
  String _html = (String) input.get(0);
  if (_html == null || _html.isEmpty()) {
    return "";
  }

  String output = _html;

  // Check the indexOf result before offsetting it: the previous code added 5 first, which
  // turned "not found" (-1) into a valid-looking index.
  int bodyTagStart = _html.indexOf("<body");
  if (bodyTagStart >= 0) {
    // Skip to the end of the opening tag so its attributes and closing '>' are not kept as text.
    int bodyTagEnd = _html.indexOf('>', bodyTagStart);
    if (bodyTagEnd >= 0) {
      int contentStart = bodyTagEnd + 1;
      int contentEnd = _html.indexOf("</body", contentStart);
      if (contentEnd >= contentStart) {
        output = _html.substring(contentStart, contentEnd);
      }
    }
  }

  // Comments go first: the tag pattern would otherwise consume only up to the first '>' inside
  // a comment and leave the remainder behind.
  output = HTML_COMMENT_PATTERN.matcher(output).replaceAll("");
  output = HTML_TAG_PATTERN.matcher(output).replaceAll("");
  return output;
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
/**
 * Reports whether the given field is null by delegating to {@code t.isNull(fieldNum)}.
 *
 * @param fieldNum index of the field to check
 * @return whatever {@code t.isNull(fieldNum)} returns
 * @throws ExecException if the delegate throws it
 */
@Override
public boolean isNull(int fieldNum) throws ExecException {
  final boolean fieldIsNull = t.isNull(fieldNum);
  return fieldIsNull;
}