/**
 * Emits the document key together with its classification codes, but only
 * for documents that carry more codes than the supplied limit.
 *
 * <p>Field 1 of {@code input} is read as a {@code DataByteArray} and parsed
 * with {@code DocumentMetadata.parseFrom}; field 2 is read as the
 * {@code Integer} limit. Field 0 is not used by this method.
 *
 * @param input the tuple to process
 * @return a tuple {@code (key, bag of one-field code tuples, code count)}
 *         when the number of classification code values is strictly greater
 *         than the limit; {@code null} when it is not, when {@code input} is
 *         {@code null} or empty, or when field 1 is {@code null}
 * @throws IOException if a field cannot be read, if the limit field is
 *         {@code null}, or if the bytes cannot be parsed as a
 *         {@code DocumentMetadata} message
 * @throws ClassCastException if field 1 is not a {@code DataByteArray} or
 *         field 2 is not an {@code Integer}
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    // The cast happens here, so the ClassCastException handler must live on
    // this try block; a later re-cast of the same reference can never fail.
    DataByteArray dba;
    try {
        dba = (DataByteArray) input.get(1);
    } catch (ExecException e) {
        logger.error("Error in reading field proto:", e);
        throw e;
    } catch (ClassCastException e) {
        logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
        throw e;
    }

    Integer limnum;
    try {
        limnum = (Integer) input.get(2);
    } catch (ExecException e) {
        logger.error("Error in reading baglimit:", e);
        throw e;
    } catch (ClassCastException e) {
        logger.error("Error in casting Object (" + input.getType(2) + ") to Integer:", e);
        throw e;
    }

    // A null document field is treated like missing input: nothing to emit.
    if (dba == null) {
        return null;
    }
    // Fail with a descriptive message instead of a NullPointerException on unboxing.
    if (limnum == null) {
        throw new IOException("baglimit (field 2) must not be null");
    }

    DocumentMetadata dm;
    try {
        dm = DocumentMetadata.parseFrom(dba.get());
    } catch (InvalidProtocolBufferException e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
    }

    TupleFactory tupleFactory = TupleFactory.getInstance();
    DataBag db = new DefaultDataBag();
    int bagsize = 0;
    for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
        for (String codeValue : code.getValueList()) {
            db.add(tupleFactory.newTuple(codeValue));
            bagsize++;
        }
    }

    // Only documents with strictly more codes than the limit are reported.
    if (bagsize <= limnum.intValue()) {
        return null;
    }
    Object[] fields = new Object[] {dm.getKey(), db, bagsize};
    return tupleFactory.newTuple(Arrays.asList(fields));
}
/**
 * Collects the keywords and the classification codes of a document.
 *
 * <p>Only keyword lists whose language equals the {@code language} field
 * (ignoring case) are considered. Within those lists, every entry accepted by
 * {@code isClassifCode} is collected as a category; every other entry is
 * normalised according to {@code action} and collected as a keyword. All
 * values of the document's classification codes are added to the categories
 * as well.
 *
 * @param dm the document metadata to read from
 * @return a pair of the normalised keywords joined with single spaces and a
 *         bag holding one single-field tuple per distinct category
 */
private Pair<String, DataBag> extractLangKeywords(DocumentMetadata dm) {
    List<String> keywords = new ArrayList<String>();
    Set<String> categories = new HashSet<String>();

    for (KeywordsList keywordsList : dm.getKeywordsList()) {
        if (!language.equalsIgnoreCase(keywordsList.getLanguage())) {
            continue;
        }
        for (String raw : keywordsList.getKeywordsList()) {
            if (isClassifCode(raw)) {
                categories.add(raw);
            } else {
                // if/else rather than switch: any other action value falls
                // through to the non-alphanumeric removal.
                String normalised;
                if (action == Action.TRANSLATE) {
                    normalised = translateNonAlphaNumeric(raw);
                } else if (action == Action.REMOVE_KEYCHARACTERS) {
                    normalised = removeAllKeyPunctations(raw);
                } else {
                    normalised = removeAllNonAlphaNumeric(raw);
                }
                keywords.add(normalised);
            }
        }
    }

    for (ClassifCode classifCode : dm.getBasicMetadata().getClassifCodeList()) {
        for (String value : classifCode.getValueList()) {
            categories.add(value);
        }
    }

    TupleFactory tupleFactory = TupleFactory.getInstance();
    DataBag categoryBag = new DefaultDataBag();
    for (String category : categories) {
        categoryBag.add(tupleFactory.newTuple(category));
    }

    String joinedKeywords = Joiner.on(" ").join(keywords);
    return new Pair<String, DataBag>(joinedKeywords, categoryBag);
}