@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object obj = null;
    Integer limnum = null;
    try {
        // Read the raw field here; the cast to DataByteArray is done (and guarded) below.
        obj = input.get(1);
    } catch (ExecException e) {
        logger.error("Error in reading field proto:", e);
        throw e;
    }
    try {
        limnum = (Integer) input.get(2);
    } catch (ExecException e) {
        logger.error("Error in reading baglimit:", e);
        throw e;
    }
    DataByteArray dba = null;
    try {
        dba = (DataByteArray) obj;
    } catch (ClassCastException e) {
        logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
        throw e;
    }
    DocumentMetadata dm = null;
    try {
        dm = DocumentMetadata.parseFrom(dba.get());
    } catch (InvalidProtocolBufferException e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
    }
    String key = dm.getKey();
    // Collect every classification code value into a bag of single-field tuples.
    DataBag db = new DefaultDataBag();
    int bagsize = 0;
    for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
        for (String co_str : code.getValueList()) {
            bagsize++;
            db.add(TupleFactory.getInstance().newTuple(co_str));
        }
    }
    // Emit (key, bag, size) only when the bag exceeds the requested limit.
    if (bagsize > limnum) {
        Object[] to = new Object[] {key, db, bagsize};
        return TupleFactory.getInstance().newTuple(Arrays.asList(to));
    }
    return null;
}
@Override
public Map exec(Tuple input) throws IOException {
    try {
        DataByteArray protoMetadata = (DataByteArray) input.get(0);
        int lim = (Integer) input.get(1);
        DocumentMetadata metadata = DocumentMetadata.parseFrom(protoMetadata.get());
        if (language != null) {
            return generateConcreteLanguageMap(metadata, lim);
        } else {
            return generateAllLanguageMap(metadata, lim);
        }
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException(
                "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray dba = null;
        try {
            dba = (DataByteArray) input.get(0);
        } catch (ExecException e) {
            logger.error("Error in reading field:", e);
            throw e;
        }
        DocumentWrapper dm = null;
        try {
            dm = DocumentWrapper.parseFrom(dba.get());
        } catch (Exception e) {
            logger.error("Error in reading ByteArray to DocumentWrapper:", e);
            throw e;
        }
        // Emit one (surname, serialized metadata, author index) tuple per author.
        DataBag ret = new DefaultDataBag();
        DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());
        List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();
        for (int i = 0; i < authors.size(); i++) {
            String sname = authors.get(i).getSurname();
            Object[] to = new Object[] {sname, metadata, i};
            Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
            ret.add(t);
        }
        return ret;
    } catch (Exception e) {
        logger.error("Error in processing input row:", e);
        throw new IOException(
                "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
protected static HyperLogLogPlus countDisctinct(Tuple input, int p)
        throws NumberFormatException, IOException {
    HyperLogLogPlus estimator = new HyperLogLogPlus(p);
    DataBag values = (DataBag) input.get(0);
    for (Tuple t : values) {
        Object data = t.get(0);
        if (data instanceof Long) {
            // Raw hashed value: offer it directly to the estimator.
            estimator.offerHashed((Long) data);
        } else if (data instanceof DataByteArray) {
            // Serialized partial estimator: deserialize it and merge it in.
            DataByteArray bytes = (DataByteArray) data;
            try {
                HyperLogLogPlus newEstimator = HyperLogLogPlus.Builder.build(bytes.get());
                estimator = (HyperLogLogPlus) estimator.merge(newEstimator);
            } catch (IOException | CardinalityMergeException e) {
                throw new RuntimeException(e);
            }
        }
    }
    return estimator;
}
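/*
 * Illustrative sketch (not part of the original sources): one way the
 * countDisctinct() helper above could be consumed by the final stage of a
 * distinct-count UDF. The class name, the precision constant, and the access
 * to the helper are assumptions; only HyperLogLogPlus.cardinality() from
 * stream-lib is relied upon.
 */
public static class DistinctCountFinal extends EvalFunc<Long> {
    // Assumed precision; higher values trade memory for accuracy.
    private static final int PRECISION = 15;

    @Override
    public Long exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        // Merge raw hashes and serialized partial estimators from the bag,
        // then report the estimated number of distinct values.
        HyperLogLogPlus estimator = countDisctinct(input, PRECISION);
        return estimator.cardinality();
    }
}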
@Override
public byte[] toBytes(DataByteArray a) throws IOException {
    return a.get();
}
@Override
public Tuple exec(Tuple input) throws IOException {
    myreporter = PigStatusReporter.getInstance();
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray dba = null;
        DocumentMetadata dm = null;
        String title = null;
        String doi = null;
        String year = null;
        try {
            dba = (DataByteArray) input.get(0);
        } catch (Exception e) {
            myreporter.getCounter("extraction problems", "DataByteArray from tuple").increment(1);
            return null;
        }
        try {
            dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
        } catch (Exception e) {
            myreporter.getCounter("extraction problems", "document metadata").increment(1);
            return null;
        }
        // Title: prefer an English title, fall back to the first one, then normalize.
        try {
            for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
                if (twl.getLanguage().toLowerCase().startsWith("en")) {
                    title = twl.getText();
                    break;
                }
            }
            if (title == null) {
                title = dm.getBasicMetadata().getTitle(0).getText();
            }
            if (title != null && !title.trim().isEmpty()) {
                title = DiacriticsRemover.removeDiacritics(title);
                title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
            }
        } catch (Exception e) {
            // Ignored: a missing or malformed title is reported through the null check below.
        } finally {
            if (title == null || title.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "title extraction").increment(1);
                return null;
            }
        }
        try {
            doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            // Ignored: a missing DOI is reported through the null check below.
        } finally {
            if (doi == null || doi.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "doi extraction").increment(1);
                return null;
            }
        }
        try {
            year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            // Ignored: a missing year is reported through the null check below.
        } finally {
            if (year == null || year.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "year extraction").increment(1);
                return null;
            }
        }
        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(doi);
        t.append(year);
        t.append(title);
        return t;
    } catch (Exception e) {
        logger.debug(StackTraceExtractor.getStackTrace(e));
        throw new IOException(e);
    }
}