/**
 * Expands one serialized document into a bag holding one tuple per author.
 *
 * <p>The first field of {@code input} is read as a {@link DataByteArray} and parsed into a
 * {@code DocumentWrapper}. For every entry of the document's author list a tuple
 * {@code (surname, serialized document metadata, position of the author in the list)} is
 * added to the result.
 *
 * @param input tuple whose first field carries the serialized document
 * @return a bag with one tuple per author, or {@code null} when {@code input} is
 *         {@code null} or has no fields
 * @throws IOException if reading, parsing or tuple creation fails; the message embeds the
 *         stack trace of the original failure
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    try {
        // Step 1: pull the raw bytes out of the first field.
        DataByteArray rawField;
        try {
            rawField = (DataByteArray) input.get(0);
        } catch (ExecException e) {
            logger.error("Error in reading field:", e);
            throw e;
        }

        // Step 2: deserialize the document.
        DocumentWrapper document;
        try {
            document = DocumentWrapper.parseFrom(rawField.get());
        } catch (Exception e) {
            logger.error("Error in reading ByteArray to DocumentMetadata:", e);
            throw e;
        }

        // Step 3: emit (surname, metadata, position) for each author.
        DataBag result = new DefaultDataBag();
        DataByteArray metadataBytes =
                new DataByteArray(document.getDocumentMetadata().toByteArray());
        List<Author> authorList =
                document.getDocumentMetadata().getBasicMetadata().getAuthorList();

        int position = 0;
        for (Author author : authorList) {
            String surname = author.getSurname();
            Tuple row = TupleFactory.getInstance()
                    .newTuple(Arrays.<Object>asList(surname, metadataBytes, position));
            result.add(row);
            position++;
        }
        return result;
    } catch (Exception e) {
        // Any failure above (including the rethrown ones) is reported as an IOException.
        logger.error("Error in processing input row:", e);
        throw new IOException(
                "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
}
/**
 * Extracts the DOI, publication year and a normalized title from a serialized document.
 *
 * <p>The first field of {@code input} is read as a {@link DataByteArray} and parsed into a
 * {@code DocumentWrapper}, from which the document metadata is taken. The title is the first
 * one whose language starts with "en" (case-insensitive), falling back to the first title in
 * the list. A non-blank title has its diacritics removed, every character outside
 * {@code [A-Za-z0-9-_]} replaced by a space, and runs of whitespace collapsed. DOI and year
 * only get their whitespace collapsed and trimmed.
 *
 * <p>Whenever a piece cannot be extracted, the matching counter in the
 * "extraction problems" group is incremented and {@code null} is returned.
 *
 * @param input tuple whose first field carries the serialized document
 * @return a tuple {@code (doi, year, title)}, or {@code null} when {@code input} is
 *         {@code null}/empty or any of the three values is missing or blank
 * @throws IOException wrapping any unexpected exception raised while building the result
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    myreporter = PigStatusReporter.getInstance();
    if (input == null || input.size() == 0) {
        return null;
    }

    try {
        DataByteArray dba;
        try {
            dba = (DataByteArray) input.get(0);
        } catch (Exception e) {
            logger.debug(StackTraceExtractor.getStackTrace(e));
            reportExtractionProblem("DataByteArray from tuple");
            return null;
        }
        if (dba == null) {
            // A null field is a tuple-level problem, not a parsing problem.
            reportExtractionProblem("DataByteArray from tuple");
            return null;
        }

        DocumentMetadata dm;
        try {
            dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
        } catch (Exception e) {
            logger.debug(StackTraceExtractor.getStackTrace(e));
            reportExtractionProblem("document metadata");
            return null;
        }

        String title = null;
        try {
            // Prefer an English title when one is available.
            for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
                if (twl.getLanguage().toLowerCase().startsWith("en")) {
                    title = twl.getText();
                    break;
                }
            }
            // Otherwise fall back to the first title, if the list has any entry at all.
            if (title == null && !dm.getBasicMetadata().getTitleList().isEmpty()) {
                title = dm.getBasicMetadata().getTitle(0).getText();
            }
            if (title != null && !title.trim().isEmpty()) {
                title = DiacriticsRemover.removeDiacritics(title);
                title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
            }
        } catch (Exception e) {
            // Best effort: keep whatever title value was obtained before the failure.
            logger.debug(StackTraceExtractor.getStackTrace(e));
        }
        if (title == null || title.trim().isEmpty()) {
            reportExtractionProblem("title extraction");
            return null;
        }

        String doi = null;
        try {
            doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            logger.debug(StackTraceExtractor.getStackTrace(e));
        }
        if (doi == null || doi.isEmpty()) {
            reportExtractionProblem("doi extraction");
            return null;
        }

        String year = null;
        try {
            year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            logger.debug(StackTraceExtractor.getStackTrace(e));
        }
        if (year == null || year.isEmpty()) {
            reportExtractionProblem("year extraction");
            return null;
        }

        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(doi);
        t.append(year);
        t.append(title);
        return t;
    } catch (Exception e) {
        logger.debug(StackTraceExtractor.getStackTrace(e));
        throw new IOException(e);
    }
}

/**
 * Increments the named counter in the "extraction problems" group by one.
 *
 * @param counterName name of the counter inside the "extraction problems" group
 */
private void reportExtractionProblem(String counterName) {
    // NOTE(review): getCounter appears to return null when no task context is available
    // (e.g. outside a running job) - confirm against the Pig version in use. The lookup is
    // repeated instead of stored so that no additional type has to be imported here.
    if (myreporter != null && myreporter.getCounter("extraction problems", counterName) != null) {
        myreporter.getCounter("extraction problems", counterName).increment(1);
    }
}