@Override
  public DataBag exec(Tuple input) throws IOException {

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      try {
        dba = (DataByteArray) input.get(0);
      } catch (ExecException e) {
        logger.error("Error in reading field:", e);
        throw e;
      }

      DocumentWrapper dm = null;
      try {
        dm = DocumentWrapper.parseFrom(dba.get());
      } catch (Exception e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
      }

      DataBag ret = new DefaultDataBag();
      DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());

      List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();

      for (int i = 0; i < authors.size(); i++) {
        String sname = authors.get(i).getSurname();
        Object[] to = new Object[] {sname, metadata, i};
        Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
        ret.add(t);
      }

      return ret;

    } catch (Exception e) {
      logger.error("Error in processing input row:", e);
      throw new IOException(
          "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
  }
예제 #2
0
  @Override
  public Tuple exec(Tuple input) throws IOException {

    myreporter = PigStatusReporter.getInstance();

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      DocumentMetadata dm = null;
      String title = null;
      String doi = null;
      String year = null;

      try {
        dba = (DataByteArray) input.get(0);
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "DataByteArray from tuple");
        return null;
      }

      try {
        dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "document metadata");
        return null;
      }

      try {
        for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
          if (twl.getLanguage().toLowerCase().startsWith("en")) {
            title = twl.getText();

            break;
          }
        }
        if (title == null) {
          title = dm.getBasicMetadata().getTitle(0).getText();
        }
        if (title != null && !title.trim().isEmpty()) {
          title = DiacriticsRemover.removeDiacritics(title);
          title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
        }
      } catch (Exception e) {
      } finally {
        if (title == null || title.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "title extraction");
          return null;
        }
      }

      try {
        doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
      } finally {
        if (doi == null || doi.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "doi extraction");
          return null;
        }
      }

      try {
        year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
      } finally {
        if (year == null || year.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "year extraction");
          return null;
        }
      }

      Tuple t = TupleFactory.getInstance().newTuple();
      t.append(doi);
      t.append(year);
      t.append(title);

      return t;
    } catch (Exception e) {
      logger.debug(StackTraceExtractor.getStackTrace(e));
      throw new IOException(e);
    }
  }