  @Override
  public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    Object obj = null;
    Integer limnum = null;
    try {
      obj = input.get(1); // raw read; the cast to DataByteArray is done (and logged) below

    } catch (ExecException e) {
      logger.error("Error in reading field proto:", e);
      throw e;
    }

    try {
      limnum = (Integer) input.get(2);
    } catch (ExecException e) {
      logger.error("Error in reading baglimit:", e);
      throw e;
    }

    DataByteArray dba = null;
    try {
      dba = (DataByteArray) obj;
    } catch (ClassCastException e) {
      logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
      throw e;
    }

    DocumentMetadata dm = null;
    try {
      dm = DocumentMetadata.parseFrom(dba.get());
    } catch (InvalidProtocolBufferException e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }

    String key = dm.getKey();
    DataBag db = new DefaultDataBag();
    int bagsize = 0;
    for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
      for (String co_str : code.getValueList()) {
        bagsize++;
        db.add(TupleFactory.getInstance().newTuple(co_str));
      }
    }
    if (bagsize > limnum) {
      Object[] to = new Object[] {key, db, bagsize};
      return TupleFactory.getInstance().newTuple(Arrays.asList(to));
    }
    return null;
  }
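  // Sketch, not part of the original source: an EvalFunc of this kind usually pairs exec
  // with an outputSchema so downstream Pig scripts see named fields. The field names used
  // here (key, codes, bagsize) are assumptions chosen for illustration; requires
  // org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.pig.data.DataType and
  // org.apache.pig.impl.logicalLayer.FrontendException.
  @Override
  public Schema outputSchema(Schema input) {
    try {
      Schema bagSchema = new Schema(new Schema.FieldSchema("code", DataType.CHARARRAY));
      Schema tupleSchema = new Schema();
      tupleSchema.add(new Schema.FieldSchema("key", DataType.CHARARRAY));
      tupleSchema.add(new Schema.FieldSchema("codes", bagSchema, DataType.BAG));
      tupleSchema.add(new Schema.FieldSchema("bagsize", DataType.INTEGER));
      return new Schema(new Schema.FieldSchema(null, tupleSchema, DataType.TUPLE));
    } catch (FrontendException e) {
      throw new RuntimeException(e);
    }
  }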
  @Override
  public Map exec(Tuple input) throws IOException {
    try {
      DataByteArray protoMetadata = (DataByteArray) input.get(0);
      int lim = (Integer) input.get(1);
      DocumentMetadata metadata = DocumentMetadata.parseFrom(protoMetadata.get());

      if (language != null) {
        return generateConcreteLanguageMap(metadata, lim);
      } else {
        return generateAllLanguageMap(metadata, lim);
      }
    } catch (Exception e) {
      logger.error("Error in processing input row:", e);
      throw new IOException(
          "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
  }
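  // Call sketch (assumption, not from the original source): how this UDF can be exercised
  // directly, e.g. in a test. "runExample" is a hypothetical helper, the limit value 5 is
  // arbitrary, and the UDF itself decides between per-language and all-language maps.
  public Map runExample(DocumentMetadata metadata) throws IOException {
    Tuple in = TupleFactory.getInstance().newTuple(2);
    in.set(0, new DataByteArray(metadata.toByteArray())); // serialized DocumentMetadata
    in.set(1, 5);                                         // map-size limit
    return exec(in);
  }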
  @Override
  public DataBag exec(Tuple input) throws IOException {

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      try {
        dba = (DataByteArray) input.get(0);
      } catch (ExecException e) {
        logger.error("Error in reading field:", e);
        throw e;
      }

      DocumentWrapper dm = null;
      try {
        dm = DocumentWrapper.parseFrom(dba.get());
      } catch (Exception e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
      }

      DataBag ret = new DefaultDataBag();
      DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());

      List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();

      for (int i = 0; i < authors.size(); i++) {
        String sname = authors.get(i).getSurname();
        Object[] to = new Object[] {sname, metadata, i};
        Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
        ret.add(t);
      }

      return ret;

    } catch (Exception e) {
      logger.error("Error in processing input row:", e);
      throw new IOException(
          "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
  }
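  // Consumer sketch (assumption): each tuple emitted above carries (surname, serialized
  // DocumentMetadata, author position), so a later step can restore the metadata and
  // resolve the same author again. "readAuthor" is a hypothetical helper for illustration.
  static Author readAuthor(Tuple t) throws IOException {
    DocumentMetadata meta = DocumentMetadata.parseFrom(((DataByteArray) t.get(1)).get());
    int position = (Integer) t.get(2);
    return meta.getBasicMetadata().getAuthor(position);
  }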
  protected static HyperLogLogPlus countDisctinct(Tuple input, int p)
      throws NumberFormatException, IOException {
    HyperLogLogPlus estimator = new HyperLogLogPlus(p);
    DataBag values = (DataBag) input.get(0);
    for (Tuple t : values) {
      Object data = t.get(0);
      if (data instanceof Long) {
        // the field already holds a hashed value: feed it straight into the estimator
        estimator.offerHashed((Long) data);
      } else if (data instanceof DataByteArray) {
        // the field holds a serialized partial estimator: rebuild it and merge it in
        DataByteArray bytes = (DataByteArray) data;
        try {
          HyperLogLogPlus newEstimator = HyperLogLogPlus.Builder.build(bytes.get());
          estimator = (HyperLogLogPlus) estimator.merge(newEstimator);
        } catch (IOException | CardinalityMergeException e) {
          throw new RuntimeException(e);
        }
      }
    }
    return estimator;
  }
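  // Usage sketch (assumption, not from the original class): the estimator built above is
  // typically either asked for its approximate count or serialized again so further partial
  // results can be merged in a later pass. "summarize" is a hypothetical helper and p = 20
  // is an arbitrary precision choice.
  static Tuple summarize(Tuple input) throws IOException {
    HyperLogLogPlus estimator = countDisctinct(input, 20);
    long approxDistinct = estimator.cardinality();                      // approximate distinct count
    DataByteArray serialized = new DataByteArray(estimator.getBytes()); // for a later merge pass
    return TupleFactory.getInstance().newTuple(Arrays.asList(approxDistinct, serialized));
  }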
  @Override
  public byte[] toBytes(DataByteArray a) throws IOException {
    // hand back the DataByteArray's backing bytes unchanged
    return a.get();
  }
  @Override
  public Tuple exec(Tuple input) throws IOException {

    myreporter = PigStatusReporter.getInstance();

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      DocumentMetadata dm = null;
      String title = null;
      String doi = null;
      String year = null;

      try {
        dba = (DataByteArray) input.get(0);
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "DataByteArray from tuple");
        return null;
      }

      try {
        dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "document metadata");
        return null;
      }

      try {
        for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
          if (twl.getLanguage().toLowerCase().startsWith("en")) {
            title = twl.getText();

            break;
          }
        }
        if (title == null) {
          title = dm.getBasicMetadata().getTitle(0).getText();
        }
        if (title != null && !title.trim().isEmpty()) {
          title = DiacriticsRemover.removeDiacritics(title);
          title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
        }
      } catch (Exception e) {
        // ignored: a missing or empty title is reported via the counter in finally
      } finally {
        if (title == null || title.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "title extraction");
          return null;
        }
      }

      try {
        doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignored: a missing or empty DOI is reported via the counter in finally
      } finally {
        if (doi == null || doi.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "doi extraction");
          return null;
        }
      }

      try {
        year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignored: a missing or empty year is reported via the counter in finally
      } finally {
        if (year == null || year.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "year extraction");
          return null;
        }
      }

      Tuple t = TupleFactory.getInstance().newTuple();
      t.append(doi);
      t.append(year);
      t.append(title);

      return t;
    } catch (Exception e) {
      logger.debug(StackTraceExtractor.getStackTrace(e));
      throw new IOException(e);
    }
  }
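  // Illustration (assumption; uses the JDK's java.text.Normalizer rather than the project's
  // DiacriticsRemover) of what the title normalization above produces for an accented input.
  // "normalizeTitleExample" is a hypothetical helper added for illustration only.
  static String normalizeTitleExample() {
    String raw = "Analyse numérique: méthodes itératives!";
    String folded = Normalizer.normalize(raw, Normalizer.Form.NFD).replaceAll("\\p{M}", "");
    // yields "Analyse numerique methodes iteratives"
    return folded.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
  }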