Example #1
 // Concatenates the first field of every tuple in the input bag into a single
 // space-separated string, counting failures through a PigStatusReporter counter.
 public String exec(Tuple input) throws IOException {
   if (input == null || input.size() == 0) return null;
   try {
     DataBag bag = (DataBag) input.get(0);
     StringBuilder sb = new StringBuilder();
     for (Iterator<Tuple> iter = bag.iterator(); iter.hasNext(); ) {
       Tuple next = iter.next();
       sb.append(next.get(0).toString());
       if (iter.hasNext()) sb.append(" ");
     }
     return sb.toString();
   } catch (Exception e) {
     System.err.println("StrJoin_Exception " + e.getClass().getName());
     if (PigStatusReporter.getInstance() != null) {
       PigStatusReporter.getInstance()
           .getCounter("StrJoin_Exception", e.getClass().getName())
           .increment(1);
     }
     return null;
   }
 }
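
For context, a minimal driver sketch showing how an exec() like the one above can be exercised off-cluster with hand-built Pig tuples and bags. Assumptions: the method lives in a UDF class named StrJoin with a no-arg constructor; both StrJoin and StrJoinExample are illustrative names, not part of the excerpt.

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class StrJoinExample {
  public static void main(String[] args) throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    BagFactory bf = BagFactory.getInstance();

    // Build a bag of single-field tuples: {("hello"), ("pig"), ("world")}
    DataBag bag = bf.newDefaultBag();
    for (String word : new String[] {"hello", "pig", "world"}) {
      Tuple t = tf.newTuple(1);
      t.set(0, word);
      bag.add(t);
    }

    // Wrap the bag in the UDF's single-field input tuple and call exec()
    Tuple input = tf.newTuple(1);
    input.set(0, bag);
    System.out.println(new StrJoin().exec(input)); // expected: "hello pig world"
  }
}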
    /**
     * The reduce function which packages the key and List&lt;Tuple&gt; into key, Bag&lt;Tuple&gt;
      * after converting the Hadoop-type key into the Pig type. The package result is either
      * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
     */
    @Override
    protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {

      if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator) {
          for (POStore store : stores) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
          }
        }
      }

      // In the case we optimize the join, we combine
      // POPackage and POForeach - so we could get many
      // tuples out of the getNext() call of POJoinPackage.
      // In this case, we process till we see EOP from
      // POJoinPackage.getNext()
      if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
          if (processOnePackageOutput(context)) break;
        }
      } else {
        // join is not optimized, so package will
        // give only one tuple out for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
      }
    }
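
processOnePackageOutput(context) is not shown in this excerpt; the following is a sketch of its likely shape, inferred from the inline status handling in the second reduce() later in this section (STATUS_OK emits the packaged tuple or runs the reduce plan, STATUS_ERR raises, and returning true on EOP ends the caller's loop). The signature and body are assumptions, not the actual Pig source.

    // Sketch only: assumed member of the same Reduce class, so pack, rp and
    // runPipeline(...) refer to the fields/methods used in reduce() above.
    protected boolean processOnePackageOutput(Context oc) throws IOException, InterruptedException {
      Result res = pack.getNextTuple();
      if (res.returnStatus == POStatus.STATUS_OK) {
        Tuple packRes = (Tuple) res.result;
        if (rp.isEmpty()) {
          // no reduce plan: emit the packaged tuple as-is
          oc.write(null, packRes);
          return false;
        }
        // otherwise push the packaged tuple through the reduce plan
        rp.attachInput(packRes);
        runPipeline(rp.getLeaves().get(0));
      }
      if (res.returnStatus == POStatus.STATUS_ERR) {
        int errCode = 2093;
        String msg = "Encountered error in package operator while processing group.";
        throw new ExecException(msg, errCode, PigException.BUG);
      }
      // EOP signals that this key's input is exhausted
      return res.returnStatus == POStatus.STATUS_EOP;
    }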
  // Extracts a (doi, year, title) tuple from a serialized DocumentWrapper,
  // reporting extraction failures through PigStatusReporter counters.
  @Override
  public Tuple exec(Tuple input) throws IOException {

    myreporter = PigStatusReporter.getInstance();

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      DocumentMetadata dm = null;
      String title = null;
      String doi = null;
      String year = null;

      try {
        dba = (DataByteArray) input.get(0);
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "DataByteArray from tuple");
        return null;
      }

      try {
        dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "document metadata");
        return null;
      }

      try {
        for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
          if (twl.getLanguage().toLowerCase().startsWith("en")) {
            title = twl.getText();

            break;
          }
        }
        if (title == null) {
          title = dm.getBasicMetadata().getTitle(0).getText();
        }
        if (title != null && !title.trim().isEmpty()) {
          title = DiacriticsRemover.removeDiacritics(title);
          title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
        }
      } catch (Exception e) {
        // ignored: the finally block below reports a missing title and returns null
      } finally {
        if (title == null || title.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "title extraction").increment(1);
          return null;
        }
      }

      try {
        doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignored: the finally block below reports a missing DOI and returns null
      } finally {
        if (doi == null || doi.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "doi extraction").increment(1);
          return null;
        }
      }

      try {
        year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignored: the finally block below reports a missing year and returns null
      } finally {
        if (year == null || year.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "year extraction").increment(1);
          return null;
        }
      }

      Tuple t = TupleFactory.getInstance().newTuple();
      t.append(doi);
      t.append(year);
      t.append(title);

      return t;
    } catch (Exception e) {
      logger.debug(StackTraceExtractor.getStackTrace(e));
      throw new IOException(e);
    }
  }
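
PigStatusReporter.getCounter(group, name) can return null when no live task context is attached (for example when a UDF like the one above is exercised locally in a test), so a null-safe increment keeps counter bookkeeping from throwing. A minimal sketch; the names CounterUtil and bumpCounter are illustrative, not part of the excerpt:

import org.apache.hadoop.mapreduce.Counter;
import org.apache.pig.tools.pigstats.PigStatusReporter;

final class CounterUtil {
  private CounterUtil() {}

  // Increment a Hadoop counter through PigStatusReporter, tolerating a
  // missing reporter or a null counter instead of throwing.
  static void bumpCounter(String group, String name) {
    PigStatusReporter reporter = PigStatusReporter.getInstance();
    if (reporter == null) {
      return;
    }
    Counter counter = reporter.getCounter(group, name);
    if (counter != null) {
      counter.increment(1);
    }
  }
}

The first exec() in this section guards the reporter the same way before incrementing its "StrJoin_Exception" counter.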
    /**
     * The reduce function which packages the key and List&lt;Tuple&gt; into key, Bag&lt;Tuple&gt;
      * after converting the Hadoop-type key into the Pig type. The package result is either
      * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
     */
    @Override
    protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {

      if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        for (POStore store : stores) {
          MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
          store.setStoreImpl(impl);
          store.setUp();
        }
      }

      // If the keyType is not a tuple, the MapWithComparator.collect()
      // would have wrapped the key into a tuple so that the
      // comparison UDF used in the order by can process it.
      // We need to unwrap the key out of the tuple and hand it
      // to the POPackage for processing
      if (keyType != DataType.TUPLE) {
        Tuple t = (Tuple) (key.getValueAsPigType());
        key = HDataType.getWritableComparableTypes(t.get(0), keyType);
      }

      pack.attachInput(key, tupIter.iterator());

      Result res = pack.getNextTuple();
      if (res.returnStatus == POStatus.STATUS_OK) {
        Tuple packRes = (Tuple) res.result;

        if (rp.isEmpty()) {
          context.write(null, packRes);
          return;
        }

        rp.attachInput(packRes);

        List<PhysicalOperator> leaves = rp.getLeaves();

        PhysicalOperator leaf = leaves.get(0);
        runPipeline(leaf);
      }

      if (res.returnStatus == POStatus.STATUS_NULL) {
        return;
      }

      if (res.returnStatus == POStatus.STATUS_ERR) {
        int errCode = 2093;
        String msg = "Encountered error in package operator while processing group.";
        throw new ExecException(msg, errCode, PigException.BUG);
      }
    }