/**
 * The reduce function which packages the key and List<Tuple> into key, Bag<Tuple>
 * after converting the Hadoop type key into a Pig type. The package result is either
 * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
 */
@Override
protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {
    if (!initialized) {
        initialized = true;

        // Cache the collector for use in runPipeline(),
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning = "true".equalsIgnoreCase(
                pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator) {
            for (POStore store : stores) {
                MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
                store.setStoreImpl(impl);
                store.setUp();
            }
        }
    }

    // When the join is optimized, POPackage and POForeach are combined,
    // so a single getNext() call on POJoinPackage can produce many tuples.
    // In that case, process until POJoinPackage.getNext() returns EOP.
    if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
            if (processOnePackageOutput(context)) {
                break;
            }
        }
    } else {
        // The join is not optimized, so the package will
        // produce only one tuple for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
    }
}
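/*
 * Hedged sketch (not part of the original source): once the setup above has wired
 * PigStatusReporter into PigHadoopLogger and installed it via
 * PhysicalOperator.setPigLogger(), warnings raised through EvalFunc.warn() flow to
 * that logger and, with aggregate.warning=true, are aggregated into counters rather
 * than logged individually. The UDF name "NullSafeTrim" is invented for illustration.
 */
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.PigWarning;
import org.apache.pig.data.Tuple;

public class NullSafeTrim extends EvalFunc<String> {
    @Override
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0 || input.get(0) == null) {
            // Routed to the PigLogger installed in the reduce setup above;
            // aggregated when aggregate.warning=true
            warn("null or empty input", PigWarning.UDF_WARNING_1);
            return null;
        }
        return ((String) input.get(0)).trim();
    }
}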
public String exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataBag bag = (DataBag) input.get(0);
        StringBuilder sb = new StringBuilder();
        for (Iterator<Tuple> iter = bag.iterator(); iter.hasNext(); ) {
            Tuple next = iter.next();
            sb.append(next.get(0).toString());
            if (iter.hasNext()) {
                sb.append(" ");
            }
        }
        return sb.toString();
    } catch (Exception e) {
        System.err.println("StrJoin_Exception " + e.getClass().getName());
        if (PigStatusReporter.getInstance() != null) {
            PigStatusReporter.getInstance()
                    .getCounter("StrJoin_Exception", e.getClass().getName())
                    .increment(1);
        }
        return null;
    }
}
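/*
 * Hedged usage sketch (not from the original source): exercises the exec() method above
 * directly, e.g. from a unit test, by building a single-column bag. The class name
 * "StrJoin" is assumed from the "StrJoin_Exception" counter group used above.
 */
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class StrJoinExample {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        DataBag bag = BagFactory.getInstance().newDefaultBag();
        for (String word : new String[] { "join", "these", "words" }) {
            Tuple t = tf.newTuple(1);
            t.set(0, word);
            bag.add(t);
        }
        Tuple input = tf.newTuple(1);
        input.set(0, bag);
        // Prints "join these words"
        System.out.println(new StrJoin().exec(input));
    }
}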
@Override
public Tuple exec(Tuple input) throws IOException {
    myreporter = PigStatusReporter.getInstance();
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataByteArray dba = null;
        DocumentMetadata dm = null;
        String title = null;
        String doi = null;
        String year = null;

        try {
            dba = (DataByteArray) input.get(0);
        } catch (Exception e) {
            myreporter.getCounter("extraction problems", "DataByteArray from tuple").increment(1);
            return null;
        }

        try {
            dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
        } catch (Exception e) {
            myreporter.getCounter("extraction problems", "document metadata").increment(1);
            return null;
        }

        // Prefer an English title, falling back to the first title; any failure
        // leaves title null and is reported in the finally block below.
        try {
            for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
                if (twl.getLanguage().toLowerCase().startsWith("en")) {
                    title = twl.getText();
                    break;
                }
            }
            if (title == null) {
                title = dm.getBasicMetadata().getTitle(0).getText();
            }
            if (title != null && !title.trim().isEmpty()) {
                title = DiacriticsRemover.removeDiacritics(title);
                title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
            }
        } catch (Exception e) {
            // ignored: handled by the null/empty check below
        } finally {
            if (title == null || title.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "title extraction").increment(1);
                return null;
            }
        }

        try {
            doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            // ignored: handled by the null/empty check below
        } finally {
            if (doi == null || doi.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "doi extraction").increment(1);
                return null;
            }
        }

        try {
            year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
        } catch (Exception e) {
            // ignored: handled by the null/empty check below
        } finally {
            if (year == null || year.trim().isEmpty()) {
                myreporter.getCounter("extraction problems", "year extraction").increment(1);
                return null;
            }
        }

        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(doi);
        t.append(year);
        t.append(title);
        return t;
    } catch (Exception e) {
        logger.debug(StackTraceExtractor.getStackTrace(e));
        throw new IOException(e);
    }
}
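/*
 * Hedged sketch (not from the original source): isolates the title-normalization step
 * used above so its effect is easy to see. java.text.Normalizer stands in for the
 * project-specific DiacriticsRemover; that substitution is an assumption, not the
 * original helper.
 */
import java.text.Normalizer;

public class TitleNormalizeExample {
    public static void main(String[] args) {
        String title = "Über  die Quanten-Theorie: eine Übersicht!";
        // Stand-in for DiacriticsRemover.removeDiacritics(title):
        // decompose, then drop combining marks
        String ascii = Normalizer.normalize(title, Normalizer.Form.NFD)
                .replaceAll("\\p{M}+", "");
        // Same regex pipeline as the UDF above
        String normalized = ascii.replaceAll("[^A-Za-z0-9\\-_]", " ")
                .replaceAll("\\s++", " ")
                .trim();
        // Prints "Uber die Quanten-Theorie eine Ubersicht"
        System.out.println(normalized);
    }
}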
/**
 * The reduce function which packages the key and List<Tuple> into key, Bag<Tuple>
 * after converting the Hadoop type key into a Pig type. The package result is either
 * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
 */
@Override
protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {
    if (!initialized) {
        initialized = true;

        // Cache the collector for use in runPipeline(),
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning = "true".equalsIgnoreCase(
                pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        for (POStore store : stores) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
        }
    }

    // If the keyType is not a tuple, MapWithComparator.collect() would have
    // wrapped the key into a tuple so that the comparison UDF used in the
    // order by can process it. We need to unwrap the key out of the tuple
    // and hand it to the POPackage for processing.
    if (keyType != DataType.TUPLE) {
        Tuple t = (Tuple) (key.getValueAsPigType());
        try {
            key = HDataType.getWritableComparableTypes(t.get(0), keyType);
        } catch (ExecException e) {
            throw e;
        }
    }

    pack.attachInput(key, tupIter.iterator());

    Result res = pack.getNextTuple();
    if (res.returnStatus == POStatus.STATUS_OK) {
        Tuple packRes = (Tuple) res.result;
        if (rp.isEmpty()) {
            context.write(null, packRes);
            return;
        }
        rp.attachInput(packRes);
        List<PhysicalOperator> leaves = rp.getLeaves();
        PhysicalOperator leaf = leaves.get(0);
        runPipeline(leaf);
    }

    if (res.returnStatus == POStatus.STATUS_NULL) {
        return;
    }

    if (res.returnStatus == POStatus.STATUS_ERR) {
        int errCode = 2093;
        String msg = "Encountered error in package operator while processing group.";
        throw new ExecException(msg, errCode, PigException.BUG);
    }
}