/** * The reduce function which packages the key and List<Tuple> into key, Bag<Tuple> * after converting Hadoop type key into Pig type. The package result is either collected as is, * if the reduce plan is empty or after passing through the reduce plan. */ @Override protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context) throws IOException, InterruptedException { if (!initialized) { initialized = true; // cache the collector for use in runPipeline() // which could additionally be called from close() this.outputCollector = context; pigReporter.setRep(context); PhysicalOperator.setReporter(pigReporter); boolean aggregateWarning = "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning")); PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance(); pigStatusReporter.setContext(new MRTaskContext(context)); PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance(); pigHadoopLogger.setReporter(pigStatusReporter); pigHadoopLogger.setAggregate(aggregateWarning); PhysicalOperator.setPigLogger(pigHadoopLogger); if (!inIllustrator) for (POStore store : stores) { MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context); store.setStoreImpl(impl); store.setUp(); } } // In the case we optimize the join, we combine // POPackage and POForeach - so we could get many // tuples out of the getnext() call of POJoinPackage // In this case, we process till we see EOP from // POJoinPacakage.getNext() if (pack.getPkgr() instanceof JoinPackager) { pack.attachInput(key, tupIter.iterator()); while (true) { if (processOnePackageOutput(context)) break; } } else { // join is not optimized, so package will // give only one tuple out for the key pack.attachInput(key, tupIter.iterator()); processOnePackageOutput(context); } }
/** * Will be called once all the intermediate keys and values are processed. So right place to * stop the reporter thread. */ @Override protected void cleanup(Context context) throws IOException, InterruptedException { super.cleanup(context); if (errorInReduce) { // there was an error in reduce - just return return; } if (PigMapReduce.sJobConfInternal.get().get("pig.stream.in.reduce", "false").equals("true")) { // If there is a stream in the pipeline we could // potentially have more to process - so lets // set the flag stating that all map input has been sent // already and then lets run the pipeline one more time // This will result in nothing happening in the case // where there is no stream in the pipeline rp.endOfAllInput = true; runPipeline(leaf); } if (!inIllustrator) { for (POStore store : stores) { if (!initialized) { MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context); store.setStoreImpl(impl); store.setUp(); } store.tearDown(); } } // Calling EvalFunc.finish() UDFFinishVisitor finisher = new UDFFinishVisitor(rp, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(rp)); try { finisher.visit(); } catch (VisitorException e) { throw new IOException("Error trying to finish UDFs", e); } PhysicalOperator.setReporter(null); initialized = false; }
/** * The reduce function which packages the key and List<Tuple> into key, Bag<Tuple> * after converting Hadoop type key into Pig type. The package result is either collected as is, * if the reduce plan is empty or after passing through the reduce plan. */ @Override protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context) throws IOException, InterruptedException { if (!initialized) { initialized = true; // cache the collector for use in runPipeline() // which could additionally be called from close() this.outputCollector = context; pigReporter.setRep(context); PhysicalOperator.setReporter(pigReporter); boolean aggregateWarning = "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning")); PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance(); pigStatusReporter.setContext(new MRTaskContext(context)); PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance(); pigHadoopLogger.setReporter(pigStatusReporter); pigHadoopLogger.setAggregate(aggregateWarning); PhysicalOperator.setPigLogger(pigHadoopLogger); for (POStore store : stores) { MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context); store.setStoreImpl(impl); store.setUp(); } } // If the keyType is not a tuple, the MapWithComparator.collect() // would have wrapped the key into a tuple so that the // comparison UDF used in the order by can process it. // We need to unwrap the key out of the tuple and hand it // to the POPackage for processing if (keyType != DataType.TUPLE) { Tuple t = (Tuple) (key.getValueAsPigType()); try { key = HDataType.getWritableComparableTypes(t.get(0), keyType); } catch (ExecException e) { throw e; } } pack.attachInput(key, tupIter.iterator()); Result res = pack.getNextTuple(); if (res.returnStatus == POStatus.STATUS_OK) { Tuple packRes = (Tuple) res.result; if (rp.isEmpty()) { context.write(null, packRes); return; } rp.attachInput(packRes); List<PhysicalOperator> leaves = rp.getLeaves(); PhysicalOperator leaf = leaves.get(0); runPipeline(leaf); } if (res.returnStatus == POStatus.STATUS_NULL) { return; } if (res.returnStatus == POStatus.STATUS_ERR) { int errCode = 2093; String msg = "Encountered error in package operator while processing group."; throw new ExecException(msg, errCode, PigException.BUG); } }