/** Given two vertices a, b update their configurations to be used in an Edge a-b */ public void updateConfigurationForEdge(JobConf vConf, Vertex v, JobConf wConf, Vertex w) throws IOException { // Tez needs to setup output subsequent input pairs correctly MultiStageMRConfToTezTranslator.translateVertexConfToTez(wConf, vConf); // update payloads (configuration for the vertices might have changed) v.getProcessorDescriptor().setUserPayload(MRHelpers.createUserPayloadFromConf(vConf)); w.getProcessorDescriptor().setUserPayload(MRHelpers.createUserPayloadFromConf(wConf)); }
/* * Helper function to create Vertex for given ReduceWork. */ private Vertex createVertex( JobConf conf, ReduceWork reduceWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx) throws Exception { // set up operator plan Utilities.setReduceWork(conf, reduceWork, mrScratchDir, false); // create the directories FileSinkOperators need Utilities.createTmpDirs(conf, reduceWork); // Call once here, will be updated when we find edges MultiStageMRConfToTezTranslator.translateVertexConfToTez(conf, null); // create the vertex Vertex reducer = new Vertex( reduceWork.getName(), new ProcessorDescriptor(ReduceTezProcessor.class.getName()) .setUserPayload(MRHelpers.createUserPayloadFromConf(conf)), reduceWork.getNumReduceTasks(), getContainerResource(conf)); Map<String, String> environment = new HashMap<String, String>(); MRHelpers.updateEnvironmentForMRTasks(conf, environment, false); reducer.setTaskEnvironment(environment); reducer.setJavaOpts(getContainerJavaOpts(conf)); Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); localResources.put(getBaseName(appJarLr), appJarLr); for (LocalResource lr : additionalLr) { localResources.put(getBaseName(lr), lr); } reducer.setTaskLocalResources(localResources); return reducer; }
/*
 * Helper function to create Vertex from MapWork.
 *
 * Serializes the map-side operator plan into conf, chooses the input format
 * and split-generation strategy (AM-side vs. client-side; Tez-grouped vs.
 * plain), then builds the Vertex with its processor payload, environment,
 * JVM options, MRInput and local resources.
 *
 * NOTE(review): the order of operations matters — all conf mutations
 * (setClass/setBoolean below) must happen before createUserPayloadFromConf
 * serializes the conf into the vertex/input payloads.
 */
private Vertex createVertex(JobConf conf, MapWork mapWork, LocalResource appJarLr,
    List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx,
    TezWork tezWork) throws Exception {

  Path tezDir = getTezDir(mrScratchDir);

  // set up the operator plan
  Utilities.setMapWork(conf, mapWork, mrScratchDir, false);

  // create the directories FileSinkOperators need
  Utilities.createTmpDirs(conf, mapWork);

  // Tez ask us to call this even if there's no preceding vertex
  MultiStageMRConfToTezTranslator.translateVertexConfToTez(conf, null);

  // finally create the vertex
  Vertex map = null;

  // use tez to combine splits
  boolean useTezGroupedSplits = false;

  // numTasks stays -1 when splits are generated in the AM — presumably the
  // task count is then determined at run time; TODO confirm against Tez docs.
  int numTasks = -1;
  Class amSplitGeneratorClass = null;
  InputSplitInfo inputSplitInfo = null;
  Class inputFormatClass = conf.getClass("mapred.input.format.class",
      InputFormat.class);

  // A CUSTOM_EDGE from any parent means this vertex receives custom input.
  boolean vertexHasCustomInput = false;
  if (tezWork != null) {
    for (BaseWork baseWork : tezWork.getParents(mapWork)) {
      if (tezWork.getEdgeType(baseWork, mapWork) == EdgeType.CUSTOM_EDGE) {
        vertexHasCustomInput = true;
      }
    }
  }
  if (vertexHasCustomInput) {
    useTezGroupedSplits = false;
    // grouping happens in execution phase. Setting the class to TezGroupedSplitsInputFormat
    // here would cause pre-mature grouping which would be incorrect.
    inputFormatClass = HiveInputFormat.class;
    conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
    // this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
  } else {
    // we'll set up tez to combine spits for us iff the input format
    // is HiveInputFormat
    if (inputFormatClass == HiveInputFormat.class) {
      useTezGroupedSplits = true;
      conf.setClass("mapred.input.format.class",
          TezGroupedSplitsInputFormat.class, InputFormat.class);
    }
  }

  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
    // if we're generating the splits in the AM, we just need to set
    // the correct plugin.
    amSplitGeneratorClass = MRInputAMSplitGenerator.class;
  } else {
    // client side split generation means we have to compute them now
    inputSplitInfo = MRHelpers.generateInputSplits(conf,
        new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")));
    numTasks = inputSplitInfo.getNumTasks();
  }

  // Serialize the (now fully mutated) conf; it is reused for the MRInput payload.
  byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
  map = new Vertex(mapWork.getName(),
      new ProcessorDescriptor(MapTezProcessor.class.getName()).
          setUserPayload(serializedConf),
      numTasks, getContainerResource(conf));
  Map<String, String> environment = new HashMap<String, String>();
  MRHelpers.updateEnvironmentForMRTasks(conf, environment, true);
  map.setTaskEnvironment(environment);
  map.setJavaOpts(getContainerJavaOpts(conf));

  // The map work is expected to carry exactly one alias.
  assert mapWork.getAliasToWork().keySet().size() == 1;

  String alias = mapWork.getAliasToWork().keySet().iterator().next();

  // Build the MRInput payload; with grouped splits the grouping wraps
  // HiveInputFormat, otherwise the plain payload is used.
  byte[] mrInput = null;
  if (useTezGroupedSplits) {
    mrInput = MRHelpers.createMRInputPayloadWithGrouping(serializedConf,
        HiveInputFormat.class.getName());
  } else {
    mrInput = MRHelpers.createMRInputPayload(serializedConf, null);
  }
  map.addInput(alias,
      new InputDescriptor(MRInputLegacy.class.getName()).
          setUserPayload(mrInput),
      amSplitGeneratorClass);

  // Localize the application jar plus any additional resources.
  Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
  localResources.put(getBaseName(appJarLr), appJarLr);
  for (LocalResource lr : additionalLr) {
    localResources.put(getBaseName(lr), lr);
  }

  if (inputSplitInfo != null) {
    // only relevant for client-side split generation
    // NOTE(review): uses FileSystem.get(conf) rather than the 'fs' parameter
    // (which is unused here) — verify whether they are intended to differ.
    map.setTaskLocationsHint(inputSplitInfo.getTaskLocationHints());
    MRHelpers.updateLocalResourcesForInputSplits(FileSystem.get(conf),
        inputSplitInfo, localResources);
  }

  map.setTaskLocalResources(localResources);
  return map;
}