public Job getJob(Configuration conf, String input, String output)
    throws TupleMRException, IOException {
  // Delete the output path if it already exists so the job can be re-run.
  FileSystem fs = FileSystem.get(conf);
  fs.delete(new Path(output), true);

  // Define the intermediate tuple schema: one (word, count) pair per token.
  List<Field> fields = new ArrayList<Field>();
  fields.add(Field.create("word", Type.STRING));
  fields.add(Field.create("count", Type.INT));
  Schema schema = new Schema("schema", fields);

  TupleMRBuilder cg = new TupleMRBuilder(conf, "Pangool WordCount");
  cg.addIntermediateSchema(schema);
  // Group tuples by the "word" field; each group reaches the reducer together.
  cg.setGroupByFields("word");
  cg.setJarByClass(PangoolWordCount.class);
  // Plain text input, tokenized by the Split mapper.
  cg.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new Split());
  cg.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class),
      Text.class, Text.class);
  cg.setTupleReducer(new Count());
  cg.setTupleCombiner(new CountCombiner());
  return cg.createJob();
}
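The Split mapper and Count reducer referenced above are not shown in this snippet. A minimal sketch of what they might look like, following the shape of Pangool's standard WordCount example (TupleMapper/TupleReducer); the bodies here are an assumption, not the code from this project:

public static class Split extends TupleMapper<LongWritable, Text> {
  private Tuple tuple;

  @Override
  public void setup(TupleMRContext context, Collector collector)
      throws IOException, InterruptedException {
    // Reuse one tuple instance; "count" is always 1 at map time.
    tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema("schema"));
    tuple.set("count", 1);
  }

  @Override
  public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
      throws IOException, InterruptedException {
    // Emit one (word, 1) tuple per token in the input line.
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      tuple.set("word", itr.nextToken());
      collector.write(tuple);
    }
  }
}

public static class Count extends TupleReducer<Text, Text> {
  @Override
  public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
      Collector collector) throws IOException, InterruptedException, TupleMRException {
    // Sum the partial counts for this word group and emit the total.
    int count = 0;
    for (ITuple tuple : tuples) {
      count += (Integer) tuple.get("count");
    }
    collector.write(new Text(group.get("word").toString()), new Text(count + ""));
  }
}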
@Override
public int run(Path outputPath, Map<String, Path> parsedInputs,
    Map<String, Object> parsedParameters) throws Exception {
  mr = new TupleMRBuilder(hadoopConf, getName());
  mr.setJarByClass(this.getClass());
  configure(parsedParameters);

  // Wire up each bound input, registering its intermediate schemas as we go.
  for (Map.Entry<String, RichInput> inputEntry : bindedInputs.entrySet()) {
    RichInput input = inputEntry.getValue();
    String inputName = inputEntry.getKey();
    if (input instanceof HadoopInput) {
      HadoopInput hadoopInput = (HadoopInput) input;
      mr.addInput(parsedInputs.get(inputName), hadoopInput.getFormat(),
          hadoopInput.getProcessor());
      for (Schema schema : hadoopInput.getIntermediateSchemas()) {
        mr.addIntermediateSchema(schema);
      }
    } else if (input instanceof TupleInput) {
      TupleInput tupleInput = (TupleInput) input;
      mr.addTupleInput(parsedInputs.get(inputName), tupleInput.getProcessor());
      for (Schema schema : tupleInput.getIntermediateSchemas()) {
        mr.addIntermediateSchema(schema);
      }
    }
  }

  mr.setTupleReducer(reducer);
  if (combiner != null) {
    mr.setTupleCombiner(combiner);
  }

  // Main job output: either a plain Hadoop output or a native tuple output.
  if (jobOutput instanceof HadoopOutput) {
    HadoopOutput hadoopOutput = (HadoopOutput) jobOutput;
    mr.setOutput(outputPath, hadoopOutput.getOutputFormat(), hadoopOutput.getKey(),
        hadoopOutput.getValue());
  } else if (jobOutput instanceof TupleOutput) {
    TupleOutput tupleOutput = (TupleOutput) jobOutput;
    mr.setTupleOutput(outputPath, tupleOutput.getOutputSchema());
  }

  // Additional named outputs follow the same Hadoop-vs-tuple dispatch.
  for (Map.Entry<String, RichOutput> namedOutputEntry : bindedOutputs.entrySet()) {
    RichOutput output = namedOutputEntry.getValue();
    String outputName = namedOutputEntry.getKey();
    if (output instanceof HadoopOutput) {
      HadoopOutput hadoopOutput = (HadoopOutput) output;
      mr.addNamedOutput(outputName, hadoopOutput.getOutputFormat(), hadoopOutput.getKey(),
          hadoopOutput.getValue());
    } else if (output instanceof TupleOutput) {
      TupleOutput tupleOutput = (TupleOutput) output;
      mr.addNamedTupleOutput(outputName, tupleOutput.getOutputSchema());
    }
  }

  mr.setGroupByFields(groupBy.groupByFields);
  if (orderBy != null) {
    mr.setOrderBy(orderBy);
  }
  return executeCoGrouper(mr);
}
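executeCoGrouper is not shown here. A plausible minimal implementation, assuming its only job is to materialize the configured builder into a Hadoop Job and block until it completes (this helper is hypothetical, not taken from the source):

// Hypothetical helper: build the Job from the configured TupleMRBuilder
// and run it synchronously, mapping success/failure to an exit code.
protected int executeCoGrouper(TupleMRBuilder mr) throws Exception {
  Job job = mr.createJob();
  return job.waitForCompletion(true) ? 0 : 1;
}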