public Job getJob(Configuration conf, String input, String output)
      throws TupleMRException, IOException {
    // Delete any previous output so the job starts from a clean path
    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(output), true);

    // Intermediate tuple schema: one (word, count) pair per tuple
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("word", Type.STRING));
    fields.add(Field.create("count", Type.INT));
    Schema schema = new Schema("schema", fields);

    // Configure the Pangool job: read plain text with the Split mapper,
    // group tuples by "word", and aggregate with the Count reducer/combiner
    TupleMRBuilder cg = new TupleMRBuilder(conf, "Pangool WordCount");
    cg.addIntermediateSchema(schema);
    cg.setGroupByFields("word");
    cg.setJarByClass(PangoolWordCount.class);
    cg.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new Split());
    cg.setOutput(
        new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, Text.class);
    cg.setTupleReducer(new Count());
    cg.setTupleCombiner(new CountCombiner());

    return cg.createJob();
  }
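
For context, the Job returned above can be submitted like any other Hadoop job. A minimal driver sketch, assuming getJob lives in the PangoolWordCount class referenced by setJarByClass and that input/output paths come from the command line:

public static void main(String[] args) throws Exception {
  // Hypothetical driver; only the standard Hadoop Job submission API is taken as given
  Configuration conf = new Configuration();
  Job job = new PangoolWordCount().getJob(conf, args[0], args[1]);
  // Run synchronously; exit non-zero if the job fails
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}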
Example #2
  @Override
  public int run(
      Path outputPath, Map<String, Path> parsedInputs, Map<String, Object> parsedParameters)
      throws Exception {

    // Build the Pangool job and apply any user-supplied parameters
    mr = new TupleMRBuilder(hadoopConf, getName());
    mr.setJarByClass(this.getClass());

    configure(parsedParameters);

    // Register every bound input together with its intermediate schemas
    for (Map.Entry<String, RichInput> inputEntry : bindedInputs.entrySet()) {
      RichInput input = inputEntry.getValue();
      String inputName = inputEntry.getKey();
      if (input instanceof HadoopInput) {
        HadoopInput hadoopInput = (HadoopInput) input;
        mr.addInput(
            parsedInputs.get(inputName), hadoopInput.getFormat(), hadoopInput.getProcessor());
        for (Schema schema : hadoopInput.getIntermediateSchemas()) {
          mr.addIntermediateSchema(schema);
        }
      } else if (input instanceof TupleInput) {
        TupleInput tupleInput = (TupleInput) input;
        mr.addTupleInput(parsedInputs.get(inputName), tupleInput.getProcessor());
        for (Schema schema : tupleInput.getIntermediateSchemas()) {
          mr.addIntermediateSchema(schema);
        }
      }
    }

    // The reducer is mandatory; the combiner is optional
    mr.setTupleReducer(reducer);
    if (combiner != null) {
      mr.setTupleCombiner(combiner);
    }

    // Main job output: either a plain Hadoop output or a Pangool tuple output
    if (jobOutput instanceof HadoopOutput) {
      HadoopOutput hadoopOutput = (HadoopOutput) jobOutput;
      mr.setOutput(
          outputPath,
          hadoopOutput.getOutputFormat(),
          hadoopOutput.getKey(),
          hadoopOutput.getValue());
    } else if (jobOutput instanceof TupleOutput) {
      TupleOutput tupleOutput = (TupleOutput) jobOutput;
      mr.setTupleOutput(outputPath, tupleOutput.getOutputSchema());
    }

    // Additional named outputs, handled analogously to the main output
    for (Map.Entry<String, RichOutput> namedOutputEntry : bindedOutputs.entrySet()) {
      RichOutput output = namedOutputEntry.getValue();
      String outputName = namedOutputEntry.getKey();
      if (output instanceof HadoopOutput) {
        HadoopOutput hadoopOutput = (HadoopOutput) output;
        mr.addNamedOutput(
            outputName,
            hadoopOutput.getOutputFormat(),
            hadoopOutput.getKey(),
            hadoopOutput.getValue());
      } else if (output instanceof TupleOutput) {
        TupleOutput tupleOutput = (TupleOutput) output;
        mr.addNamedTupleOutput(outputName, tupleOutput.getOutputSchema());
      }
    }

    // Grouping is required; a secondary sort order is optional
    mr.setGroupByFields(groupBy.groupByFields);
    if (orderBy != null) {
      mr.setOrderBy(orderBy);
    }

    return executeCoGrouper(mr);
  }
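
executeCoGrouper is not shown in this snippet. A minimal sketch of what such a helper could do, assuming it only needs to build the job and run it synchronously (createJob is the same TupleMRBuilder call used in the first example; waitForCompletion is the standard Hadoop Job API):

// Hypothetical helper; the real executeCoGrouper may do more (cleanup, logging, etc.)
protected int executeCoGrouper(TupleMRBuilder builder) throws Exception {
  Job job = builder.createJob();
  // Translate success/failure into the exit code returned by run()
  return job.waitForCompletion(true) ? 0 : -1;
}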