Example 1
  /**
   * Stolen from JobControlCompiler. TODO: refactor it to share this code.
   *
   * @param physicalPlan the physical plan containing the load operator
   * @param poLoad the load operator whose input should be configured
   * @param jobConf the job configuration to populate
   * @return the same JobConf, with the Pig input properties set
   * @throws java.io.IOException if serializing the input metadata fails
   */
  private static JobConf configureLoader(PhysicalPlan physicalPlan, POLoad poLoad, JobConf jobConf)
      throws IOException {

    // This part does not seem to be needed
    Job job = new Job(jobConf);
    LoadFunc loadFunc = poLoad.getLoadFunc();
    loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

    // stolen from JobControlCompiler
    ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
    // Store the input FileSpecs
    pigInputs.add(poLoad.getLFile());

    ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
    ArrayList<String> inpSignatures = Lists.newArrayList();
    ArrayList<Long> inpLimits = Lists.newArrayList();

    // Store the target operators for tuples read
    // from this input
    List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
    List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
    if (loadSuccessors != null) {
      for (PhysicalOperator loadSuccessor : loadSuccessors) {
        loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
      }
    }

    inpTargets.add(loadSuccessorsKeys);
    inpSignatures.add(poLoad.getSignature());
    inpLimits.add(poLoad.getLimit());

    jobConf.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
    jobConf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
    jobConf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
    jobConf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));

    return jobConf;
  }
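
The four properties set above are just serialized strings inside the JobConf, so the task side has to read them back with the matching deserialize call. Below is a minimal sketch of that read-back, assuming org.apache.pig.impl.util.ObjectSerializer.deserialize (the counterpart of the serialize calls used above) and a hypothetical helper name readPigInputs; where exactly Pig performs this (e.g. inside PigInputFormat) is not shown in these examples.

  // Hypothetical counterpart of the jobConf.set(...) calls in configureLoader:
  // reads the serialized FileSpec list back out of the configuration.
  // Assumes org.apache.hadoop.conf.Configuration and java.util imports in scope.
  @SuppressWarnings("unchecked")
  private static List<FileSpec> readPigInputs(Configuration conf) throws IOException {
    String serialized = conf.get("pig.inputs");
    if (serialized == null) {
      return new ArrayList<FileSpec>();
    }
    // ObjectSerializer.deserialize reverses ObjectSerializer.serialize
    return (List<FileSpec>) ObjectSerializer.deserialize(serialized);
  }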
Example 2
  /**
   * Converts a POLoad into an RDD of Tuples by creating a NewHadoopRDD
   * backed by PigInputFormat.
   */
  @Override
  public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad) throws IOException {
    if (!predecessorRdds.isEmpty()) {
      throw new RuntimeException(
          "Should not have predecessorRdds for Load. Got: " + predecessorRdds);
    }

    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);

    RDD<Tuple2<Text, Tuple>> hadoopRDD =
        sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(),
            PigInputFormat.class, // InputFormat class
            Text.class, // K class
            Tuple.class, // V class
            loadJobConf);

    // map to get just RDD<Tuple>
    return hadoopRDD.map(TO_TUPLE_FUNCTION, ScalaUtil.getClassTag(Tuple.class));
  }
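
TO_TUPLE_FUNCTION and ScalaUtil.getClassTag are not shown in these examples; the map step only needs to drop the Text key and keep the Tuple value. A minimal sketch of what that function might look like, assuming it is a serializable scala.runtime.AbstractFunction1 (the class name ToTupleFunction is an assumption):

  // Hypothetical shape of TO_TUPLE_FUNCTION: projects the Tuple value out of
  // each (Text, Tuple) pair produced by PigInputFormat. Must be Serializable
  // so Spark can ship it to executors.
  private static class ToTupleFunction
      extends scala.runtime.AbstractFunction1<Tuple2<Text, Tuple>, Tuple>
      implements java.io.Serializable {
    @Override
    public Tuple apply(Tuple2<Text, Tuple> v) {
      return v._2(); // keep only the value; the key is unused downstream
    }
  }

  private static final ToTupleFunction TO_TUPLE_FUNCTION = new ToTupleFunction();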