Example #1
  @Test
  public void testBzipStoreInMultiQuery2() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }
Example #2
 private TezJob getJob(TezPlanContainerNode tezPlanNode, TezPlanContainer planContainer)
     throws JobCreationException {
   try {
     Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
     localResources.putAll(planContainer.getLocalResources());
     TezOperPlan tezPlan = tezPlanNode.getTezOperPlan();
     localResources.putAll(tezPlan.getExtraResources());
     String shipFiles = pigContext.getProperties().getProperty("pig.streaming.ship.files");
     if (shipFiles != null) {
       for (String file : shipFiles.split(",")) {
         TezResourceManager.getInstance().addTezResource(new File(file.trim()).toURI());
       }
     }
     String cacheFiles = pigContext.getProperties().getProperty("pig.streaming.cache.files");
     if (cacheFiles != null) {
       addCacheResources(cacheFiles.split(","));
     }
     for (Map.Entry<String, LocalResource> entry : localResources.entrySet()) {
       log.info("Local resource: " + entry.getKey());
     }
     DAG tezDag = buildDAG(tezPlanNode, localResources);
     tezDag.setDAGInfo(createDagInfo(TezScriptState.get().getScript()));
     // set Tez caller context
      // Use reflection for the following block since CallerContext is only available in Tez 0.8.1 and later:
     // CallerContext context = CallerContext.create(ATSService.CallerContext,
     // ATSService.getPigAuditId(pigContext),
     //     ATSService.EntityType, "");
     // tezDag.setCallerContext(context);
      Class<?> callerContextClass = null;
     try {
       callerContextClass = Class.forName("org.apache.tez.client.CallerContext");
     } catch (ClassNotFoundException e) {
       // If pre-Tez 0.8.1, skip setting CallerContext
     }
     if (callerContextClass != null) {
       Method builderBuildMethod =
           callerContextClass.getMethod(
               "create", String.class, String.class, String.class, String.class);
       Object context =
           builderBuildMethod.invoke(
               null,
               PigATSClient.CALLER_CONTEXT,
               PigATSClient.getPigAuditId(pigContext),
               PigATSClient.ENTITY_TYPE,
               "");
       Method dagSetCallerContext =
           tezDag.getClass().getMethod("setCallerContext", context.getClass());
       dagSetCallerContext.invoke(tezDag, context);
     }
     log.info("Total estimated parallelism is " + tezPlan.getEstimatedTotalParallelism());
     return new TezJob(tezConf, tezDag, localResources, tezPlan);
   } catch (Exception e) {
     int errCode = 2017;
     String msg = "Internal error creating job configuration.";
     throw new JobCreationException(msg, errCode, PigException.BUG, e);
   }
 }
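
The addCacheResources(...) helper called above is not shown in this excerpt. Below is a hedged sketch of what it might do, mirroring the ship-files loop; the real Pig implementation may additionally handle "path#symlink" fragments, which is not attempted here.

 // Hypothetical sketch only, not the actual Pig implementation of addCacheResources.
 private void addCacheResources(String[] fileNames) throws Exception {
   for (String fileName : fileNames) {
     if (fileName != null && fileName.trim().length() > 0) {
       // register each cache file with the Tez resource manager, as for the ship files above
       TezResourceManager.getInstance().addTezResource(new File(fileName.trim()).toURI());
     }
   }
 }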
Example #3
  @Test
  public void testDefaultParallel() throws Throwable {
    pc.defaultParallel = 100;

    String query = "a = load '/passwd';" + "b = group a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    checkDefaultParallelResult(pp, pc);

    pc.defaultParallel = -1;
  }
  @Test
  public void testParserWithEscapeCharacters() throws Exception {

    // All the needed variables
    PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());
    PigServer pigServer = new PigServer(pigContext);
    pigContext.connect();

    String tempFile = this.prepareTempFile();

    String query = String.format("A = LOAD '%s' ;", Util.encodeEscape(tempFile));
    // Start the real parsing job
    {
      // Initial statement
      Util.buildLp(pigServer, query);
    }

    {
      // Normal condition
      String q = query + "B = filter A by $0 eq 'This is a test string' ;";
      checkParsedConstContent(pigServer, pigContext, q, "This is a test string");
    }

    {
      // single-quote condition
      String q = query + "B = filter A by $0 eq 'This is a test \\'string' ;";
      checkParsedConstContent(pigServer, pigContext, q, "This is a test 'string");
    }

    {
      // escaping dot
      // the reason we have 4 backslashes below is that we really want two backslashes, but
      // since this is written as a Java String each backslash must itself be escaped with
      // another backslash - hence 4. In a pig script in a file, this would be
      // \\.string
      String q = query + "B = filter A by $0 eq 'This is a test \\\\.string' ;";
      checkParsedConstContent(pigServer, pigContext, q, "This is a test \\.string");
    }

    {
      // newline condition
      String q = query + "B = filter A by $0 eq 'This is a test \\nstring' ;";
      checkParsedConstContent(pigServer, pigContext, q, "This is a test \nstring");
    }

    {
      // Unicode
      String q = query + "B = filter A by $0 eq 'This is a test \\uD30C\\uC774string' ;";
      checkParsedConstContent(pigServer, pigContext, q, "This is a test \uD30C\uC774string");
    }
  }
 private void setDefaultTimeZone() {
   String dtzStr = pc.getProperties().getProperty("pig.datetime.default.tz");
   if (dtzStr != null && dtzStr.length() > 0) {
     currentDTZ = DateTimeZone.getDefault();
     DateTimeZone.setDefault(DateTimeZone.forID(dtzStr));
   }
 }
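
A restoreDefaultTimeZone() counterpart is invoked after each constant evaluation later in this collection; a minimal sketch of it, assuming currentDTZ holds the default saved above:

 private void restoreDefaultTimeZone() {
   // hedged sketch: undo the override made in setDefaultTimeZone()
   if (currentDTZ != null) {
     DateTimeZone.setDefault(currentDTZ);
     currentDTZ = null;
   }
 }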
Example #6
 @BeforeClass
 public static void setUpBeforeClass() throws Exception {
   cluster = MiniCluster.buildCluster();
   pc = new PigContext(ExecType.LOCAL, new Properties());
   pcMR = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
   pc.connect();
 }
  private void checkParsedConstContent(
      PigServer pigServer, PigContext pigContext, String query, String expectedContent)
      throws Exception {
    pigContext.connect();
    LogicalPlan lp = Util.buildLp(pigServer, query + "store B into 'output';");
    // Digging down the tree
    Operator load = lp.getSources().get(0);
    Operator filter = lp.getSuccessors(load).get(0);
    LogicalExpressionPlan comparisonPlan = ((LOFilter) filter).getFilterPlan();
    List<Operator> comparisonPlanRoots = comparisonPlan.getSinks();
    Operator compRootOne = comparisonPlanRoots.get(0);
    Operator compRootTwo = comparisonPlanRoots.get(1);

    // Here is the actual check logic
    if (compRootOne instanceof ConstantExpression) {
      assertTrue(
          "Must be equal",
          ((String) ((ConstantExpression) compRootOne).getValue()).equals(expectedContent));
    }
    // If not left, it must be right.
    else {
      assertTrue(
          "Must be equal",
          ((String) ((ConstantExpression) compRootTwo).getValue()).equals(expectedContent));
    }
  }
Example #8
  @Test
  public void testDefaultParallelInSkewJoin() throws Throwable {
    // default_parallel is considered only at runtime, so here we only test requested parallel
    // more thorough tests can be found in TestNumberOfReducers.java
    String query =
        "a = load 'input';"
            + "b = load 'input';"
            + "c = join a by $0, b by $0 using 'skewed' parallel 100;"
            + "store c into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    // Get the skew join job
    Iterator<MapReduceOper> iter = mrPlan.getKeys().values().iterator();
    int counter = 0;
    while (iter.hasNext()) {
      MapReduceOper op = iter.next();
      counter++;
      if (op.isSkewedJoin()) {
        assertTrue(op.getRequestedParallelism() == 100);
      }
    }
    assertEquals(3, counter);

    pc.defaultParallel = -1;
  }
  @Override
  public StoreFuncInterface createStoreFunc(POStore store) throws IOException {

    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    StoreFuncInterface storeFunc = store.getStoreFunc();
    JobContext jc = HadoopShims.createJobContext(conf, new JobID());

    OutputFormat<?, ?> outputFormat = storeFunc.getOutputFormat();
    PigOutputFormat.setLocation(jc, store);
    context = HadoopShims.createTaskAttemptContext(conf, HadoopShims.getNewTaskAttemptID());
    PigOutputFormat.setLocation(context, store);

    try {
      outputFormat.checkOutputSpecs(jc);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }

    try {
      outputCommitter = outputFormat.getOutputCommitter(context);
      outputCommitter.setupJob(jc);
      outputCommitter.setupTask(context);
      writer = outputFormat.getRecordWriter(context);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
    storeFunc.prepareToWrite(writer);
    return storeFunc;
  }
Example #10
 public static LOLoad newLOLoad(
     FileSpec loader, LogicalSchema schema, LogicalPlan plan, Configuration conf) {
   LoadFunc loadFunc = null;
   if (loader != null) {
     loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loader.getFuncSpec());
   }
   return new LOLoad(loader, schema, plan, conf, loadFunc, "alias_newOperatorKey");
 }
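
A hypothetical call site for this helper, e.g. from a test; the path, the PigStorage loader, and the null schema are illustrative assumptions:

 // Illustrative only: build a FileSpec for PigStorage and attach the load to a fresh plan.
 LogicalPlan plan = new LogicalPlan();
 FileSpec loaderSpec = new FileSpec("/tmp/input1", new FuncSpec(PigStorage.class.getName()));
 LOLoad load = newLOLoad(loaderSpec, null, plan, new Configuration());
 plan.add(load);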
Example #11
 {
   try {
     pigContext.connect();
   } catch (ExecException e) {
      // log and continue; a failed connect will surface when the context is first used
     e.printStackTrace();
   }
 }
    /** Configures the Reduce plan, the POPackage operator and the reporter thread */
    @SuppressWarnings("unchecked")
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      inIllustrator = inIllustrator(context);
      if (inIllustrator) pack = getPack(context);
      Configuration jConf = context.getConfiguration();
      SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
      context
          .getConfiguration()
          .set(
              PigConstants.TASK_INDEX,
              Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
      sJobContext = context;
      sJobConfInternal.set(context.getConfiguration());
      sJobConf = context.getConfiguration();
      try {
        PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve
        // it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
          rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
          pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if (rp.isEmpty()) log.debug("Reduce Plan empty!");
        else {
          ByteArrayOutputStream baos = new ByteArrayOutputStream();
          rp.explain(baos);
          log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
          roots = rp.getRoots().toArray(new PhysicalOperator[1]);
          leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);

      } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
      }

      log.info(
          "Aliases being processed per job phase (AliasName[line,offset]): "
              + jConf.get("pig.alias.location"));

      Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
    }
 static {
   pc = new PigContext();
   try {
     pc.connect();
   } catch (ExecException e) {
      // log and continue; tests using pc will fail later if the connect did not succeed
     e.printStackTrace();
   }
   r = new Random(SEED);
 }
Example #14
 public static void oneTimeSetUp() throws Exception {
   cluster = MiniGenericCluster.buildCluster();
   pc = new PigContext(cluster.getExecType(), cluster.getProperties());
   try {
     pc.connect();
   } catch (ExecException e) {
      // log and continue; tests using pc will fail later if the connect did not succeed
     e.printStackTrace();
   }
   GenPhyOp.setPc(pc);
   Util.copyFromLocalToCluster(cluster, "test/org/apache/pig/test/data/passwd", "/passwd");
 }
Example #15
  @Test
  public void testMergeJoin() throws Exception {
    String query =
        "a = load '/tmp/input1';"
            + "b = load '/tmp/input2';"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into '/tmp/output1';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MRCompiler comp = new MRCompiler(pp, pc);
    comp.compile();
    MROperPlan mrp = comp.getMRPlan();
    assertTrue(mrp.size() == 2);

    MapReduceOper mrOp0 = mrp.getRoots().get(0);
    assertTrue(mrOp0.mapPlan.size() == 2);
    PhysicalOperator load0 = mrOp0.mapPlan.getRoots().get(0);
    MergeJoinIndexer func =
        (MergeJoinIndexer)
            PigContext.instantiateFuncFromSpec(((POLoad) load0).getLFile().getFuncSpec());
    Field lrField = MergeJoinIndexer.class.getDeclaredField("lr");
    lrField.setAccessible(true);
    POLocalRearrange lr = (POLocalRearrange) lrField.get(func);
    List<PhysicalPlan> innerPlans = lr.getPlans();
    PhysicalOperator localrearrange0 = mrOp0.mapPlan.getSuccessors(load0).get(0);
    assertTrue(localrearrange0 instanceof POLocalRearrange);
    assertTrue(mrOp0.reducePlan.size() == 3);
    PhysicalOperator pack0 = mrOp0.reducePlan.getRoots().get(0);
    assertTrue(pack0 instanceof POPackage);
    PhysicalOperator foreach0 = mrOp0.reducePlan.getSuccessors(pack0).get(0);
    assertTrue(foreach0 instanceof POForEach);
    PhysicalOperator store0 = mrOp0.reducePlan.getSuccessors(foreach0).get(0);
    assertTrue(store0 instanceof POStore);

    assertTrue(innerPlans.size() == 1);
    PhysicalPlan innerPlan = innerPlans.get(0);
    assertTrue(innerPlan.size() == 1);
    PhysicalOperator project = innerPlan.getRoots().get(0);
    assertTrue(project instanceof POProject);
    assertTrue(((POProject) project).getColumn() == 0);

    MapReduceOper mrOp1 = mrp.getSuccessors(mrOp0).get(0);
    assertTrue(mrOp1.mapPlan.size() == 3);
    PhysicalOperator load1 = mrOp1.mapPlan.getRoots().get(0);
    assertTrue(load1 instanceof POLoad);
    PhysicalOperator mergejoin1 = mrOp1.mapPlan.getSuccessors(load1).get(0);
    assertTrue(mergejoin1 instanceof POMergeJoin);
    PhysicalOperator store1 = mrOp1.mapPlan.getSuccessors(mergejoin1).get(0);
    assertTrue(store1 instanceof POStore);
    assertTrue(mrOp1.reducePlan.isEmpty());
  }
    /**
     * The reduce function, which packages the key and List&lt;Tuple&gt; into key, Bag&lt;Tuple&gt;
     * after converting the Hadoop-typed key into its Pig type. The package result is collected as
     * is if the reduce plan is empty, or after passing through the reduce plan otherwise.
     */
    @Override
    protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {

      if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator)
          for (POStore store : stores) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
          }
      }

      // In the case we optimize the join, we combine
      // POPackage and POForeach - so we could get many
      // tuples out of the getNext() call of POJoinPackage.
      // In this case, we process till we see EOP from
      // POJoinPackage.getNext()
      if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
          if (processOnePackageOutput(context)) break;
        }
      } else {
        // join is not optimized, so package will
        // give only one tuple out for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
      }
    }
 @Override
 public InputFormat getInputFormat() throws IOException {
   // if not manually set in options string
   if (inputFormatClassName == null) {
     if (loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
       inputFormatClassName = Bzip2TextInputFormat.class.getName();
     } else {
       inputFormatClassName = TextInputFormat.class.getName();
     }
   }
   try {
     return (FileInputFormat) PigContext.resolveClassName(inputFormatClassName).newInstance();
   } catch (InstantiationException e) {
     throw new IOException("Failed creating input format " + inputFormatClassName, e);
   } catch (IllegalAccessException e) {
     throw new IOException("Failed creating input format " + inputFormatClassName, e);
   }
 }
  /**
   * This must be called when the code has been generated and the generated code needs to be shipped
   * to the cluster, so that it may be used by the mappers and reducers.
   *
   * @param pigContext
   * @param conf
   */
  public static void copyAllGeneratedToDistributedCache(PigContext pigContext, Configuration conf) {
    if (stf == null) {
      LOG.debug("Nothing registered to generate.");
      return;
    }
    SchemaTupleFrontendGenHelper stfgh = new SchemaTupleFrontendGenHelper(pigContext, conf);
    stfgh.generateAll(stf.getSchemasToGenerate());
    stfgh.internalCopyAllGeneratedToDistributedCache();

    Properties prop = pigContext.getProperties();
    String value = conf.get(GENERATED_CLASSES_KEY);
    if (value != null) {
      prop.setProperty(GENERATED_CLASSES_KEY, value);
    } else {
      prop.remove(GENERATED_CLASSES_KEY);
    }
    value = conf.get(LOCAL_CODE_DIR);
    if (value != null) {
      prop.setProperty(LOCAL_CODE_DIR, value);
    } else {
      prop.remove(LOCAL_CODE_DIR);
    }
  }
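
A hypothetical call-site sketch: the frontend would invoke this once code generation is complete and before the job configuration is serialized; the enclosing class name SchemaTupleFrontend is assumed here.

  // Illustrative only: ship the generated SchemaTuple classes along with the job.
  Configuration conf = ConfigurationUtil.toConfiguration(pigContext.getProperties());
  SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf);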
      @Override
      protected void execute(LogicalExpression op) throws FrontendException {
        if (op instanceof UserFuncExpression) {
          UserFuncExpression udf = (UserFuncExpression) op;
          if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
          }
        }
        boolean valSet = false;
        Object val = null;
        if (currentWalker.getPlan().getSuccessors(op) != null) {
          // If has successors and all successors are constant, calculate the constant
          for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
              return;
            }
          }
          // All successors are constant, calculate the value
          OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
          ((BaseOperatorPlan) currentWalker.getPlan())
              .moveTree(op, (BaseOperatorPlan) expLogicalPlan);
          PhysicalPlan expPhysicalPlan = new PhysicalPlan();
          Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
          PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

          // Save the old walker and use childWalker as current Walker
          pushWalker(childWalker);
          ExpToPhyTranslationVisitor expTranslationVisitor =
              new ExpToPhyTranslationVisitor(
                  expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
          expTranslationVisitor.visit();
          popWalker();
          PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (ExecException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        } else if (op instanceof UserFuncExpression) {
          // If solo UDF, calculate UDF
          UserFuncExpression udf = (UserFuncExpression) op;
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (IOException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        }
        if (valSet) {
          ConstantExpression constantExpr;
          constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
          constantExpr.inheritSchema(op);
          currentWalker.getPlan().replace(op, constantExpr);
        }
      }
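
For illustration, the kind of query this visitor is meant to fold; whether a given UDF participates depends on its allowCompileTimeCalculation() flag, and CurrentTime is only an assumed example of such a UDF:

      // Illustrative only: if CurrentTime() allows compile-time calculation, the right-hand
      // side of the filter is replaced by a single ConstantExpression before translation.
      String q = "a = load 'input' as (t:datetime);"
          + "b = filter a by t < CurrentTime();"
          + "store b into 'output';";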
Example #20
  @Test
  public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" + "b = order a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);

    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL keyword; it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it applies to non-dfs input or the files don't
    // exist, such as hbase
    query =
        "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);

    // the requested parallelism will be -1 if users set neither default_parallel nor parallel
    // and the estimation doesn't take effect. The MR framework will eventually set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query =
        "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);

    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(
        cluster,
        "test/org/apache/pig/test/data/passwd",
        ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just foreach with projection, mapper-only job, so estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
  }
Example #21
  @Test
  public void testReducerNumEstimation() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    Configuration conf = HBaseConfiguration.create(new Configuration());
    HBaseTestingUtility util = new HBaseTestingUtility(conf);
    int clientPort = util.startMiniZKCluster().getClientPort();
    util.startMiniHBaseCluster(1, 1);

    String query = "a = load '/passwd';" + "b = group a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    pc.getConf().setProperty(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.toString(clientPort));
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jc = jcc.compile(mrPlan, "Test");
    Job job = jc.getWaitingJobs().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);

    Util.assertParallelValues(-1, -1, reducer, reducer, job.getJobConf());

    // use the PARALLEL keyword; it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = group a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, 2, -1, 2, job.getJobConf());

    final byte[] COLUMNFAMILY = Bytes.toBytes("pig");
    util.createTable(Bytes.toBytesBinary("test_table"), COLUMNFAMILY);

    // the estimation won't take effect when it applies to non-dfs input or the files don't
    // exist, such as hbase
    query =
        "a = load 'hbase://test_table' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = group a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");

    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, -1, 1, 1, job.getJobConf());

    util.deleteTable(Bytes.toBytesBinary("test_table"));
    // In HBase 0.90.1 and above we can use util.shutdownMiniHBaseCluster()
    // here instead.
    MiniHBaseCluster hbc = util.getHBaseCluster();
    if (hbc != null) {
      hbc.shutdown();
      hbc.join();
    }
    util.shutdownMiniZKCluster();
  }
    /**
     * This method copies all class files present in the local temp directory to the distributed
     * cache. All copied files will have a symlink of their name. No files will be copied if the
     * current job is being run in local mode.
     */
    private void internalCopyAllGeneratedToDistributedCache() {
      LOG.info("Starting process to move generated code to distributed cacche");
      if (pigContext.getExecType().isLocal()) {
        String codePath = codeDir.getAbsolutePath();
        LOG.info(
            "Distributed cache not supported or needed in local mode. Setting key ["
                + LOCAL_CODE_DIR
                + "] with code temp directory: "
                + codePath);
        conf.set(LOCAL_CODE_DIR, codePath);
        return;
      } else {
        // This lets us avoid an NPE in some of the non-traditional pipelines
        String codePath = codeDir.getAbsolutePath();
        conf.set(LOCAL_CODE_DIR, codePath);
      }
      DistributedCache.createSymlink(conf); // we will read using symlinks
      StringBuilder serialized = new StringBuilder();
      boolean first = true;
      // We attempt to copy over every file in the generated code temp directory
      for (File f : codeDir.listFiles()) {
        if (first) {
          first = false;
        } else {
          serialized.append(",");
        }
        String symlink = f.getName(); // the class name will also be the symlink
        serialized.append(symlink);
        Path src = new Path(f.toURI());
        Path dst;
        try {
          dst = FileLocalizer.getTemporaryPath(pigContext);
        } catch (IOException e) {
          throw new RuntimeException("Error getting temporary path in HDFS", e);
        }
        FileSystem fs;
        try {
          fs = dst.getFileSystem(conf);
        } catch (IOException e) {
          throw new RuntimeException("Unable to get FileSystem", e);
        }
        try {
          fs.copyFromLocalFile(src, dst);
          fs.setReplication(dst, (short) conf.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
        } catch (IOException e) {
          throw new RuntimeException(
              "Unable to copy from local filesystem to HDFS, src = " + src + ", dst = " + dst, e);
        }

        String destination = dst.toString() + "#" + symlink;

        try {
          DistributedCache.addCacheFile(new URI(destination), conf);
        } catch (URISyntaxException e) {
          throw new RuntimeException("Unable to add file to distributed cache: " + destination, e);
        }
        LOG.info("File successfully added to the distributed cache: " + symlink);
      }
      String toSer = serialized.toString();
      LOG.info(
          "Setting key ["
              + GENERATED_CLASSES_KEY
              + "] with classes to deserialize ["
              + toSer
              + "]");
      // we must set a key in the job conf so individual jobs know to resolve the shipped classes
      conf.set(GENERATED_CLASSES_KEY, toSer);
    }
 public DefaultInputHandler(HandleSpec spec) {
   serializer = (PigToStream) PigContext.instantiateFuncFromSpec(spec.spec);
 }
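
A hypothetical construction of this handler; using PigStreaming as the PigToStream serializer in the HandleSpec is an assumption about the common default:

 // Illustrative only: an input handle whose tuples are serialized by PigStreaming.
 DefaultInputHandler handler =
     new DefaultInputHandler(new HandleSpec("stdin", PigStreaming.class.getName()));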
Example #24
  /**
   * The Main-Class for the Pig jar that will provide a shell and set up a classpath appropriate
   * for executing jar files.
   *
   * @param args -jar can be used to add additional jar files (colon separated); a single dash (-)
   *     will start a shell; -e will execute the rest of the command line as if it were input to
   *     the shell.
   */
  public static void main(String args[]) {
    int rc = 1;
    Properties properties = new Properties();
    PropertiesUtil.loadPropertiesFromFile(properties);

    boolean verbose = false;
    boolean gruntCalled = false;
    String logFileName = null;

    try {
      BufferedReader pin = null;
      boolean debug = false;
      boolean dryrun = false;
      ArrayList<String> params = new ArrayList<String>();
      ArrayList<String> paramFiles = new ArrayList<String>();
      HashSet<String> optimizerRules = new HashSet<String>();

      CmdLineParser opts = new CmdLineParser(args);
      opts.registerOpt('4', "log4jconf", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('b', "brief", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('c', "cluster", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('d', "debug", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('e', "execute", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('f', "file", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('h', "help", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('i', "version", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('j', "jar", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('l', "logfile", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('m', "param_file", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('o', "hod", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('p', "param", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('r', "dryrun", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('t', "optimizer_off", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('v', "verbose", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('w', "warning", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('x', "exectype", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('F', "stop_on_failure", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('M', "no_multiquery", CmdLineParser.ValueExpected.NOT_ACCEPTED);

      ExecMode mode = ExecMode.UNKNOWN;
      String file = null;
      ExecType execType = ExecType.MAPREDUCE;
      String execTypeString = properties.getProperty("exectype");
      if (execTypeString != null && execTypeString.length() > 0) {
        execType = PigServer.parseExecType(execTypeString);
      }
      String cluster = "local";
      String clusterConfigured = properties.getProperty("cluster");
      if (clusterConfigured != null && clusterConfigured.length() > 0) {
        cluster = clusterConfigured;
      }

      // by default warning aggregation is on
      properties.setProperty("aggregate.warning", "" + true);

      // by default multiquery optimization is on
      properties.setProperty("opt.multiquery", "" + true);

      // by default we keep going on error on the backend
      properties.setProperty("stop.on.failure", "" + false);

      char opt;
      while ((opt = opts.getNextOpt()) != CmdLineParser.EndOfOpts) {
        switch (opt) {
          case '4':
            String log4jconf = opts.getValStr();
            if (log4jconf != null) {
              properties.setProperty(LOG4J_CONF, log4jconf);
            }
            break;

          case 'b':
            properties.setProperty(BRIEF, "true");
            break;

          case 'c':
            // Needed a way to specify the cluster to run the MR job on
            // Bug 831708 - fixed
            String clusterParameter = opts.getValStr();
            if (clusterParameter != null && clusterParameter.length() > 0) {
              cluster = clusterParameter;
            }
            break;

          case 'd':
            String logLevel = opts.getValStr();
            if (logLevel != null) {
              properties.setProperty(DEBUG, logLevel);
            }
            debug = true;
            break;

          case 'e':
            mode = ExecMode.STRING;
            break;

          case 'f':
            mode = ExecMode.FILE;
            file = opts.getValStr();
            break;

          case 'F':
            properties.setProperty("stop.on.failure", "" + true);
            break;

          case 'h':
            usage();
            return;

          case 'i':
            System.out.println(getVersionString());
            return;

          case 'j':
            String jarsString = opts.getValStr();
            if (jarsString != null) {
              properties.setProperty(JAR, jarsString);
            }
            break;

          case 'l':
            // call to method that validates the path to the log file
            // and sets up the file to store the client side log file
            String logFileParameter = opts.getValStr();
            if (logFileParameter != null && logFileParameter.length() > 0) {
              logFileName = validateLogFile(logFileParameter, null);
            } else {
              logFileName = validateLogFile(logFileName, null);
            }
            properties.setProperty("pig.logfile", logFileName);
            break;

          case 'm':
            paramFiles.add(opts.getValStr());
            break;

          case 'M':
            // turns off multiquery optimization
            properties.setProperty("opt.multiquery", "" + false);
            break;

          case 'o':
            // TODO sgroschupf using system properties is always a very bad idea
            String gateway = System.getProperty("ssh.gateway");
            if (gateway == null || gateway.length() == 0) {
              properties.setProperty("hod.server", "local");
            } else {
              properties.setProperty("hod.server", System.getProperty("ssh.gateway"));
            }
            break;

          case 'p':
            params.add(opts.getValStr());
            break;

          case 'r':
            // currently only used for parameter substitution
            // will be extended in the future
            dryrun = true;
            break;

          case 't':
            optimizerRules.add(opts.getValStr());
            break;

          case 'v':
            properties.setProperty(VERBOSE, "" + true);
            verbose = true;
            break;

          case 'w':
            properties.setProperty("aggregate.warning", "" + false);
            break;

          case 'x':
            try {
              execType = PigServer.parseExecType(opts.getValStr());
            } catch (IOException e) {
              throw new RuntimeException("ERROR: Unrecognized exectype.", e);
            }
            break;
          default:
            {
              Character cc = new Character(opt);
              throw new AssertionError("Unhandled option " + cc.toString());
            }
        }
      }
      // configure logging
      configureLog4J(properties);
      // create the context with the parameter
      PigContext pigContext = new PigContext(execType, properties);

      if (logFileName == null) {
        logFileName = validateLogFile(null, null);
      }

      pigContext.getProperties().setProperty("pig.logfile", logFileName);

      if (optimizerRules.size() > 0) {
        pigContext
            .getProperties()
            .setProperty("pig.optimizer.rules", ObjectSerializer.serialize(optimizerRules));
      }

      LogicalPlanBuilder.classloader = pigContext.createCl(null);

      // construct the parameter substitution preprocessor
      Grunt grunt = null;
      BufferedReader in;
      String substFile = null;
      switch (mode) {
        case FILE:
          {
            // Run, using the provided file as a pig file
            in = new BufferedReader(new FileReader(file));

            // run parameter substitution preprocessor first
            substFile = file + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
              log.info("Dry run completed. Substituted pig script is at " + substFile);
              return;
            }

            logFileName = validateLogFile(logFileName, file);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            // Set job name based on name of the script
            pigContext
                .getProperties()
                .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(file).getName());

            if (!debug) {
              new File(substFile).deleteOnExit();
            }

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
          }

        case STRING:
          {
            // Gather up all the remaining arguments into a string and pass them into
            // grunt.
            StringBuffer sb = new StringBuffer();
            String remainders[] = opts.getRemainingArgs();
            for (int i = 0; i < remainders.length; i++) {
              if (i != 0) sb.append(' ');
              sb.append(remainders[i]);
            }
            in = new BufferedReader(new StringReader(sb.toString()));
            grunt = new Grunt(in, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
          }

        default:
          break;
      }

      // If we're here, we don't know yet what they want.  They may have just
      // given us a jar to execute, they might have given us a pig script to
      // execute, or they might have given us a dash (or nothing) which means to
      // run grunt interactive.
      String remainders[] = opts.getRemainingArgs();
      if (remainders == null) {
        // Interactive
        mode = ExecMode.SHELL;
        ConsoleReader reader = new ConsoleReader(System.in, new OutputStreamWriter(System.out));
        reader.setDefaultPrompt("grunt> ");
        final String HISTORYFILE = ".pig_history";
        String historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE;
        reader.setHistory(new History(new File(historyFile)));
        ConsoleReaderInputStream inputStream = new ConsoleReaderInputStream(reader);
        grunt = new Grunt(new BufferedReader(new InputStreamReader(inputStream)), pigContext);
        grunt.setConsoleReader(reader);
        gruntCalled = true;
        grunt.run();
        rc = 0;
        return;
      } else {
        // They have a pig script they want us to run.
        if (remainders.length > 1) {
          throw new RuntimeException(
              "You can only run one pig script " + "at a time from the command line.");
        }
        mode = ExecMode.FILE;
        in = new BufferedReader(new FileReader(remainders[0]));

        // run parameter substitution preprocessor first
        substFile = remainders[0] + ".substituted";
        pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
        if (dryrun) {
          log.info("Dry run completed. Substituted pig script is at " + substFile);
          return;
        }

        logFileName = validateLogFile(logFileName, remainders[0]);
        pigContext.getProperties().setProperty("pig.logfile", logFileName);

        if (!debug) {
          new File(substFile).deleteOnExit();
        }

        // Set job name based on name of the script
        pigContext
            .getProperties()
            .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(remainders[0]).getName());

        grunt = new Grunt(pin, pigContext);
        gruntCalled = true;
        int[] results = grunt.exec();
        rc = getReturnCodeForStats(results);
        return;
      }

      // Per Utkarsh and Chris, invocation of jar files via pig is deprecated.
    } catch (ParseException e) {
      usage();
      rc = 2;
    } catch (NumberFormatException e) {
      usage();
      rc = 2;
    } catch (PigException pe) {
      if (pe.retriable()) {
        rc = 1;
      } else {
        rc = 2;
      }

      if (!gruntCalled) {
        LogUtils.writeLog(pe, logFileName, log, verbose);
      }
    } catch (Throwable e) {
      rc = 2;
      if (!gruntCalled) {
        LogUtils.writeLog(e, logFileName, log, verbose);
      }
    } finally {
      // clear temp files
      FileLocalizer.deleteTempFiles();
      PerformanceTimerFactory.getPerfTimerFactory().dumpTimers();
      System.exit(rc);
    }
  }