예제 #1
0
 @Before
 public void setUp() throws Exception {
   curDir = System.getProperty("user.dir");
   inpDir = curDir + File.separatorChar + "test/org/apache/pig/test/data/InputFiles/";
   golDir = curDir + File.separatorChar + "test/org/apache/pig/test/data/GoldenFiles/";
   if (Util.WINDOWS) {
     inpDir = "/" + FileLocalizer.parseCygPath(inpDir, FileLocalizer.STYLE_WINDOWS);
     golDir = "/" + FileLocalizer.parseCygPath(golDir, FileLocalizer.STYLE_WINDOWS);
   }
 }
예제 #2
0
파일: TestMRCompiler.java 프로젝트: scr/pig
 @Before
 public void setUp() throws ExecException {
   GenPhyOp.setR(r);
   GenPhyOp.setPc(pc);
   // Set random seed to generate deterministic temporary paths
   FileLocalizer.setR(new Random(1331L));
   NodeIdGenerator.reset("");
   pigServer = new PigServer(pc);
   pigServerMR = new PigServer(pcMR);
 }
예제 #3
0
  /**
   * . Given a path, which may represent a glob pattern, a directory, comma separated files/glob
   * patterns or a file, this method finds the set of relevant metadata files on the storage system.
   * The algorithm for finding the metadata file is as follows:
   *
   * <p>For each object represented by the path (either directly, or via a glob): If object is a
   * directory, and path/metaname exists, use that as the metadata file. Else if parentPath/metaname
   * exists, use that as the metadata file.
   *
   * <p>Resolving conflicts, merging the metadata, etc, is not handled by this method and should be
   * taken care of by downstream code.
   *
   * <p>
   *
   * @param path Path, as passed in to a LoadFunc (may be a Hadoop glob)
   * @param metaname Metadata file designation, such as .pig_schema or .pig_stats
   * @param conf configuration object
   * @return Set of element descriptors for all metadata files associated with the files on the
   *     path.
   */
  protected Set<ElementDescriptor> findMetaFile(String path, String metaname, Configuration conf)
      throws IOException {
    Set<ElementDescriptor> metaFileSet = new HashSet<ElementDescriptor>();
    String[] locations = LoadFunc.getPathStrings(path);
    for (String loc : locations) {
      DataStorage storage;

      storage = new HDataStorage(new Path(loc).toUri(), ConfigurationUtil.toProperties(conf));

      String fullPath = FileLocalizer.fullPath(loc, storage);

      if (storage.isContainer(fullPath)) {
        ElementDescriptor metaFilePath = storage.asElement(fullPath, metaname);
        if (exists(metaFilePath)) {
          metaFileSet.add(metaFilePath);
        }
      } else {
        ElementDescriptor[] descriptors = storage.asCollection(loc);
        for (ElementDescriptor descriptor : descriptors) {
          ContainerDescriptor container = null;

          if (descriptor instanceof HFile) {
            Path descriptorPath = ((HPath) descriptor).getPath();
            Path parent = descriptorPath.getParent();
            container = new HDirectory((HDataStorage) storage, parent);
          } else { // descriptor instanceof HDirectory
            container = (HDirectory) descriptor;
          }

          // if no custom schema, try the parent directory
          ElementDescriptor metaFilePath = storage.asElement(container, metaname);
          if (exists(metaFilePath)) {
            metaFileSet.add(metaFilePath);
          }
        }
      }
    }
    return metaFileSet;
  }
예제 #4
0
  @Test
  public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez use a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" + "b = order a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);

    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL key word, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it apply to non-dfs or the files doesn't exist, such as
    // hbase
    query =
        "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);

    // the requested parallel will be -1 if users don't set any of default_parallel, paralllel
    // and the estimation doesn't take effect. MR framework will finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query =
        "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);

    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(
        cluster,
        "test/org/apache/pig/test/data/passwd",
        ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just foreach with projection, mapper-only job, so estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
  }
예제 #5
0
  /**
   * The Main-Class for the Pig Jar that will provide a shell and setup a classpath appropriate for
   * executing Jar files.
   *
   * @param args -jar can be used to add additional jar files (colon separated). - will start a
   *     shell. -e will execute the rest of the command line as if it was input to the shell.
   * @throws IOException
   */
  public static void main(String args[]) {
    int rc = 1;
    Properties properties = new Properties();
    PropertiesUtil.loadPropertiesFromFile(properties);

    boolean verbose = false;
    boolean gruntCalled = false;
    String logFileName = null;

    try {
      BufferedReader pin = null;
      boolean debug = false;
      boolean dryrun = false;
      ArrayList<String> params = new ArrayList<String>();
      ArrayList<String> paramFiles = new ArrayList<String>();
      HashSet<String> optimizerRules = new HashSet<String>();

      CmdLineParser opts = new CmdLineParser(args);
      opts.registerOpt('4', "log4jconf", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('b', "brief", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('c', "cluster", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('d', "debug", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('e', "execute", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('f', "file", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('h', "help", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('i', "version", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('j', "jar", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('l', "logfile", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('m', "param_file", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('o', "hod", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('p', "param", CmdLineParser.ValueExpected.OPTIONAL);
      opts.registerOpt('r', "dryrun", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('t', "optimizer_off", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('v', "verbose", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('w', "warning", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('x', "exectype", CmdLineParser.ValueExpected.REQUIRED);
      opts.registerOpt('F', "stop_on_failure", CmdLineParser.ValueExpected.NOT_ACCEPTED);
      opts.registerOpt('M', "no_multiquery", CmdLineParser.ValueExpected.NOT_ACCEPTED);

      ExecMode mode = ExecMode.UNKNOWN;
      String file = null;
      ExecType execType = ExecType.MAPREDUCE;
      String execTypeString = properties.getProperty("exectype");
      if (execTypeString != null && execTypeString.length() > 0) {
        execType = PigServer.parseExecType(execTypeString);
      }
      String cluster = "local";
      String clusterConfigured = properties.getProperty("cluster");
      if (clusterConfigured != null && clusterConfigured.length() > 0) {
        cluster = clusterConfigured;
      }

      // by default warning aggregation is on
      properties.setProperty("aggregate.warning", "" + true);

      // by default multiquery optimization is on
      properties.setProperty("opt.multiquery", "" + true);

      // by default we keep going on error on the backend
      properties.setProperty("stop.on.failure", "" + false);

      char opt;
      while ((opt = opts.getNextOpt()) != CmdLineParser.EndOfOpts) {
        switch (opt) {
          case '4':
            String log4jconf = opts.getValStr();
            if (log4jconf != null) {
              properties.setProperty(LOG4J_CONF, log4jconf);
            }
            break;

          case 'b':
            properties.setProperty(BRIEF, "true");
            break;

          case 'c':
            // Needed away to specify the cluster to run the MR job on
            // Bug 831708 - fixed
            String clusterParameter = opts.getValStr();
            if (clusterParameter != null && clusterParameter.length() > 0) {
              cluster = clusterParameter;
            }
            break;

          case 'd':
            String logLevel = opts.getValStr();
            if (logLevel != null) {
              properties.setProperty(DEBUG, logLevel);
            }
            debug = true;
            break;

          case 'e':
            mode = ExecMode.STRING;
            break;

          case 'f':
            mode = ExecMode.FILE;
            file = opts.getValStr();
            break;

          case 'F':
            properties.setProperty("stop.on.failure", "" + true);
            break;

          case 'h':
            usage();
            return;

          case 'i':
            System.out.println(getVersionString());
            return;

          case 'j':
            String jarsString = opts.getValStr();
            if (jarsString != null) {
              properties.setProperty(JAR, jarsString);
            }
            break;

          case 'l':
            // call to method that validates the path to the log file
            // and sets up the file to store the client side log file
            String logFileParameter = opts.getValStr();
            if (logFileParameter != null && logFileParameter.length() > 0) {
              logFileName = validateLogFile(logFileParameter, null);
            } else {
              logFileName = validateLogFile(logFileName, null);
            }
            properties.setProperty("pig.logfile", logFileName);
            break;

          case 'm':
            paramFiles.add(opts.getValStr());
            break;

          case 'M':
            // turns off multiquery optimization
            properties.setProperty("opt.multiquery", "" + false);
            break;

          case 'o':
            // TODO sgroschupf using system properties is always a very bad idea
            String gateway = System.getProperty("ssh.gateway");
            if (gateway == null || gateway.length() == 0) {
              properties.setProperty("hod.server", "local");
            } else {
              properties.setProperty("hod.server", System.getProperty("ssh.gateway"));
            }
            break;

          case 'p':
            String val = opts.getValStr();
            params.add(opts.getValStr());
            break;

          case 'r':
            // currently only used for parameter substitution
            // will be extended in the future
            dryrun = true;
            break;

          case 't':
            optimizerRules.add(opts.getValStr());
            break;

          case 'v':
            properties.setProperty(VERBOSE, "" + true);
            verbose = true;
            break;

          case 'w':
            properties.setProperty("aggregate.warning", "" + false);
            break;

          case 'x':
            try {
              execType = PigServer.parseExecType(opts.getValStr());
            } catch (IOException e) {
              throw new RuntimeException("ERROR: Unrecognized exectype.", e);
            }
            break;
          default:
            {
              Character cc = new Character(opt);
              throw new AssertionError("Unhandled option " + cc.toString());
            }
        }
      }
      // configure logging
      configureLog4J(properties);
      // create the context with the parameter
      PigContext pigContext = new PigContext(execType, properties);

      if (logFileName == null) {
        logFileName = validateLogFile(null, null);
      }

      pigContext.getProperties().setProperty("pig.logfile", logFileName);

      if (optimizerRules.size() > 0) {
        pigContext
            .getProperties()
            .setProperty("pig.optimizer.rules", ObjectSerializer.serialize(optimizerRules));
      }

      LogicalPlanBuilder.classloader = pigContext.createCl(null);

      // construct the parameter substitution preprocessor
      Grunt grunt = null;
      BufferedReader in;
      String substFile = null;
      switch (mode) {
        case FILE:
          {
            // Run, using the provided file as a pig file
            in = new BufferedReader(new FileReader(file));

            // run parameter substitution preprocessor first
            substFile = file + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
              log.info("Dry run completed. Substituted pig script is at " + substFile);
              return;
            }

            logFileName = validateLogFile(logFileName, file);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            // Set job name based on name of the script
            pigContext
                .getProperties()
                .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(file).getName());

            if (!debug) {
              new File(substFile).deleteOnExit();
            }

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
          }

        case STRING:
          {
            // Gather up all the remaining arguments into a string and pass them into
            // grunt.
            StringBuffer sb = new StringBuffer();
            String remainders[] = opts.getRemainingArgs();
            for (int i = 0; i < remainders.length; i++) {
              if (i != 0) sb.append(' ');
              sb.append(remainders[i]);
            }
            in = new BufferedReader(new StringReader(sb.toString()));
            grunt = new Grunt(in, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
          }

        default:
          break;
      }

      // If we're here, we don't know yet what they want.  They may have just
      // given us a jar to execute, they might have given us a pig script to
      // execute, or they might have given us a dash (or nothing) which means to
      // run grunt interactive.
      String remainders[] = opts.getRemainingArgs();
      if (remainders == null) {
        // Interactive
        mode = ExecMode.SHELL;
        ConsoleReader reader = new ConsoleReader(System.in, new OutputStreamWriter(System.out));
        reader.setDefaultPrompt("grunt> ");
        final String HISTORYFILE = ".pig_history";
        String historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE;
        reader.setHistory(new History(new File(historyFile)));
        ConsoleReaderInputStream inputStream = new ConsoleReaderInputStream(reader);
        grunt = new Grunt(new BufferedReader(new InputStreamReader(inputStream)), pigContext);
        grunt.setConsoleReader(reader);
        gruntCalled = true;
        grunt.run();
        rc = 0;
        return;
      } else {
        // They have a pig script they want us to run.
        if (remainders.length > 1) {
          throw new RuntimeException(
              "You can only run one pig script " + "at a time from the command line.");
        }
        mode = ExecMode.FILE;
        in = new BufferedReader(new FileReader(remainders[0]));

        // run parameter substitution preprocessor first
        substFile = remainders[0] + ".substituted";
        pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
        if (dryrun) {
          log.info("Dry run completed. Substituted pig script is at " + substFile);
          return;
        }

        logFileName = validateLogFile(logFileName, remainders[0]);
        pigContext.getProperties().setProperty("pig.logfile", logFileName);

        if (!debug) {
          new File(substFile).deleteOnExit();
        }

        // Set job name based on name of the script
        pigContext
            .getProperties()
            .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(remainders[0]).getName());

        grunt = new Grunt(pin, pigContext);
        gruntCalled = true;
        int[] results = grunt.exec();
        rc = getReturnCodeForStats(results);
        return;
      }

      // Per Utkarsh and Chris invocation of jar file via pig depricated.
    } catch (ParseException e) {
      usage();
      rc = 2;
    } catch (NumberFormatException e) {
      usage();
      rc = 2;
    } catch (PigException pe) {
      if (pe.retriable()) {
        rc = 1;
      } else {
        rc = 2;
      }

      if (!gruntCalled) {
        LogUtils.writeLog(pe, logFileName, log, verbose);
      }
    } catch (Throwable e) {
      rc = 2;
      if (!gruntCalled) {
        LogUtils.writeLog(e, logFileName, log, verbose);
      }
    } finally {
      // clear temp files
      FileLocalizer.deleteTempFiles();
      PerformanceTimerFactory.getPerfTimerFactory().dumpTimers();
      System.exit(rc);
    }
  }
    /**
     * This method copies all class files present in the local temp directory to the distributed
     * cache. All copied files will have a symlink of their name. No files will be copied if the
     * current job is being run from local mode.
     *
     * @param pigContext
     * @param conf
     */
    private void internalCopyAllGeneratedToDistributedCache() {
      LOG.info("Starting process to move generated code to distributed cacche");
      if (pigContext.getExecType().isLocal()) {
        String codePath = codeDir.getAbsolutePath();
        LOG.info(
            "Distributed cache not supported or needed in local mode. Setting key ["
                + LOCAL_CODE_DIR
                + "] with code temp directory: "
                + codePath);
        conf.set(LOCAL_CODE_DIR, codePath);
        return;
      } else {
        // This let's us avoid NPE in some of the non-traditional pipelines
        String codePath = codeDir.getAbsolutePath();
        conf.set(LOCAL_CODE_DIR, codePath);
      }
      DistributedCache.createSymlink(conf); // we will read using symlinks
      StringBuilder serialized = new StringBuilder();
      boolean first = true;
      // We attempt to copy over every file in the generated code temp directory
      for (File f : codeDir.listFiles()) {
        if (first) {
          first = false;
        } else {
          serialized.append(",");
        }
        String symlink = f.getName(); // the class name will also be the symlink
        serialized.append(symlink);
        Path src = new Path(f.toURI());
        Path dst;
        try {
          dst = FileLocalizer.getTemporaryPath(pigContext);
        } catch (IOException e) {
          throw new RuntimeException("Error getting temporary path in HDFS", e);
        }
        FileSystem fs;
        try {
          fs = dst.getFileSystem(conf);
        } catch (IOException e) {
          throw new RuntimeException("Unable to get FileSystem", e);
        }
        try {
          fs.copyFromLocalFile(src, dst);
          fs.setReplication(dst, (short) conf.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
        } catch (IOException e) {
          throw new RuntimeException(
              "Unable to copy from local filesystem to HDFS, src = " + src + ", dst = " + dst, e);
        }

        String destination = dst.toString() + "#" + symlink;

        try {
          DistributedCache.addCacheFile(new URI(destination), conf);
        } catch (URISyntaxException e) {
          throw new RuntimeException("Unable to add file to distributed cache: " + destination, e);
        }
        LOG.info("File successfully added to the distributed cache: " + symlink);
      }
      String toSer = serialized.toString();
      LOG.info(
          "Setting key ["
              + GENERATED_CLASSES_KEY
              + "] with classes to deserialize ["
              + toSer
              + "]");
      // we must set a key in the job conf so individual jobs know to resolve the shipped classes
      conf.set(GENERATED_CLASSES_KEY, toSer);
    }