public static JobControl createValueAggregatorJobs(
      String args[], Class<? extends ValueAggregatorDescriptor>[] descriptors) throws IOException {

    JobControl theControl = new JobControl("ValueAggregatorJobs");
    ArrayList<Job> dependingJobs = new ArrayList<Job>();
    JobConf aJobConf = createValueAggregatorJob(args);
    if (descriptors != null) setAggregatorDescriptors(aJobConf, descriptors);
    Job aJob = new Job(aJobConf, dependingJobs);
    theControl.addJob(aJob);
    return theControl;
  }
Beispiel #2
0
  public static void main(String[] args) throws IOException {

    if (args.length != 3) {
      System.out.println("Parameters: inputDir outputDir parallel");
      System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L10.class);
    lp.setJobName("L10 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(MyType.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setReducerClass(Group.class);
    lp.setPartitionerClass(MyPartitioner.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
      lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L10out"));
    // Hardcode the parallel to 40 since MyPartitioner assumes it
    lp.setNumReduceTasks(40);
    Job group = new Job(lp);

    JobControl jc = new JobControl("L10 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
      ArrayList<Job> failures = jc.getFailedJobs();
      if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
          System.err.println(failure.getMessage());
        }
        break;
      }

      try {
        Thread.sleep(5000);
      } catch (InterruptedException e) {
      }

      if (i % 10000 == 0) {
        System.out.println("Running jobs");
        ArrayList<Job> running = jc.getRunningJobs();
        if (running != null && running.size() > 0) {
          for (Job r : running) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Ready jobs");
        ArrayList<Job> ready = jc.getReadyJobs();
        if (ready != null && ready.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Waiting jobs");
        ArrayList<Job> waiting = jc.getWaitingJobs();
        if (waiting != null && waiting.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Successful jobs");
        ArrayList<Job> success = jc.getSuccessfulJobs();
        if (success != null && success.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
      }
      i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
      for (Job failure : failures) {
        System.err.println(failure.getMessage());
      }
    }
    jc.stop();
  }
Beispiel #3
0
  @Test
  public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez use a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" + "b = order a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);

    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL key word, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it apply to non-dfs or the files doesn't exist, such as
    // hbase
    query =
        "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);

    // the requested parallel will be -1 if users don't set any of default_parallel, paralllel
    // and the estimation doesn't take effect. MR framework will finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query =
        "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);

    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(
        cluster,
        "test/org/apache/pig/test/data/passwd",
        ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just foreach with projection, mapper-only job, so estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
  }
Beispiel #4
0
  @Test
  public void testReducerNumEstimation() throws Exception {
    // Skip the test for Tez. Tez use a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    Configuration conf = HBaseConfiguration.create(new Configuration());
    HBaseTestingUtility util = new HBaseTestingUtility(conf);
    int clientPort = util.startMiniZKCluster().getClientPort();
    util.startMiniHBaseCluster(1, 1);

    String query = "a = load '/passwd';" + "b = group a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    pc.getConf().setProperty(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.toString(clientPort));
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jc = jcc.compile(mrPlan, "Test");
    Job job = jc.getWaitingJobs().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);

    Util.assertParallelValues(-1, -1, reducer, reducer, job.getJobConf());

    // use the PARALLEL key word, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = group a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, 2, -1, 2, job.getJobConf());

    final byte[] COLUMNFAMILY = Bytes.toBytes("pig");
    util.createTable(Bytes.toBytesBinary("test_table"), COLUMNFAMILY);

    // the estimation won't take effect when it apply to non-dfs or the files doesn't exist, such as
    // hbase
    query =
        "a = load 'hbase://test_table' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = group a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");

    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, -1, 1, 1, job.getJobConf());

    util.deleteTable(Bytes.toBytesBinary("test_table"));
    // In HBase 0.90.1 and above we can use util.shutdownMiniHBaseCluster()
    // here instead.
    MiniHBaseCluster hbc = util.getHBaseCluster();
    if (hbc != null) {
      hbc.shutdown();
      hbc.join();
    }
    util.shutdownMiniZKCluster();
  }
Beispiel #5
0
  public static void main(String[] args) throws IOException {

    JobConf lp = new JobConf(L4.class);
    lp.setJobName("Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setCombinerClass(Combiner.class);
    lp.setReducerClass(Group.class);
    Properties props = System.getProperties();
    String dataDir = props.getProperty("PIGMIX_DIR", "/user/pig/tests/data/pigmix");
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
      lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(dataDir, "page_views"));
    FileOutputFormat.setOutputPath(
        lp, new Path("/user/" + System.getProperty("user.name") + "/L4out"));
    lp.setNumReduceTasks(40);
    Job group = new Job(lp);

    JobControl jc = new JobControl("L4 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
      ArrayList<Job> failures = jc.getFailedJobs();
      if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
          System.err.println(failure.getMessage());
        }
        break;
      }

      try {
        Thread.sleep(5000);
      } catch (InterruptedException e) {
      }

      if (i % 10000 == 0) {
        System.out.println("Running jobs");
        ArrayList<Job> running = jc.getRunningJobs();
        if (running != null && running.size() > 0) {
          for (Job r : running) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Ready jobs");
        ArrayList<Job> ready = jc.getReadyJobs();
        if (ready != null && ready.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Waiting jobs");
        ArrayList<Job> waiting = jc.getWaitingJobs();
        if (waiting != null && waiting.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Successful jobs");
        ArrayList<Job> success = jc.getSuccessfulJobs();
        if (success != null && success.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
      }
      i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
      for (Job failure : failures) {
        System.err.println(failure.getMessage());
      }
    }
    jc.stop();
  }
  /** @param args */
  public static void main(String[] args) {

    File inputFile = new File(args[0]);
    File frameFile = new File(args[1]);
    File tempDir = new File(args[2]);
    String dbPath = args[3];

    try {
      JobControl jobControl = new JobControl("jsonld-entities");

      JobConf defaultConf = new JobConf();

      // Map the triples into JSON-LD fragments
      JobConf initialLoadConf = new JobConf(defaultConf);
      initialLoadConf.setInt("rank", 0);
      initialLoadConf.setStrings("frame-file", frameFile.toString());
      initialLoadConf.setMapperClass(TripleMapper.class);
      initialLoadConf.setReducerClass(EntityReducer.class);
      initialLoadConf.setInputFormat(TextInputFormat.class);
      initialLoadConf.setOutputFormat(TextOutputFormat.class);
      initialLoadConf.setMapOutputKeyClass(Text.class);
      initialLoadConf.setMapOutputValueClass(Text.class);
      initialLoadConf.setOutputKeyClass(Text.class);
      initialLoadConf.setOutputValueClass(Text.class);
      FileInputFormat.setInputPaths(initialLoadConf, new Path(inputFile.toString()));
      Path outputPath = new Path(tempDir.toString() + "/stage0");
      FileOutputFormat.setOutputPath(initialLoadConf, outputPath);
      Path prevOutput = outputPath;
      Job initialLoad = new Job(initialLoadConf);
      jobControl.addJob(initialLoad);

      // Aggregate JSON-LD fragments into nested structure
      EntityFrame entityFrame = new EntityFrame();
      entityFrame.parse(frameFile);
      Job prevJob = initialLoad;
      for (int rank = 1; rank <= entityFrame.getMaxRank(); rank++) {
        JobConf conf = new JobConf(defaultConf);
        conf.setInt("rank", rank);
        conf.setStrings("frame-file", frameFile.toString());
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(EntityReducer.class);
        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(conf, prevOutput);
        outputPath = new Path(tempDir.toString() + "/stage" + rank);
        FileOutputFormat.setOutputPath(conf, outputPath);
        prevOutput = outputPath;
        Job buildEntityJob = new Job(conf);
        jobControl.addJob(buildEntityJob);
        buildEntityJob.addDependingJob(prevJob);
        prevJob = buildEntityJob;
      }

      // Frame nested data
      JobConf frameConf = new JobConf(defaultConf);
      frameConf.setStrings("frame-file", frameFile.toString());
      frameConf.setMapperClass(IdentityMapper.class);
      frameConf.setReducerClass(EntityFrameReducer.class);
      frameConf.setInputFormat(KeyValueTextInputFormat.class);
      frameConf.setOutputFormat(MongoOutputFormat.class);
      frameConf.set("mongo.output.uri", dbPath);
      frameConf.set(
          "stream.io.identifier.resolver.class", "com.mongodb.hadoop.mapred.MongoOutputFormat");
      frameConf.setMapOutputKeyClass(Text.class);
      frameConf.setMapOutputValueClass(Text.class);
      frameConf.setOutputKeyClass(NullWritable.class);
      frameConf.setOutputValueClass(MongoUpdateWritable.class);
      FileInputFormat.setInputPaths(frameConf, prevOutput);
      Job frameEntitiesJob = new Job(frameConf);
      jobControl.addJob(frameEntitiesJob);
      frameEntitiesJob.addDependingJob(prevJob);

      FileSystem fs = FileSystem.get(defaultConf);
      fs.delete(new Path(tempDir.toString()), true);

      // Run pipeline
      jobControl.run();

    } catch (IOException e) {
      // TODO(simister): Auto-generated catch block
      e.printStackTrace();
    }
  }