Example #1
  private String[] getActiveServersList(JobContext context) {

    String[] servers = null;
    try {
      JobClient jc = new JobClient((JobConf) context.getConfiguration());
      ClusterStatus status = jc.getClusterStatus(true);
      Collection<String> atc = status.getActiveTrackerNames();
      servers = new String[atc.size()];
      int s = 0;
      for (String serverInfo : atc) {
        // System.out.println("serverInfo:" + serverInfo);
        StringTokenizer st = new StringTokenizer(serverInfo, ":");
        String trackerName = st.nextToken();
        // System.out.println("trackerName:" + trackerName);
        StringTokenizer st1 = new StringTokenizer(trackerName, "_");
        st1.nextToken();
        servers[s++] = st1.nextToken();
      }

    } catch (IOException e) {
      e.printStackTrace();
    }

    return servers;
  }
Example #2
  /**
   * Set up the input file which contains the list of input files.
   *
   * @return boolean
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are Raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // with the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }
Example #3
  public static void runSortJob(String... args) throws Exception {

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    JobConf job = new JobConf();

    job.setNumReduceTasks(2);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(SampleJob.class);

    output.getFileSystem(job).delete(output, true);

    JobClient jc = new JobClient(job);
    JobClient.setTaskOutputFilter(job, JobClient.TaskStatusFilter.ALL);
    RunningJob rj = jc.submitJob(job);
    try {
      if (!jc.monitorAndPrintJob(job, rj)) {
        System.out.println("Job Failed: " + rj.getFailureInfo());
        throw new IOException("Job failed!");
      }
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
    }
  }
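
For comparison, a minimal sketch of the blocking convenience call that folds the submit-and-monitor pair above into a single step (it submits the job, prints progress, and throws IOException if the job fails):

  // Equivalent one-liner to submitJob + monitorAndPrintJob above:
  // runJob() returns only after the job has completed.
  RunningJob rj = JobClient.runJob(job);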
Example #4
  private RunningJob submitAction(Context context, Namespace ns) throws Exception {
    Hive2ActionExecutor ae = new Hive2ActionExecutor();

    WorkflowAction action = context.getAction();

    ae.prepareActionDir(getFileSystem(), context);
    ae.submitLauncher(getFileSystem(), context, action);

    String jobId = action.getExternalId();
    String jobTracker = action.getTrackerUri();
    String consoleUrl = action.getConsoleUrl();
    assertNotNull(jobId);
    assertNotNull(jobTracker);
    assertNotNull(consoleUrl);
    Element e = XmlUtils.parseXml(action.getConf());
    XConfiguration conf =
        new XConfiguration(
            new StringReader(XmlUtils.prettyPrint(e.getChild("configuration", ns)).toString()));
    conf.set("mapred.job.tracker", e.getChildTextTrim("job-tracker", ns));
    conf.set("fs.default.name", e.getChildTextTrim("name-node", ns));
    conf.set("user.name", context.getProtoActionConf().get("user.name"));
    conf.set("group.name", getTestGroup());

    JobConf jobConf = Services.get().get(HadoopAccessorService.class).createJobConf(jobTracker);
    XConfiguration.copy(conf, jobConf);
    String user = jobConf.get("user.name");
    JobClient jobClient =
        Services.get().get(HadoopAccessorService.class).createJobClient(user, jobConf);
    final RunningJob runningJob = jobClient.getJob(JobID.forName(jobId));
    assertNotNull(runningJob);
    return runningJob;
  }
Example #5
 /**
  * Generate the requested number of file splits, with the filename set to the filename of the
  * output file.
  */
 public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
   /** Set the number of input splits. */
   JobClient client = new JobClient(job);
   ClusterStatus cluster = client.getClusterStatus();
   /** If the property is not set, the default value is used. */
   int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
   long numBytesToWritePerMap =
       job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
   if (numBytesToWritePerMap == 0) {
     throw new IOException("Cannot have test.randomwrite.bytes_per_map set to 0");
   }
   long totalBytesToWrite =
       job.getLong(
           "test.randomwrite.total_bytes",
           numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
   int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
   if (numMaps == 0 && totalBytesToWrite > 0) {
     numMaps = 1;
   }
   System.out.println("numMaps-------" + numMaps);
   InputSplit[] result = new InputSplit[numMaps];
   Path outDir = FileOutputFormat.getOutputPath(job);
   for (int i = 0; i < result.length; ++i) {
     result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
   }
   return result;
 }
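
These dummy splits carry no real input data. In the stock RandomWriter pattern, the matching getRecordReader (a hedged sketch below; it is not part of the example above) hands each map task exactly one synthetic record, keyed by the split's file name, so that every map runs once and writes its quota of bytes:

  public RecordReader<Text, Text> getRecordReader(
      final InputSplit split, JobConf job, Reporter reporter) throws IOException {
    return new RecordReader<Text, Text>() {
      boolean emitted = false;

      public boolean next(Text key, Text value) throws IOException {
        if (emitted) {
          return false; // only one record per split
        }
        key.set(((FileSplit) split).getPath().getName());
        emitted = true;
        return true;
      }

      public Text createKey() { return new Text(); }

      public Text createValue() { return new Text(); }

      public long getPos() throws IOException { return 0; }

      public float getProgress() throws IOException { return emitted ? 1.0f : 0.0f; }

      public void close() throws IOException {}
    };
  }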
Example #6
  public void testActionCheck() throws Exception {
    JPAService jpaService = Services.get().get(JPAService.class);
    WorkflowJobBean job =
        this.addRecordToWfJobTable(WorkflowJob.Status.RUNNING, WorkflowInstance.Status.RUNNING);
    WorkflowActionBean action =
        this.addRecordToWfActionTable(job.getId(), "1", WorkflowAction.Status.PREP);
    WorkflowActionGetJPAExecutor wfActionGetCmd = new WorkflowActionGetJPAExecutor(action.getId());

    new ActionStartXCommand(action.getId(), "map-reduce").call();
    action = jpaService.execute(wfActionGetCmd);

    ActionExecutorContext context =
        new ActionXCommand.ActionExecutorContext(job, action, false, false);
    MapReduceActionExecutor actionExecutor = new MapReduceActionExecutor();
    JobConf conf =
        actionExecutor.createBaseHadoopConf(context, XmlUtils.parseXml(action.getConf()));
    String user = conf.get("user.name");
    JobClient jobClient =
        Services.get().get(HadoopAccessorService.class).createJobClient(user, conf);

    String launcherId = action.getExternalId();

    final RunningJob launcherJob = jobClient.getJob(JobID.forName(launcherId));

    waitFor(
        120 * 1000,
        new Predicate() {
          public boolean evaluate() throws Exception {
            return launcherJob.isComplete();
          }
        });
    assertTrue(launcherJob.isSuccessful());
    Map<String, String> actionData =
        LauncherMapperHelper.getActionData(getFileSystem(), context.getActionDir(), conf);
    assertTrue(LauncherMapperHelper.hasIdSwap(actionData));

    new ActionCheckXCommand(action.getId()).call();
    action = jpaService.execute(wfActionGetCmd);
    String mapperId = action.getExternalId();
    String childId = action.getExternalChildIDs();

    assertTrue(launcherId.equals(mapperId));

    final RunningJob mrJob = jobClient.getJob(JobID.forName(childId));

    waitFor(
        120 * 1000,
        new Predicate() {
          public boolean evaluate() throws Exception {
            return mrJob.isComplete();
          }
        });
    assertTrue(mrJob.isSuccessful());

    new ActionCheckXCommand(action.getId()).call();
    action = jpaService.execute(wfActionGetCmd);

    assertEquals("SUCCEEDED", action.getExternalStatus());
  }
Example #7
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Example #8
  /**
   * This is the main routine for launching a distributed random write job. It runs 10 maps/node and
   * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything.
   *
   * @throws IOException
   */
  public int run(String[] args) throws Exception {
    if (args.length == 0) {
      System.out.println("Usage: writer <out-dir>");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path outDir = new Path(args[0]);
    JobConf job = new JobConf(getConf());

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient client = new JobClient(job);
    ClusterStatus cluster = client.getClusterStatus();
    /** If the property is not set, the default value is used. */
    int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
    long numBytesToWritePerMap =
        job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
      System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
      return -2;
    }
    long totalBytesToWrite =
        job.getLong(
            "test.randomwrite.total_bytes",
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
      numMaps = 1;
      job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
    }

    job.setNumMapTasks(numMaps);
    /** setNumMapTasks is only a hint to the framework. */
    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println(
        "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return 0;
  }
Example #9
  private RunningJob _test(String... arg) throws Exception {
    Path actionDir = getFsTestCaseDir();

    File jar =
        IOUtils.createJar(
            new File(getTestCaseDir()),
            "launcher.jar",
            LauncherMapper.class,
            LauncherSecurityManager.class,
            LauncherException.class,
            LauncherMainTester.class);

    FileSystem fs = getFileSystem();

    Path launcherJar = new Path(actionDir, "launcher.jar");
    fs.copyFromLocalFile(new Path(jar.toString()), launcherJar);

    JobConf jobConf = new JobConf();
    jobConf.set("user.name", getTestUser());
    jobConf.set("group.name", getTestGroup());
    jobConf.setInt("mapred.map.tasks", 1);
    jobConf.setInt("mapred.map.max.attempts", 1);
    jobConf.setInt("mapred.reduce.max.attempts", 1);

    jobConf.set("mapred.job.tracker", getJobTrackerUri());
    jobConf.set("fs.default.name", getNameNodeUri());
    injectKerberosInfo(jobConf);

    LauncherMapper lm = new LauncherMapper();
    lm.setupMainClass(jobConf, LauncherMainTester.class.getName());
    lm.setupMainArguments(jobConf, arg);

    Configuration actionConf = new XConfiguration();
    lm.setupLauncherInfo(jobConf, "1", "1@a", actionDir, "1@a-0", actionConf);

    assertEquals("1", actionConf.get("oozie.job.id"));
    assertEquals("1@a", actionConf.get("oozie.action.id"));

    DistributedCache.addFileToClassPath(new Path(launcherJar.toUri().getPath()), jobConf);

    JobClient jobClient = createJobClient();

    final RunningJob runningJob = jobClient.submitJob(jobConf);

    System.out.println("Action Dir: " + actionDir);
    System.out.println("LauncherMapper ID: " + runningJob.getJobID().toString());

    waitFor(
        180 * 1000,
        new Predicate() {
          public boolean evaluate() throws Exception {
            return runningJob.isComplete();
          }
        });
    return runningJob;
  }
Example #10
  public static Token<
          org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier>
      getJobTrackerDelegationToken(Configuration conf, String userName) throws Exception {
    // LOG.info("getJobTrackerDelegationToken("+conf+","+userName+")");
    JobClient jcl = new JobClient(new JobConf(conf, HCatOutputFormat.class));
    Token<org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier> t =
        jcl.getDelegationToken(new Text(userName));
    // LOG.info("got "+t);
    return t;

    // return null;
  }
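
A hedged usage sketch (the caller, variable names, and user below are assumptions, not part of the example): the returned token is typically attached to the current user's credentials so that subsequent job submissions can authenticate against the JobTracker.

  // Assumed usage: fetch the JobTracker delegation token and add it to the current UGI.
  Configuration conf = new Configuration();
  String userName = UserGroupInformation.getCurrentUser().getShortUserName();
  Token<org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier> token =
      getJobTrackerDelegationToken(conf, userName);
  UserGroupInformation.getCurrentUser().addToken(token);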
Example #11
  private void prepareDeleteIndex(String indexerName) {
    // We do not have to take a lock on the indexer, since once in delete state the indexer cannot
    // be modified anymore by ordinary users.
    boolean canBeDeleted = false;
    try {
      // Read current situation of record and assure it is still actual
      IndexerDefinition indexer = indexerModel.getFreshIndexer(indexerName);
      if (indexer.getLifecycleState() == IndexerDefinition.LifecycleState.DELETE_REQUESTED) {
        canBeDeleted = true;

        String queueSubscriptionId = indexer.getSubscriptionId();
        if (queueSubscriptionId != null) {
          sepModel.removeSubscription(indexer.getSubscriptionId());
          // We leave the subscription ID in the indexer definition FYI
        }

        if (indexer.getActiveBatchBuildInfo() != null) {
          JobClient jobClient = getJobClient();
          String jobId = indexer.getActiveBatchBuildInfo().getJobId();
          RunningJob job = jobClient.getJob(jobId);
          if (job != null) {
            job.killJob();
            log.info("Kill indexer build job for indexer " + indexerName + ", job ID =  " + jobId);
          }
          // Just to be sure...
          jobStatusWatcher.assureWatching(
              indexer.getName(), indexer.getActiveBatchBuildInfo().getJobId());
          canBeDeleted = false;
        }

        if (!canBeDeleted) {
          indexer =
              new IndexerDefinitionBuilder()
                  .startFrom(indexer)
                  .lifecycleState(IndexerDefinition.LifecycleState.DELETING)
                  .build();
          indexerModel.updateIndexerInternal(indexer);
        }
      } else if (indexer.getLifecycleState() == IndexerDefinition.LifecycleState.DELETING) {
        // Check if the build job is already finished, if so, allow delete
        if (indexer.getActiveBatchBuildInfo() == null) {
          canBeDeleted = true;
        }
      }
    } catch (Throwable t) {
      log.error("Error preparing deletion of indexer " + indexerName, t);
    }

    if (canBeDeleted) {
      deleteIndexer(indexerName);
    }
  }
Example #12
  private void checkGeneratedDataAndJobStatus(long inputSize) throws IOException {
    LOG.info("Verify the generated data size.");
    long dataSizeInMB = getDataSizeInMB(new Path(gridmixDir, "input"));
    Assert.assertTrue(
        "Generated data size does not match the expected size",
        dataSizeInMB + 0.1 > inputSize && dataSizeInMB - 0.1 < inputSize);

    JobClient jobClient = jtClient.getClient();
    int len = jobClient.getAllJobs().length;
    LOG.info("Verify the job status after completion of job.");
    Assert.assertEquals(
        "Job has not succeeded.",
        JobStatus.SUCCEEDED,
        jobClient.getAllJobs()[len - 1].getRunState());
  }
Example #13
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    // vv MaxTemperatureDriverV6
    conf.setProfileEnabled(true);
    conf.setProfileParams(
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
    conf.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(conf);
    return 0;
  }
Example #14
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      System.exit(2);
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath);
    Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, input_schema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
Example #15
  /**
   * Run the job
   *
   * @param params The Job parameters containing the gramSize, input output folders, defaultCat,
   *     encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
  }
Example #16
  /**
   * Driver to copy srcPath to destPath depending on required protocol.
   *
   * @param args arguments
   */
  static void copy(final Configuration conf, final Arguments args) throws IOException {
    LOG.info("srcPaths=" + args.srcs);
    LOG.info("destPath=" + args.dst);
    checkSrcPath(conf, args.srcs);

    JobConf job = createJobConf(conf);
    if (args.preservedAttributes != null) {
      job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
    }
    if (args.mapredSslConf != null) {
      job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
    }

    // Initialize the mapper
    try {
      setup(conf, job, args);
      JobClient.runJob(job);
      finalize(conf, job, args.dst, args.preservedAttributes);
    } finally {
      // delete tmp
      fullyDelete(job.get(TMP_DIR_LABEL), job);
      // delete jobDirectory
      fullyDelete(job.get(JOB_DIR_LABEL), job);
    }
  }
Example #17
  public static void main(String[] args) throws Exception {
    String input = "hdfs://centos:9000/access.log.10";
    String output = "hdfs://centos:9000/out_kpitime";

    JobConf conf = new JobConf(KPITime.class);
    conf.setJobName("KPITime");
    //        conf.addResource("classpath:/hadoop/core-site.xml");
    //        conf.addResource("classpath:/hadoop/hdfs-site.xml");
    //        conf.addResource("classpath:/hadoop/mapred-site.xml");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPITimeMapper.class);
    conf.setCombinerClass(KPITimeReducer.class);
    conf.setReducerClass(KPITimeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }
Example #18
  public static void run(Map<String, String> path) throws IOException {
    JobConf conf = Recommend.config();

    String input = path.get("Step2Input");
    String output = path.get("Step2Output");

    HdfsDAO hdfs = new HdfsDAO(Recommend.HDFS, conf);
    hdfs.rmr(output);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Step2_UserVectorToCooccurrenceMapper.class);
    //        conf.setCombinerClass(Step2_UserVectorToConoccurrenceReducer.class);
    //        conf.setReducerClass(Step2_UserVectorToConoccurrenceReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    // runJob() blocks until the job completes, so no extra waiting is needed
    JobClient.runJob(conf);
  }
Example #19
  public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: BuildGraph");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(BuildGraph.class);
    conf.setJobName("BuildGraph " + inputPath + " " + ContrailConfig.K);

    ContrailConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(BuildGraphMapper.class);
    conf.setReducerClass(BuildGraphReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }
Example #20
  public void runParseTest(
      String fieldTerminator,
      String lineTerminator,
      String encloser,
      String escape,
      boolean encloseRequired)
      throws IOException {

    ClassLoader prevClassLoader = null;

    String[] argv =
        getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
    runImport(argv);
    try {
      String tableClassName = getTableName();

      argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
      SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

      CompilationManager compileMgr = new CompilationManager(opts);
      String jarFileName = compileMgr.getJarFilename();

      // Make sure the user's class is loaded into our address space.
      prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

      JobConf job = new JobConf();
      job.setJar(jarFileName);

      // Tell the job what class we're testing.
      job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

      // use local mode in the same JVM.
      ConfigurationHelper.setJobtrackerAddr(job, "local");
      if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
      }
      String warehouseDir = getWarehouseDir();
      Path warehousePath = new Path(warehouseDir);
      Path inputPath = new Path(warehousePath, getTableName());
      Path outputPath = new Path(warehousePath, getTableName() + "-out");

      job.setMapperClass(ReparseMapper.class);
      job.setNumReduceTasks(0);
      FileInputFormat.addInputPath(job, inputPath);
      FileOutputFormat.setOutputPath(job, outputPath);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(NullWritable.class);

      JobClient.runJob(job);
    } catch (InvalidOptionsException ioe) {
      fail(ioe.toString());
    } catch (ParseException pe) {
      fail(pe.toString());
    } finally {
      if (null != prevClassLoader) {
        ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
      }
    }
  }
Example #21
  public static void main(String[] args) throws Exception {
    String input = "hdfs://192.168.0.110:9000/input/access.log";
    String output = "hdfs://192.168.0.110:9000/user/hdfs/pv";

    JobConf conf = new JobConf(KPIPV.class);
    conf.setJobName("KPIPV");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPIPVMapper.class);
    conf.setCombinerClass(KPIPVReducer.class);
    conf.setReducerClass(KPIPVReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }
Example #22
  public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
      conf.setNumReduceTasks(new Integer(args[3]));
    }

    try {
      // We store the cassandra storage-conf.xml on the HDFS cluster
      DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
Example #23
  public int run(String[] args) throws Exception {

    if (args.length < 2) {
      printUsage();
      return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
  }
Example #24
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
      return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
  }
Example #25
  public RunningJob run(String inputPath, String outputPath) throws Exception {

    JobConf conf = new JobConf(BuildIndex.class);
    conf.setJobName("BuildIndex");

    FileInputFormat.addInputPath(conf, new Path(inputPath)); // multiple path
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(LongWritable.class);
    conf.set("delim", delim);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo);
    conf.setMapperClass(BuildIndexMapper.class);
    conf.setNumReduceTasks(1);
    conf.setReducerClass(BuildIndexReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    // conf.setInputFormat(CustomInputFormat.class);
    // FileOutputFormat.setCompressOutput(conf,true);
    // delete the output directory if it exists already

    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }
Example #26
  public void runMR(String myMultiLocs, String sortKey)
      throws ParseException, IOException, Exception, org.apache.hadoop.zebra.parser.ParseException {

    JobConf jobConf = new JobConf(conf);
    jobConf.setJobName("TestMultipleOutputs4");
    jobConf.setJarByClass(TestMultipleOutputs4.class);
    jobConf.set("table.output.tfile.compression", "gz");
    jobConf.set("sortKey", sortKey);
    // input settings
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(TestMultipleOutputs4.MapClass.class);
    jobConf.setMapOutputKeyClass(BytesWritable.class);
    jobConf.setMapOutputValueClass(ZebraTuple.class);
    FileInputFormat.setInputPaths(jobConf, inputPath);

    jobConf.setNumMapTasks(1);

    // output settings

    jobConf.setOutputFormat(BasicTableOutputFormat.class);
    BasicTableOutputFormat.setMultipleOutputs(
        jobConf, myMultiLocs, TestMultipleOutputs4.OutputPartitionerClass.class);

    // set the logical schema with 2 columns
    BasicTableOutputFormat.setSchema(jobConf, "word:string, count:int");
    // for demo purposes, create 2 physical column groups
    BasicTableOutputFormat.setStorageHint(jobConf, "[word];[count]");
    BasicTableOutputFormat.setSortInfo(jobConf, sortKey);
    System.out.println("in runMR, sortkey: " + sortKey);
    // use a single reduce task
    jobConf.setNumReduceTasks(1);
    JobClient.runJob(jobConf);
    BasicTableOutputFormat.close(jobConf);
  }
Example #27
  /**
   * {@inheritDoc}
   *
   * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf configuration = new JobConf(getConf(), WordCountExtended.class);
    configuration.setJobName(JOB_NAME);

    configuration.setOutputKeyClass(Text.class);
    configuration.setOutputValueClass(IntWritable.class);

    configuration.setMapperClass(Map.class);
    configuration.setCombinerClass(Reduce.class);
    configuration.setReducerClass(Reduce.class);

    configuration.setInputFormat(TextInputFormat.class);
    configuration.setOutputFormat(TextOutputFormat.class);

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      if (JOB_SKIP_ARGUMENT.equals(args[i])) {
        DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration);
        configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true);
      } else {
        otherArgs.add(args[i]);
      }
    }

    FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0)));
    FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1)));

    JobClient.runJob(configuration);
    return 0;
  }
Example #28
  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    ImmutableList<Path> pathList =
        ImmutableList.of(
            getOutputDirForStep(PartitionCrawlDBStep.class),
            getOutputDirForStep(GenFeedUrlsStep.class),
            getOutputDirForStep(GenHomepageUrlsStep.class),
            getOutputDirForStep(GenBlogPlatformUrlsStep.class),
            getOutputDirForStep(PartitionCrawlStatsStep.class),
            getOutputDirForStep(PartitionRedirectDataStep.class));

    JobConf jobConf =
        new JobBuilder("BundleWriter Step", getConf())
            .inputs(pathList)
            .inputFormat(MultiFileMergeInputFormat.class)
            .mapperKeyValue(IntWritable.class, Text.class)
            .outputKeyValue(SegmentGeneratorBundleKey.class, SegmentGeneratorItemBundle.class)
            .outputFormat(SequenceFileOutputFormat.class)
            .reducer(GenBundlesStep.class, false)
            .partition(MultiFileMergePartitioner.class)
            .numReducers(CrawlListGeneratorTask.NUM_SHARDS)
            .speculativeExecution(false)
            .output(outputPathLocation)
            .compressMapOutput(false)
            .compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    LOG.info("Starting JOB");
    JobClient.runJob(jobConf);
    LOG.info("Finsihed JOB");
  }
Example #29
  public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: Compressible");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    // JobConf conf = new JobConf(Stats.class);
    JobConf conf = new JobConf(Compressible.class);
    conf.setJobName("Compressible " + inputPath);

    BrushConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CompressibleMapper.class);
    conf.setReducerClass(CompressibleReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }
Example #30
  public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(AccessProcessJob.class);
    conf.set(nameNode, hdfsURL);
    conf.setJobName("AccessProcessJob");
    Job job = Job.getInstance(conf, "AccessProcessJob");

    new Path(outputPath).getFileSystem(conf).delete(new Path(outputPath), true);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(AccessProcessMap.class);
    conf.setReducerClass(AccessProcessReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    JobClient.runJob(conf);
  }