Java TextOutputFormat 예제들, org.apache.hadoop.mapreduce.lib.output.TextOutputFormat Java 예제들

예제 #1

0

파일 보기

파일: TestLzoTextInputFormat.java 프로젝트: huagetai/elephant-bird

  /**
   * Round-trips random text through LZO-compressed output and verifies it with an MD5 digest.
   *
   * <p>The test writes random data with {@code createTestInput}, optionally builds an LZO index
   * for the produced file, then reads every split back through {@link LzoTextInputFormat} and
   * compares the digest of what was read with the digest of what was written.
   *
   * @param testWithIndex whether an LZO index is created before computing the splits
   * @param charsToOutput how many characters of random data to write; half of this value is also
   *     used as the local block size
   * @throws IOException if writing, indexing or reading the test data fails
   * @throws NoSuchAlgorithmException declared by the original signature
   * @throws InterruptedException if the record writer or a record reader is interrupted
   */
  private void runTest(boolean testWithIndex, int charsToOutput)
      throws IOException, NoSuchAlgorithmException, InterruptedException {

    Configuration conf = new Configuration();
    // reducing block size to force a split of the tiny file
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

    FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
    FileSystem localFs = FileSystem.getLocal(conf);

    byte[] expectedMd5;
    try {
      localFs.delete(outputDir_, true);
      localFs.mkdirs(outputDir_);

      Job job = new Job(conf);
      TextOutputFormat.setCompressOutput(job, true);
      TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
      TextOutputFormat.setOutputPath(job, outputDir_);

      TaskAttemptContext attemptContext =
          new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

      // create some input data
      expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

      if (testWithIndex) {
        Path lzoFile = new Path(outputDir_, lzoFileName_);
        LzoIndex.createIndex(localFs, lzoFile);
      }

      LzoTextInputFormat inputFormat = new LzoTextInputFormat();
      TextInputFormat.setInputPaths(job, outputDir_);

      List<InputSplit> is = inputFormat.getSplits(job);
      // verify we have the right number of lzo chunks
      if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
      } else {
        assertEquals(1, is.size());
      }

      // let's read it all and calculate the md5 hash
      for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr =
            inputFormat.createRecordReader(inputSplit, attemptContext);
        try {
          rr.initialize(inputSplit, attemptContext);

          while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();

            // Text's backing array may be longer than the content, so bound by getLength()
            md5_.update(value.getBytes(), 0, value.getLength());
          }
        } finally {
          // release the reader even when initialization or reading fails
          rr.close();
        }
      }
    } finally {
      // always close the file system so a failed assertion does not leave it open
      localFs.close();
    }

    assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
  }

예제 #2

0

파일 보기

파일: DeliverFormatForUVMR.java 프로젝트: wisgood/mobile-core

  /**
   * Runs the deliver-format job and, if it succeeds, a single-reducer job that merges its output.
   *
   * <p>The first job writes to a temporary directory named {@code <output_dir>_tmp}. The second
   * job reads that directory and writes LZO-compressed text to {@code output_dir}. The temporary
   * directory is removed afterwards, and the final output is LZO-indexed only when both jobs
   * succeeded.
   *
   * <p>Configuration properties read: {@code job_name}, {@code input_dir}, {@code output_dir} and
   * {@code reduce_num} (defaults to 20).
   *
   * @param args command-line arguments, parsed with {@link GenericOptionsParser}
   * @return 0 when both jobs succeed, 1 otherwise
   * @throws IllegalArgumentException if {@code input_dir} or {@code output_dir} is not configured
   * @throws Exception if job setup, execution, cleanup or indexing fails
   */
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    GenericOptionsParser optionparser = new GenericOptionsParser(conf, args);
    conf = optionparser.getConfiguration();

    // Fail fast: a missing output_dir would otherwise silently become the path "null_tmp".
    String inputDirs = conf.get("input_dir");
    if (inputDirs == null) {
      throw new IllegalArgumentException("Missing required configuration property: input_dir");
    }
    String outputDir = conf.get("output_dir");
    if (outputDir == null) {
      throw new IllegalArgumentException("Missing required configuration property: output_dir");
    }

    Job job = new Job(conf, conf.get("job_name"));
    job.setJarByClass(DeliverFormatForUVMR.class);
    FileInputFormat.addInputPaths(job, inputDirs);
    String tmpDir = outputDir + "_tmp";
    Path tmpOut = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, tmpOut);
    // Resolve the file system from the path itself so non-default file systems work too.
    FileSystem tmpFs = tmpOut.getFileSystem(conf);
    tmpFs.delete(tmpOut, true);

    job.setMapperClass(DeliverFormatForUVMapper.class);
    job.setCombinerClass(DeliverFormatForUVCombiner.class);
    job.setReducerClass(DeliverFormatForUVReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(conf.getInt("reduce_num", 20));

    int code;
    try {
      code = job.waitForCompletion(true) ? 0 : 1;

      if (code == 0) {

        // this job is for combining  small files into one
        Job combineJob = new Job(conf, "CombineTmpData");
        combineJob.setJarByClass(DeliverFormatForUVMR.class);

        FileInputFormat.addInputPath(combineJob, new Path(tmpDir));
        FileOutputFormat.setOutputPath(combineJob, new Path(outputDir));
        combineJob.setMapperClass(IdentityMapper.class);
        combineJob.setReducerClass(IdentityReducer.class);

        combineJob.setInputFormatClass(KeyValueTextInputFormat.class);
        combineJob.setOutputFormatClass(TextOutputFormat.class);

        combineJob.setOutputKeyClass(Text.class);
        combineJob.setOutputValueClass(Text.class);

        TextOutputFormat.setCompressOutput(combineJob, true);
        TextOutputFormat.setOutputCompressorClass(combineJob, LzopCodec.class);

        combineJob.setNumReduceTasks(1);
        code = combineJob.waitForCompletion(true) ? 0 : 1;
      }
    } finally {
      // Remove the intermediate data even when a job throws.
      tmpFs.delete(tmpOut, true);
    }

    // Index only a successful run; otherwise the final output may be missing or incomplete and
    // an indexing error would hide the real failure code.
    if (code == 0) {
      LzoIndexer lzoIndexer = new LzoIndexer(conf);
      lzoIndexer.index(new Path(outputDir));
    }

    // NOTE(review): exiting here terminates the JVM before the caller sees the return value.
    // Kept for backward compatibility with existing launchers; consider moving it to main().
    System.exit(code);
    return code;
  }

예제 #3

0

파일 보기

파일: HCatMapReduceTest.java 프로젝트: hadoop-zuiwanyuan/hive

  /**
   * Runs a map-only job (zero reducers) that reads the HCatalog table identified by
   * {@code dbName}/{@code tableName} and asserts that the mapper saw the expected number of
   * records.
   *
   * <p>The static counter {@code MapRead.readCount} and the shared {@code readRecords} list are
   * reset before the job starts. Any previous output directory is deleted first.
   *
   * @param readCount number of records the mapper is expected to read
   * @param filter filter expression passed to {@code HCatInputFormat.setInput}
   * @return the shared {@code readRecords} list (presumably populated by the mapper; verify)
   * @throws Exception if job setup or execution fails
   */
  List<HCatRecord> runMRRead(int readCount, String filter) throws Exception {
    // Start from a clean slate so earlier runs cannot influence the count.
    readRecords.clear();
    MapRead.readCount = 0;

    Configuration configuration = new Configuration();
    configuration.set(HiveConf.ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname, "true");

    Job readJob = new Job(configuration, "hcat mapreduce read test");
    readJob.setJarByClass(getClass());
    readJob.setMapperClass(HCatMapReduceTest.MapRead.class);
    readJob.setNumReduceTasks(0);

    // Input side: HCatalog table, optionally filtered.
    readJob.setInputFormatClass(HCatInputFormat.class);
    HCatInputFormat.setInput(readJob, dbName, tableName, filter);

    // Output side: plain text files.
    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapOutputKeyClass(BytesWritable.class);
    readJob.setMapOutputValueClass(Text.class);

    Path outputPath = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceOutput");
    if (fs.exists(outputPath)) {
      fs.delete(outputPath, true);
    }
    TextOutputFormat.setOutputPath(readJob, outputPath);

    readJob.waitForCompletion(true);
    Assert.assertEquals(readCount, MapRead.readCount);

    return readRecords;
  }

예제 #4

0

파일 보기

파일: CombinerAnswer.java 프로젝트: kevinjlsun/HI-labs

  /**
   * Configures and runs the job using {@code MyMapper} and {@code MyReducer}.
   *
   * <p>Expects exactly two arguments: the input path and the output path. No combiner is
   * configured.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
   * @return 0 if the job succeeded; 1 if it failed or the argument count was wrong
   * @throws Exception if job setup or execution fails
   */
  @Override
  public int run(String[] args) throws Exception {
    final boolean wrongArgCount = args.length != 2;
    if (wrongArgCount) {
      System.out.println("usage : need <input path>  <output path>");
      return 1;
    }

    final Path source = new Path(args[0]);
    final Path destination = new Path(args[1]);

    final String jobName = getClass().getName() + "--answer";
    final Job answerJob = new Job(getConf(), jobName);
    answerJob.setJarByClass(CombinerAnswer.class);

    // Processing classes.
    answerJob.setMapperClass(MyMapper.class);
    answerJob.setReducerClass(MyReducer.class);

    // Intermediate (map output) types.
    answerJob.setMapOutputKeyClass(Text.class);
    answerJob.setMapOutputValueClass(IntWritable.class);

    // Plain-text input and output.
    answerJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.setInputPaths(answerJob, source);
    answerJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(answerJob, destination);

    final boolean succeeded = answerJob.waitForCompletion(true);
    if (succeeded) {
      return 0;
    }
    return 1;
  }

예제 #5

0

파일 보기

파일: TestLzoTextInputFormat.java 프로젝트: huagetai/elephant-bird

  /**
   * Writes random key/value text records through a {@link TextOutputFormat} record writer and
   * returns the MD5 digest of everything that was written.
   *
   * <p>For each record the digest covers the key bytes, a tab, and the value bytes, in that
   * order. The shared {@code md5_} digest is reset before and after use.
   *
   * @param outputDir output directory (not read by this method)
   * @param fs file system in use (not read by this method)
   * @param attemptContext task attempt context used to obtain the record writer and committer
   * @param charsToOutput records are written until the total reported by {@code fillText}
   *     reaches this value
   * @return MD5 digest of the written records
   * @throws IOException if writing or committing the output fails
   * @throws InterruptedException if the record writer or committer is interrupted
   */
  private byte[] createTestInput(
      Path outputDir, FileSystem fs, TaskAttemptContext attemptContext, int charsToOutput)
      throws IOException, InterruptedException {

    md5_.reset();

    TextOutputFormat<Text, Text> textOutput = new TextOutputFormat<Text, Text>();
    RecordWriter<Text, Text> writer = null;

    try {
      writer = textOutput.getRecordWriter(attemptContext);

      final char[] alphabet = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray();
      final int lastIndex = alphabet.length - 1;
      final Random random = new Random(System.currentTimeMillis());
      // text output format writes tab between the key and value
      final byte[] separator = "\t".getBytes("UTF-8");

      final Text keyText = new Text();
      final Text valueText = new Text();

      int produced = 0;
      while (produced < charsToOutput) {
        produced += fillText(alphabet, random, lastIndex, keyText);
        produced += fillText(alphabet, random, lastIndex, valueText);
        writer.write(keyText, valueText);

        // Text's backing array may be longer than its content, so bound by getLength().
        md5_.update(keyText.getBytes(), 0, keyText.getLength());
        md5_.update(separator);
        md5_.update(valueText.getBytes(), 0, valueText.getLength());
      }
    } finally {
      if (writer != null) {
        writer.close(attemptContext);
        OutputCommitter outputCommitter = textOutput.getOutputCommitter(attemptContext);
        outputCommitter.commitTask(attemptContext);
        outputCommitter.cleanupJob(attemptContext);
      }
    }

    final byte[] digest = md5_.digest();
    md5_.reset();
    return digest;
  }

예제 #6

0

파일 보기

파일: Verify.java 프로젝트: renato2099/GoraCI-DynamoDB

  /**
   * Configures and submits the "Link Verifier" job without waiting for it to finish.
   *
   * <p>A DynamoDB-backed data store is created with the given credentials and used to build the
   * mapper input query. Java serialization is appended to {@code io.serializations} when it is
   * not already listed. The store is closed before the job is submitted.
   *
   * @param outputDir directory the job writes its text output to
   * @param numReducers number of reduce tasks
   * @param concurrent not read by this method
   * @param accessKey AWS access key used to build the credentials
   * @param secretKey AWS secret key used to build the credentials
   * @throws GoraException declared by the original signature
   * @throws IOException declared by the original signature
   * @throws Exception if store creation, job setup or submission fails
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public void start(
      Path outputDir, int numReducers, boolean concurrent, String accessKey, String secretKey)
      throws GoraException, IOException, Exception {
    LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers);

    auth = new BasicAWSCredentials(accessKey, secretKey);
    DataStore<Long, cidynamonode> dynamoStore =
        WSDataStoreFactory.createDataStore(
            DynamoDBStore.class, DynamoDBKey.class, cidynamonode.class, auth);

    job = new Job(getConf());

    // Make sure Java serialization is registered, keeping whatever is already configured.
    final String javaSerialization = "org.apache.hadoop.io.serializer.JavaSerialization";
    final String configuredSerializations = job.getConfiguration().get("io.serializations");
    final boolean alreadyRegistered = configuredSerializations.contains(javaSerialization);
    if (!alreadyRegistered) {
      job.getConfiguration()
          .set("io.serializations", configuredSerializations + "," + javaSerialization);
    }

    job.setJobName("Link Verifier");
    job.setJarByClass(getClass());
    job.setNumReduceTasks(numReducers);

    // The query is used as-is: no field restriction and no concurrency filtering is applied.
    Query inputQuery = dynamoStore.newQuery();
    GoraMapper.initMapperJob(
        job, inputQuery, dynamoStore, DynamoDBKey.class, VLongWritable.class, VerifyMapper.class,
        true);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);

    job.setReducerClass(VerifyReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    dynamoStore.close();

    job.submit();
  }

예제 #7

0

파일 보기

파일: MapsideJoinDriver.java 프로젝트: swapnaraja/hadoopCode

 /**
  * Runs a map-only inner join over two key/value text inputs using
  * {@link CompositeInputFormat}.
  *
  * <p>Input lines are split into key and value on a comma. The process exits with status 0 when
  * the job succeeds, 1 when it fails, and 2 when too few arguments are supplied.
  *
  * @param args {@code args[0]} and {@code args[1]} are the two input paths to join,
  *     {@code args[2]} is the output path
  * @throws IOException if job setup or execution fails
  * @throws ClassNotFoundException if a job class cannot be loaded
  * @throws InterruptedException if waiting for the job is interrupted
  */
 public static void main(String[] args)
     throws IOException, ClassNotFoundException, InterruptedException {
   // Guard against ArrayIndexOutOfBoundsException on missing arguments.
   if (args.length < 3) {
     System.err.println("Usage: MapsideJoinDriver <input 1> <input 2> <output>");
     System.exit(2);
   }

   Configuration conf = new Configuration();
   conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
   Job job = Job.getInstance(conf);
   job.setJarByClass(MapsideJoinDriver.class);
   job.setMapperClass(MapsideJoinMapper.class);
   job.setInputFormatClass(CompositeInputFormat.class);
   // "inner" selects an inner join; "outer" is the alternative join operator.
   String expr =
       CompositeInputFormat.compose(
           "inner", KeyValueTextInputFormat.class, new Path(args[0]), new Path(args[1]));
   job.getConfiguration().set("mapreduce.join.expr", expr);
   job.setNumReduceTasks(0);
   TextOutputFormat.setOutputPath(job, new Path(args[2]));

   // Propagate the job outcome so callers and schedulers can detect failures.
   System.exit(job.waitForCompletion(true) ? 0 : 1);
 }

예제 #8

0

파일 보기

파일: ReplicatedJoinDriver.java 프로젝트: skanthv/MapReduceCodeExamples

  /**
   * Runs a map-only replicated join: the user data file is registered with the distributed cache
   * and the comment data is the job input.
   *
   * <p>Exit status: 0 on success, 1 on wrong argument count, 2 on an unsupported join type, 3 if
   * the job fails.
   *
   * @param args generic Hadoop options followed by
   *     {@code <user data> <comment data> <out> [inner|leftouter]}
   * @throws Exception if job setup or execution fails
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length != 4) {
      System.err.println(
          "Usage: ReplicatedJoin <user data> <comment data> <out> [inner|leftouter]");
      System.exit(1);
    }

    final String userData = remaining[0];
    final String commentData = remaining[1];
    final String outputDir = remaining[2];
    final String joinType = remaining[3];

    if (!joinType.equalsIgnoreCase("inner") && !joinType.equalsIgnoreCase("leftouter")) {
      System.err.println("Join type not set to inner or leftouter");
      System.exit(2);
    }

    Job joinJob = new Job(conf, "Replicated Join");
    joinJob.setJarByClass(ReplicatedJoinDriver.class);

    // Pass the join type to the mapper through the job configuration.
    joinJob.getConfiguration().set("join.type", joinType);

    // Map-only job: the join happens entirely in the mapper.
    joinJob.setMapperClass(ReplicatedJoinMapper.class);
    joinJob.setNumReduceTasks(0);

    joinJob.setOutputKeyClass(Text.class);
    joinJob.setOutputValueClass(Text.class);

    TextInputFormat.setInputPaths(joinJob, new Path(commentData));
    TextOutputFormat.setOutputPath(joinJob, new Path(outputDir));

    // Register the user data file with the distributed cache.
    DistributedCache.addCacheFile(new Path(userData).toUri(), joinJob.getConfiguration());
    // NOTE(review): the raw argument is also recorded as a local cache file; confirm why the
    // mapper needs this.
    DistributedCache.setLocalFiles(joinJob.getConfiguration(), userData);

    boolean succeeded = joinJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 3);
  }