private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so the split length must not exceed
  // Integer.MAX_VALUE.
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    // Unlike Iterator.hasNext(), nextKeyValue() actually advances the reader.
    hasNext = recordReader.nextKeyValue();
  }
  return batch;
}
@Override
protected void configureJob(Job job) throws IOException {
  Configuration conf = job.getConfiguration();

  job.setJarByClass(PartialBuilder.class);

  FileInputFormat.setInputPaths(job, getDataPath());
  FileOutputFormat.setOutputPath(job, getOutputPath(conf));

  job.setOutputKeyClass(TreeID.class);
  job.setOutputValueClass(MapredOutput.class);

  job.setMapperClass(Step1Mapper.class);
  job.setNumReduceTasks(0); // no reducers

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // For this implementation to work, mapred.map.tasks needs to be set to the actual
  // number of mappers Hadoop will use:
  TextInputFormat inputFormat = new TextInputFormat();
  List<?> splits = inputFormat.getSplits(job);
  if (splits == null || splits.isEmpty()) {
    log.warn("Unable to compute number of splits?");
  } else {
    int numSplits = splits.size();
    log.info("Setting mapred.map.tasks = {}", numSplits);
    conf.setInt("mapred.map.tasks", numSplits);
  }
}
public Path write(Message... messages) throws Exception {
  synchronized (WriteUsingMR.class) {
    outputPath = TestUtils.someTemporaryFilePath();
    Path inputPath = TestUtils.someTemporaryFilePath();
    FileSystem fileSystem = inputPath.getFileSystem(conf);
    fileSystem.create(inputPath);

    inputMessages = Collections.unmodifiableList(Arrays.asList(messages));

    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(WritingMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(ProtoParquetOutputFormat.class);
    ProtoParquetOutputFormat.setOutputPath(job, outputPath);
    ProtoParquetOutputFormat.setProtobufClass(job, TestUtils.inferRecordsClass(messages));

    waitForJob(job);

    inputMessages = null;
    return outputPath;
  }
}
@Override
public int run(String[] args) throws Exception {
  String instance = args[0];
  String zookeepers = args[1];
  String user = args[2];
  String tokenFile = args[3];
  String input = args[4];
  String tableName = args[5];

  Job job = Job.getInstance(getConf());
  job.setJobName(TokenFileWordCount.class.getName());
  job.setJarByClass(this.getClass());

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, input);

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  // AccumuloInputFormat is not used here, but it uses the same functions.
  AccumuloOutputFormat.setZooKeeperInstance(
      job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers));
  AccumuloOutputFormat.setConnectorInfo(job, user, tokenFile);
  AccumuloOutputFormat.setCreateTables(job, true);
  AccumuloOutputFormat.setDefaultTableName(job, tableName);

  job.waitForCompletion(true);
  return 0;
}
/**
 * Generate random data, compress it, then index and MD5-hash it. Read it all back, hash it
 * again, and verify that the round trip produced identical data.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output?
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  // reducing block size to force a split of the tiny file
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);
    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }
    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.println("usage: need <input path> <output path>");
    return 1;
  }
  Path inputPath = new Path(args[0]);
  Path outputPath = new Path(args[1]);

  Configuration conf = getConf();
  Job job = new Job(conf, getClass().getName() + "--answer");
  job.setJarByClass(CombinerAnswer.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  // job.setCombinerClass(MyReducer.class);

  job.setMapOutputValueClass(IntWritable.class);
  job.setMapOutputKeyClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  TextInputFormat.setInputPaths(job, inputPath);
  TextOutputFormat.setOutputPath(job, outputPath);

  return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 3) {
    System.err.println("Usage: RedisOutput <user data> <redis hosts> <hash name>");
    System.exit(1);
  }

  Path inputPath = new Path(otherArgs[0]);
  String hosts = otherArgs[1];
  String hashName = otherArgs[2];

  Job job = new Job(conf, "Redis Output");
  job.setJarByClass(RedisOutputDriver.class);

  job.setMapperClass(RedisOutputMapper.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, inputPath);

  job.setOutputFormatClass(RedisHashOutputFormat.class);
  RedisHashOutputFormat.setRedisHosts(job, hosts);
  RedisHashOutputFormat.setRedisHashKey(job, hashName);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  int code = job.waitForCompletion(true) ? 0 : 2;
  System.exit(code);
}
@Override
public int run(String[] args) {
  Opts opts = new Opts();
  opts.parseArgs(BulkIngestExample.class.getName(), args);

  Configuration conf = getConf();
  PrintStream out = null;
  try {
    Job job = JobUtil.getJob(conf);
    job.setJobName("bulk ingest example");
    job.setJarByClass(this.getClass());

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    opts.setAccumuloConfigs(job);
    Connector connector = opts.getConnector();

    TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
    AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

    FileSystem fs = FileSystem.get(conf);
    out = new PrintStream(
        new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

    Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
    for (Text split : splits)
      out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

    job.setNumReduceTasks(splits.size() + 1);
    out.close();

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

    job.waitForCompletion(true);

    Path failures = new Path(opts.workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(opts.workDir, "failures"));
    connector
        .tableOperations()
        .importDirectory(
            opts.getTableName(), opts.workDir + "/files", opts.workDir + "/failures", false);
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    if (out != null)
      out.close();
  }
  return 0;
}
public int run(String[] args) throws Exception {
  String input = args[0];

  Job job = new Job(conf, "jobName");
  job.setJarByClass(HbaseMR.class);

  job.setMapperClass(Map.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, input);

  job.setOutputFormatClass(NullOutputFormat.class);

  return job.waitForCompletion(true) ? 0 : 1;
}
/** @param args */
@Override
public int run(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(getClass().getName(), args);

  Job job = JobUtil.getJob(getConf());
  job.setJobName(getClass().getSimpleName());
  job.setJarByClass(getClass());

  opts.setAccumuloConfigs(job);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(AccumuloOutputFormat.class);

  job.setMapperClass(NGramMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Mutation.class);

  job.setNumReduceTasks(0);
  job.setSpeculativeExecution(false);

  if (!opts.getConnector().tableOperations().exists(opts.tableName)) {
    log.info("Creating table " + opts.tableName);
    opts.getConnector().tableOperations().create(opts.tableName);

    SortedSet<Text> splits = new TreeSet<Text>();
    String numbers[] = "1 2 3 4 5 6 7 8 9".split("\\s");
    String lower[] = "a b c d e f g h i j k l m n o p q r s t u v w x y z".split("\\s");
    String upper[] = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split("\\s");
    for (String[] array : new String[][] {numbers, lower, upper}) {
      for (String s : array) {
        splits.add(new Text(s));
      }
    }
    opts.getConnector().tableOperations().addSplits(opts.tableName, splits);
  }

  TextInputFormat.addInputPath(job, new Path(opts.inputDirectory));
  job.waitForCompletion(true);
  return job.isSuccessful() ? 0 : 1;
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 4) {
    System.err.println(
        "Usage: ReplicatedJoin <user data> <comment data> <out> [inner|leftouter]");
    System.exit(1);
  }

  String joinType = otherArgs[3];
  if (!(joinType.equalsIgnoreCase("inner") || joinType.equalsIgnoreCase("leftouter"))) {
    System.err.println("Join type not set to inner or leftouter");
    System.exit(2);
  }

  // Configure the join type
  Job job = new Job(conf, "Replicated Join");
  job.getConfiguration().set("join.type", joinType);
  job.setJarByClass(ReplicatedJoinDriver.class);

  job.setMapperClass(ReplicatedJoinMapper.class);
  job.setNumReduceTasks(0);

  TextInputFormat.setInputPaths(job, new Path(otherArgs[1]));
  TextOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Configure the DistributedCache
  DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());
  DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]);

  System.exit(job.waitForCompletion(true) ? 0 : 3);
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  // Delegate split computation to the wrapped TextInputFormat instance.
  return textIF.getSplits(context);
}
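// Illustrative companion sketch, not from the source: a wrapper InputFormat that holds the
// TextInputFormat field named textIF (as above) would typically delegate record reading the
// same way it delegates split computation. The override below is a hypothetical example only.
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Hand the split straight to the wrapped TextInputFormat, which supplies a line-oriented reader.
  return textIF.createRecordReader(split, context);
}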
/**
 * Run a local MapReduce job to load data from in-memory records into an HCatalog table.
 *
 * @param partitionValues
 * @param partitionColumns
 * @param records data to be written to the HCatalog table
 * @param writeCount
 * @param assertWrite
 * @param asSingleMapTask
 * @param customDynamicPathPattern
 * @return
 * @throws Exception
 */
Job runMRCreate(
    Map<String, String> partitionValues,
    List<HCatFieldSchema> partitionColumns,
    List<HCatRecord> records,
    int writeCount,
    boolean assertWrite,
    boolean asSingleMapTask,
    String customDynamicPathPattern)
    throws Exception {
  writeRecords = records;
  MapCreate.writeCount = 0;

  Configuration conf = new Configuration();
  Job job = new Job(conf, "hcat mapreduce write test");
  job.setJarByClass(this.getClass());
  job.setMapperClass(HCatMapReduceTest.MapCreate.class);

  // input/output settings
  job.setInputFormatClass(TextInputFormat.class);

  if (asSingleMapTask) {
    // One input path would mean only one map task
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount);
    TextInputFormat.setInputPaths(job, path);
  } else {
    // Create two input paths so that two map tasks get triggered. There could be other ways
    // to trigger two map tasks.
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount / 2);
    Path path2 = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput2");
    createInputFile(path2, (writeCount - writeCount / 2));
    TextInputFormat.setInputPaths(job, path, path2);
  }

  job.setOutputFormatClass(HCatOutputFormat.class);

  OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
  if (customDynamicPathPattern != null) {
    job.getConfiguration()
        .set(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN, customDynamicPathPattern);
  }
  HCatOutputFormat.setOutput(job, outputJobInfo);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(DefaultHCatRecord.class);

  job.setNumReduceTasks(0);

  HCatOutputFormat.setSchema(job, new HCatSchema(partitionColumns));

  boolean success = job.waitForCompletion(true);

  // Ensure counters are set when data has actually been read.
  if (partitionValues != null) {
    assertTrue(
        job.getCounters().getGroup("FileSystemCounters").findCounter("FILE_BYTES_READ").getValue()
            > 0);
  }

  if (!HCatUtil.isHadoop23()) {
    // Local mode output committer hook is not invoked in Hadoop 1.x
    if (success) {
      new FileOutputCommitterContainer(job, null).commitJob(job);
    } else {
      new FileOutputCommitterContainer(job, null).abortJob(job, JobStatus.State.FAILED);
    }
  }

  if (assertWrite) {
    // we assert only if we expected to assert with this call.
    Assert.assertEquals(writeCount, MapCreate.writeCount);
  }

  if (isTableExternal()) {
    externalTableLocation = outputJobInfo.getTableInfo().getTableLocation();
  }

  return job;
}