Java TextInputFormat示例，org.apache.hadoop.mapred.TextInputFormat Java示例

示例#1

0

显示文件

文件： Step0JobTest.java 项目： maximzhao/Mahout-GSOC-LibLinear

  /**
   * Drives {@code Step0Mapper} by hand over each input split of a generated dataset and checks
   * the output it produces per partition.
   *
   * <p>For every partition {@code p} the split's records are read through a
   * {@code TextInputFormat} record reader and passed to a mapper configured with {@code p}. The
   * test then asserts that the collector holds key {@code p}, that the reported first id is the
   * key of the first record read from the split, and that the reported size is the number of
   * records read.
   *
   * @throws Exception if preparing the data, computing the splits, reading or mapping fails
   */
  public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    // work on a sorted copy so the array returned by getSplits is left untouched
    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      // key of the first record of this split; stays null if the split yields no record
      Long firstKey = null;
      // number of records read from this split
      int size = 0;

      try {
        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        while (reader.next(key, value)) {
          if (firstKey == null) {
            firstKey = key.get();
          }

          mapper.map(key, value, collector, reporter);

          size++;
        }

        mapper.close();
      } finally {
        // one reader is opened per split; release it even when reading or mapping fails
        reader.close();
      }

      // validate the mapper's output
      assertEquals(p, collector.keys[p]);
      assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
      assertEquals(size, collector.values[p].getSize());
    }
  }

示例#2

0

显示文件

文件： ManyTableJob.java 项目： babokim/cloudata

  /**
   * Runs a map-only job that feeds the text input at {@code keyPath} to
   * {@code ManyTableGetMap}, then removes the job's temporary output directory.
   *
   * <p>The job runs with zero reduce tasks and with speculative map execution disabled. The
   * temporary output path and the resources set up by
   * {@code CloudataMapReduceUtil.initMapReduce} are cleaned up whether or not the job succeeds.
   *
   * @param conf configuration whose user id is stored in the job under {@code "user.name"}
   * @param keyPath input path added to the job as text input
   * @throws IOException if running the job or deleting the temporary output fails
   */
  public static void getData(CloudataConf conf, Path keyPath) throws IOException {
    JobConf jobConf = new JobConf(TeraReadJob.class);
    jobConf.set("user.name", conf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // timestamped name so that every invocation gets its own output directory
    Path tempOutputPath = new Path("ManyTableJob_Get_" + System.currentTimeMillis());

    jobConf.setJobName("ManyTableJob_Get_" + "(" + new Date() + ")");

    TextOutputFormat.setOutputPath(jobConf, tempOutputPath);
    // <MAP>
    jobConf.setMapperClass(ManyTableGetMap.class);
    jobConf.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(jobConf, keyPath);
    jobConf.setMapSpeculativeExecution(false);
    // NOTE(review): a maximum of 0 map attempts looks unusual — confirm this is intended
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
      // Run Job
      JobClient.runJob(jobConf);
    } finally {
      try {
        // delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
      } finally {
        // must run even when the delete above fails, otherwise libDir is never cleared
        CloudataMapReduceUtil.clearMapReduce(libDir);
      }
    }
  }

示例#3

0

显示文件

文件： TextRecordParser.java 项目： brockn/IterativeReduce

 /**
  * Replaces {@code reader} with a new record reader obtained from {@code input_format} for the
  * current split and job configuration.
  *
  * <p>The reader being replaced is closed first so that repeated resets do not accumulate open
  * readers. Failures are handled on a best-effort basis: an {@link IOException} from closing or
  * from opening is printed and not rethrown, and if opening fails {@code reader} keeps
  * referring to the previous reader, which has been closed by then.
  */
 @Override
 public void reset() {
   if (this.reader != null) {
     try {
       this.reader.close();
     } catch (IOException e) {
       // a failed close must not prevent the reader from being re-opened below
       e.printStackTrace();
     }
   }

   try {
     this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
   } catch (IOException e) {
     // NOTE(review): the failure is only printed; callers cannot detect that the reset failed
     e.printStackTrace();
   }
 }

示例#4

0

显示文件

文件： BuildPageRankRecords.java 项目： kensk8er/MapReduceAssignment

  /** Runs this tool. */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int n = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildPageRankRecords");
    sLogger.info(" - inputDir: " + inputPath);
    sLogger.info(" - outputDir: " + outputPath);
    sLogger.info(" - numNodes: " + n);

    JobConf conf = new JobConf(BuildPageRankRecords.class);
    conf.setJobName("PackageLinkGraph");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);

    conf.setInt("NodeCnt", n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }

示例#5

0

显示文件

文件： TextRecordParser.java 项目： brockn/IterativeReduce

  @Override
  public void setFile(String file, long offset, long length) {
    JobConf defaultConf = new JobConf();
    this.split = new FileSplit(new Path(file), offset, length, defaultConf);

    this.jobConf = defaultConf;
    // this.split = split;
    this.input_format = new TextInputFormat();

    try {
      this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    this.key = reader.createKey();
  }

示例#6

0

显示文件

文件： FileTest.java 项目： RyanFu/shopJob

  /**
   * Configures and runs the file filter job.
   *
   * <p>Expects four arguments: input path, output path, input type and split character. The
   * split character is stored in the job under {@code "user.split"}. An input type of
   * {@code "text"} selects {@code TextInputFormat}; any other value selects
   * {@code SequenceFileInputFormat}. When the output path already exists the job is not
   * submitted and a notice is printed instead.
   *
   * @param args {@code [input, output, type, splitChar]}
   * @return {@code -1} when fewer than four arguments are given, otherwise
   *     {@code JobClient.SUCCESS}
   * @throws Exception if accessing the file system or running the job fails
   */
  public int run(String[] args) throws Exception {
    if (args.length < 4) {
      System.out.println("ERROR: Please Enter args : input output type(text|seq) splitChar(9=\t)");
      // nothing was run, so do not report success to the caller
      return -1;
    }
    String input = args[0];
    String output = args[1];
    String type = args[2];
    String splitChar = args[3];

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.split", splitChar);

    config.setJobName("File Filter -" + System.currentTimeMillis());
    config.setNumReduceTasks(10);
    config.setReducerClass(IdentityReducer.class);
    config.setMapperClass(FileTestMapper.class);
    if ("text".equals(type)) {
      config.setInputFormat(TextInputFormat.class);
      TextInputFormat.addInputPath(config, new Path(input));
    } else {
      // every type other than "text" is read as a sequence file
      config.setInputFormat(SequenceFileInputFormat.class);
      SequenceFileInputFormat.addInputPath(config, new Path(input));
    }
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return; the file system is resolved from the output path so
    // that a path outside the default file system can be checked as well
    Path outputPath = new Path(output);
    FileSystem fs = outputPath.getFileSystem(config);
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }

示例#7

0

显示文件

文件： UserViewMuliHostStepThreeGroup.java 项目： RyanFu/shopJob

  /**
   * Configures and runs this job, using this class as both mapper and reducer.
   *
   * <p>When no argument is given, {@code DateStringUtils.now()} is used as the only argument
   * and a notice is printed. The arguments are stored in the job under {@code "user.args"}.
   * Input paths come from {@code getInputPath(args)} and are only added when
   * {@code getInputFormat()} is {@code TextInputFormat} or {@code SequenceFileInputFormat}.
   * When the output path already exists the job is not submitted and a notice is printed
   * instead.
   *
   * @param args job arguments; the notice printed when they are empty asks for a date such as
   *     {@code 20101010}
   * @return {@code JobClient.SUCCESS}, whether the job ran or was skipped
   * @throws Exception if accessing the file system or running the job fails
   */
  public int run(String[] args) throws Exception {
    if (args.length < 1) {
      args = new String[] {DateStringUtils.now()};
      // print the value actually stored in args rather than the result of a second now() call
      System.out.println(
          "ERROR: Please Enter Date , eg. 20101010 ! now use default => " + args[0]);
    }

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.args", Utils.asString(args));

    config.setJobName(getClass() + "-" + System.currentTimeMillis());
    config.setNumReduceTasks(100);
    config.setMapperClass(getClass());
    config.setReducerClass(getClass());
    config.setInputFormat(getInputFormat());
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    // add input paths
    for (String path : getInputPath(args)) {
      if (TextInputFormat.class.equals(getInputFormat())) {
        TextInputFormat.addInputPath(config, new Path(path));
      } else if (SequenceFileInputFormat.class.equals(getInputFormat())) {
        SequenceFileInputFormat.addInputPath(config, new Path(path));
      }
      // NOTE(review): for any other input format no input path is added — confirm intended
    }

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return; the file system is resolved from the output path so
    // that a path outside the default file system can be checked as well
    Path outputPath = new Path(getOutputPath(args));
    FileSystem fs = outputPath.getFileSystem(config);
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }

示例#8

0

显示文件

文件： NewItemDailyFrom.java 项目： RyanFu/shopJob

  /**
   * Configures and runs the {@code NewItemDailyFrom} job for one day.
   *
   * <p>The day is taken from the first argument; when no argument is given,
   * {@code TaobaoPath.now()} is used and a notice is printed. If the output location for that
   * day already exists, nothing is run. Otherwise a job with {@code MapClass} as mapper and
   * {@code LongSumReducer} as its single reducer is submitted, with the day stored in the job
   * under {@code "user.date"}.
   *
   * @param args optional; {@code args[0]} is the day, e.g. {@code 20100507}
   * @return {@code -1} when the output for the day already exists, otherwise
   *     {@code JobClient.SUCCESS}
   * @throws Exception if accessing the file system or running the job fails
   */
  public int run(String[] args) throws Exception {

    // No day supplied: fall back to the current one and say so.
    if (args.length < 1) {
      args = new String[] {TaobaoPath.now()};
      String notice =
          "ERROR: Please Enter Date , eg. 20100507  now use default!" + TaobaoPath.now();
      System.out.println(notice);
    }

    JobConf job = new JobConf(getConf(), NewItemDailyFrom.class);
    job.setJobName("NewItemDailyFrom-" + System.currentTimeMillis());

    final String day = args[0];

    // Refuse to run again when the output for this day is already present.
    boolean alreadyDone =
        FileSystem.get(job).exists(TaobaoPath.getOutput("new_item_daily_from", day));
    if (alreadyDone) {
      StringBuilder report = new StringBuilder("ERROR: You has finish this job at this day :  ");
      report.append(day);
      report.append(" [ ");
      report.append(TaobaoPath.getOutput("new_item_daily_from", day));
      report.append(" ] ");
      System.out.println(report.toString());
      return -1;
    }

    job.set("user.date", day);

    // Input side.
    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(job, TaobaoPath.hiveAuctionAuctions(day));

    // Map and reduce stages.
    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(LongSumReducer.class);
    job.setNumReduceTasks(1);

    // Output side.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileOutputFormat.setOutputPath(job, TaobaoPath.getOutput("new_item_daily_from", day));

    JobClient.runJob(job);

    return JobClient.SUCCESS;
  }

示例#9

0

显示文件

文件： XMLInputFormat.java 项目： jordanbg/Cloud9

 /**
  * Configures this input format by delegating to the superclass implementation; no additional
  * set-up is performed here.
  *
  * @param jobConf job configuration handed on unchanged to {@code super.configure}
  */
 public void configure(JobConf jobConf) {
   super.configure(jobConf);
 }

示例#10

0

显示文件

文件： Step0JobTest.java 项目： maximzhao/Mahout-GSOC-LibLinear

  /**
   * Checks that {@code Step0Job.processOutput} yields, for every partition, the id of the first
   * instance of that partition.
   *
   * <p>The label of each generated instance is set to its index in the dataset. For every split
   * the test reads the records through a {@code TextInputFormat} record reader, remembers the
   * label of the first record as the expected id, and builds a {@code Step0Output} from the
   * first record's key and the number of records. The first ids extracted from the processed
   * output must equal the expected ids.
   *
   * @throws Exception if preparing the data, computing the splits or reading a split fails
   */
  public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
      source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    // work on a sorted copy so the array returned by getSplits is left untouched
    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      // key of the first record of this split; stays null if the split yields no record
      Long firstKey = null;
      // number of records read from this split
      int size = 0;

      try {
        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        while (reader.next(key, value)) {
          if (firstKey == null) {
            firstKey = key.get();
            // the label of the split's first record is the id expected for partition p
            expectedIds[p] = converter.convert(0, value.toString()).label;
          }

          size++;
        }
      } finally {
        // one reader is opened per split; release it even when reading fails
        reader.close();
      }

      keys[p] = p;
      values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue(
        "Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
        Arrays.equals(expectedIds, actualIds));
  }