@Override
public int run(String[] args) throws Exception {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(conf, true);
  SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

  conf.setPartitionerClass(TotalOrderPartitioner.class);

  InputSampler.Sampler<IntWritable, Text> sampler =
      new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

  Path input = FileInputFormat.getInputPaths(conf)[0];
  input = input.makeQualified(input.getFileSystem(conf));

  Path partitionFile = new Path(input, "_partitions");
  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  InputSampler.writePartitionFile(conf, sampler);

  // Add to DistributedCache
  URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
  DistributedCache.addCacheFile(partitionUri, conf);
  DistributedCache.createSymlink(conf);

  JobClient.runJob(conf);
  return 0;
}
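The run() method above is a Tool implementation, so it is normally launched through ToolRunner. A minimal sketch of such an entry point follows; the class name SortByTotalOrderDriver is an assumption standing in for whatever class declares run().

// Minimal launcher sketch; SortByTotalOrderDriver is a hypothetical name for the driver class above.
public static void main(String[] args) throws Exception {
  int exitCode = ToolRunner.run(new SortByTotalOrderDriver(), args);
  System.exit(exitCode);
}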
/**
 * Run a distributed job and verify that TokenCache is available.
 *
 * @throws IOException
 */
@Test
public void testTokenCache() throws IOException {
  System.out.println("running dist job");

  // make sure JT starts
  jConf = mrCluster.createJobConf();

  // provide namenode names for the job to get the delegation tokens for
  String nnUri = dfsCluster.getURI(0).toString();
  jConf.set(MRJobConfig.JOB_NAMENODES, nnUri + "," + nnUri);
  // job tracker principal id
  jConf.set(JTConfig.JT_USER_NAME, "jt_id/foo@BAR");

  // using arguments to pass the file name
  String[] args = {
    "-tokenCacheFile", tokenFileName.toString(), "-m", "1", "-r", "1", "-mt", "1", "-rt", "1"
  };

  int res = -1;
  try {
    res = ToolRunner.run(jConf, new MySleepJob(), args);
  } catch (Exception e) {
    System.out.println("Job failed with " + e.getLocalizedMessage());
    e.printStackTrace(System.out);
    fail("Job failed");
  }
  assertEquals("dist job res is not 0", 0, res);
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);

  initInputFormat(jobConf);

  org.apache.hadoop.mapred.InputSplit[] splits =
      realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

  if (splits == null) {
    return null;
  }

  List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

  for (org.apache.hadoop.mapred.InputSplit split : splits) {
    if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
      org.apache.hadoop.mapred.FileSplit mapredFileSplit =
          ((org.apache.hadoop.mapred.FileSplit) split);
      resultSplits.add(
          new FileSplit(
              mapredFileSplit.getPath(),
              mapredFileSplit.getStart(),
              mapredFileSplit.getLength(),
              mapredFileSplit.getLocations()));
    } else {
      resultSplits.add(new InputSplitWrapper(split));
    }
  }

  return resultSplits;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf(
        "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("Max temperature");

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(MaxTemperatureMapper.class);
  conf.setCombinerClass(MaxTemperatureReducer.class);
  conf.setReducerClass(MaxTemperatureReducer.class);

  // vv MaxTemperatureDriverV6
  conf.setProfileEnabled(true);
  conf.setProfileParams(
      "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
  conf.setProfileTaskRange(true, "0-2");
  // ^^ MaxTemperatureDriverV6

  JobClient.runJob(conf);
  return 0;
}
/**
 * Adds the given jars and resources to the job classpath by passing them to the
 * GenericOptionsParser as '-libjars' and '-files' arguments, merging them with any values
 * already set on the job.
 *
 * @param job JobConf to which the classpath is to be added
 * @param jars comma-separated jars to be added to the classpath
 * @param resources comma-separated files to be added to the classpath
 * @throws IOException
 */
public static void addClassPath(final JobConf job, String jars, String resources)
    throws IOException {
  LOGGER.debug(
      "Libraries being added to the classpath: "
          + job.get(JOB_CONF_JARS)
          + ", Resources: "
          + job.get(JOB_CONF_RESOURCES));

  // take the libjars and files values passed from the console when the job was executed
  StringBuilder oldJars = new StringBuilder().append(job.get(JOB_CONF_JARS));
  StringBuilder oldFiles = new StringBuilder().append(job.get(JOB_CONF_RESOURCES));
  String jarsTmp = jars;
  String resourcesTmp = resources;
  if (!oldJars.toString().equals(NULL) && oldJars.length() > 0) {
    oldJars.append(SEPARATOR_COMMA);
    oldJars.append(jarsTmp);
    jarsTmp = oldJars.toString();
  }
  if (resourcesTmp != null && resourcesTmp.length() > 0) {
    if (!oldFiles.toString().equals(NULL) && oldFiles.length() > 0) {
      oldFiles.append(SEPARATOR_COMMA);
      oldFiles.append(resourcesTmp);
      resourcesTmp = oldFiles.toString();
    }
    new GenericOptionsParser(
        job,
        new String[] {GENERIC_PARSER_LIB_JARS, jarsTmp, GENERIC_PARSER_FILES, resourcesTmp});
  } else {
    new GenericOptionsParser(job, new String[] {GENERIC_PARSER_LIB_JARS, jarsTmp});
  }
}
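A minimal usage sketch of addClassPath(); the enclosing utility class name (JobClassPathUtil) and the jar and properties paths are assumptions used only for illustration.

// Hypothetical caller: merges two extra jars and one resource file into the job's
// -libjars / -files settings before submission.
JobConf job = new JobConf(new Configuration());
JobClassPathUtil.addClassPath(
    job,
    "/apps/lib/commons-lang3.jar,/apps/lib/guava.jar", // comma-separated jars (illustrative)
    "/apps/conf/lookup.properties");                   // comma-separated resources (illustrative)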
/**
 * Driver to copy srcPath to destPath depending on the required protocol.
 *
 * @param args arguments
 */
static void copy(final Configuration conf, final Arguments args) throws IOException {
  LOG.info("srcPaths=" + args.srcs);
  LOG.info("destPath=" + args.dst);
  checkSrcPath(conf, args.srcs);

  JobConf job = createJobConf(conf);
  if (args.preservedAttributes != null) {
    job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
  }
  if (args.mapredSslConf != null) {
    job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
  }

  // Initialize the mapper
  try {
    setup(conf, job, args);
    JobClient.runJob(job);
    finalize(conf, job, args.dst, args.preservedAttributes);
  } finally {
    // delete tmp
    fullyDelete(job.get(TMP_DIR_LABEL), job);
    // delete jobDirectory
    fullyDelete(job.get(JOB_DIR_LABEL), job);
  }
}
/**
 * Generate the requested number of file splits, with the filename set to the filename of the
 * output file.
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Set the number of input splits.
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  // getInt/getLong return the default value if the property is not set.
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    throw new IOException("Cannot have test.randomwrite.bytes_per_map set to 0");
  }
  long totalBytesToWrite =
      job.getLong(
          "test.randomwrite.total_bytes",
          numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
  }
  System.out.println("numMaps = " + numMaps);
  InputSplit[] result = new InputSplit[numMaps];
  Path outDir = FileOutputFormat.getOutputPath(job);
  for (int i = 0; i < result.length; ++i) {
    result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
  }
  return result;
}
public JobBuilder reducer(Class<? extends Reducer> reducer, boolean hasCombiner)
    throws IOException {
  if (reducer != IdentityReducer.class) {
    _jobConf.setReducerClass(reducer);
  }
  if (hasCombiner) {
    _jobConf.setCombinerClass(reducer);
  }
  _jobConf.setJarByClass(reducer);
  return this;
}
public JobBuilder compressor(CompressionType type, Class<? extends CompressionCodec> codec)
    throws IOException {
  _jobConf.setBoolean("mapred.output.compress", true);
  _jobConf.set("mapred.output.compression.type", type.toString());
  _jobConf.setClass("mapred.output.compression.codec", codec, CompressionCodec.class);
  return this;
}
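Because both builder methods above return this, they can be chained. The sketch below assumes JobBuilder exposes a constructor wrapping the JobConf it configures (an assumption) and uses a placeholder reducer class.

// Hypothetical chained usage of the two JobBuilder methods shown above.
JobConf jobConf = new JobConf(WordCountReducer.class);  // WordCountReducer is a placeholder
new JobBuilder(jobConf)                                 // assumed constructor wrapping _jobConf
    .reducer(WordCountReducer.class, true)              // reuse the reducer as a combiner
    .compressor(CompressionType.BLOCK, GzipCodec.class);  // block-compressed gzip output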
public void configure(JobConf conf) {
  /*
   * Reads the configuration values and the distributed cache files provided from outside.
   */
  // Read the number of nodes in the input layer and output layer from the configuration
  inputNumdims = conf.get("numdims");
  inputNumhid = conf.get("numhid");
  // Read the weights from the distributed cache
  Path[] pathwaysFiles = new Path[0];
  try {
    pathwaysFiles = DistributedCache.getLocalCacheFiles(conf);
    for (Path path : pathwaysFiles) {
      /*
       * This loop reads all the distributed cache files.
       * In fact, the driver program ensures that there is only one distributed cache file.
       */
      BufferedReader fis = new BufferedReader(new FileReader(path.toString()));
      weightline = fis.readLine();
      fis.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
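For context, a sketch of the driver-side setup this configure() expects: the layer sizes placed in the JobConf and the single weights file shipped through the DistributedCache. The property values, file path, and driver class name are illustrative assumptions.

// Hypothetical driver-side counterpart of the configure() above.
JobConf conf = new JobConf(new Configuration(), RbmDriver.class);  // RbmDriver is a placeholder
conf.set("numdims", "784");  // input layer size (example value)
conf.set("numhid", "500");   // hidden layer size (example value)
// ship the single weights file so getLocalCacheFiles() can find it in the tasks
DistributedCache.addCacheFile(new URI("/rbm/weights.txt"), conf);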
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("UFO count");

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: avro UFO counter <in> <out>");
    System.exit(2);
  }

  FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  FileOutputFormat.setOutputPath(conf, outputPath);
  outputPath.getFileSystem(conf).delete(outputPath, true);

  Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
  AvroJob.setInputSchema(conf, input_schema);
  AvroJob.setMapOutputSchema(
      conf,
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
  AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);

  AvroJob.setMapperClass(conf, AvroRecordMapper.class);
  AvroJob.setReducerClass(conf, AvroRecordReducer.class);
  conf.setInputFormat(AvroInputFormat.class);

  JobClient.runJob(conf);
  return 0;
}
private RunningJob submitAction(Context context, Namespace ns) throws Exception {
  Hive2ActionExecutor ae = new Hive2ActionExecutor();

  WorkflowAction action = context.getAction();

  ae.prepareActionDir(getFileSystem(), context);
  ae.submitLauncher(getFileSystem(), context, action);

  String jobId = action.getExternalId();
  String jobTracker = action.getTrackerUri();
  String consoleUrl = action.getConsoleUrl();
  assertNotNull(jobId);
  assertNotNull(jobTracker);
  assertNotNull(consoleUrl);

  Element e = XmlUtils.parseXml(action.getConf());
  XConfiguration conf =
      new XConfiguration(
          new StringReader(XmlUtils.prettyPrint(e.getChild("configuration", ns)).toString()));
  conf.set("mapred.job.tracker", e.getChildTextTrim("job-tracker", ns));
  conf.set("fs.default.name", e.getChildTextTrim("name-node", ns));
  conf.set("user.name", context.getProtoActionConf().get("user.name"));
  conf.set("group.name", getTestGroup());

  JobConf jobConf = Services.get().get(HadoopAccessorService.class).createJobConf(jobTracker);
  XConfiguration.copy(conf, jobConf);
  String user = jobConf.get("user.name");
  JobClient jobClient =
      Services.get().get(HadoopAccessorService.class).createJobClient(user, jobConf);
  final RunningJob runningJob = jobClient.getJob(JobID.forName(jobId));
  assertNotNull(runningJob);
  return runningJob;
}
@Override
@SuppressWarnings("unchecked")
public void configure(JobConf conf) {
  super.configure(conf);
  keySerializerDefinition = getStoreDef().getKeySerializer();
  valueSerializerDefinition = getStoreDef().getValueSerializer();

  try {
    SerializerFactory factory = new DefaultSerializerFactory();

    if (conf.get("serializer.factory") != null) {
      factory = (SerializerFactory) Class.forName(conf.get("serializer.factory")).newInstance();
    }

    keySerializer = (Serializer<Object>) factory.getSerializer(keySerializerDefinition);
    valueSerializer = (Serializer<Object>) factory.getSerializer(valueSerializerDefinition);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  keyCompressor = new CompressionStrategyFactory().get(keySerializerDefinition.getCompression());
  valueCompressor =
      new CompressionStrategyFactory().get(valueSerializerDefinition.getCompression());
  routingStrategy =
      new ConsistentRoutingStrategy(
          getCluster().getNodes(), getStoreDef().getReplicationFactor());
}
private static void setJobName(JobConf jobConf, List<DataSegment> segments) {
  if (segments.size() == 1) {
    final DataSegment segment = segments.get(0);
    jobConf.setJobName(
        String.format(
            "druid-convert-%s-%s-%s",
            segment.getDataSource(), segment.getInterval(), segment.getVersion()));
  } else {
    final Set<String> dataSources =
        Sets.newHashSet(
            Iterables.transform(
                segments,
                new Function<DataSegment, String>() {
                  @Override
                  public String apply(DataSegment input) {
                    return input.getDataSource();
                  }
                }));
    final Set<String> versions =
        Sets.newHashSet(
            Iterables.transform(
                segments,
                new Function<DataSegment, String>() {
                  @Override
                  public String apply(DataSegment input) {
                    return input.getVersion();
                  }
                }));
    jobConf.setJobName(
        String.format(
            "druid-convert-%s-%s",
            Arrays.toString(dataSources.toArray()), Arrays.toString(versions.toArray())));
  }
}
public List<JavaScriptSource> setUpSource(JobConf job) {
  List<JavaScriptSource> js = new ArrayList<JavaScriptSource>();
  js.add(
      new JavaScriptSource(
          "emit.js", "function emit(k,v){$mapper.emit(k,v,$output_collector)}"));
  js.add(new JavaScriptSource("map.js", job.get("map.js")));
  js.add(new JavaScriptSource("reduce.js", job.get("reduce.js")));
  js.add(new JavaScriptSource("functions.js", job.get("functions.js")));
  js.add(new JavaScriptSource("filter.js", job.get("filter.js")));
  js.add(new JavaScriptSource("query_id", job.get("query_id")));
  try {
    Path[] files = DistributedCache.getLocalCacheFiles(job);
    if (files != null) {
      for (int i = 0; i < files.length; i++) {
        Path path = files[i];
        if (path.getName().endsWith(".js")) {
          String source = Files.toString(new File(path.toString()), Charset.forName("UTF-8"));
          js.add(new JavaScriptSource(path.getName(), source));
        }
      }
    }
  } catch (IOException ioe) {
    throw new RuntimeException("Couldn't read from DistributedCache", ioe);
  }
  return js;
}
private void init() {
  String nodes = conf.get(AnalysisProcessorConfiguration.nodes);
  this.nodelist = String2List(nodes, SEPERATOR_COMMA);

  String status = conf.get("status");
  this.statuslist = String2List(status, SEPERATOR_COMMA);

  parsePhase();
}
public void run() throws Exception {
  long startTime = System.currentTimeMillis();
  JobConf conf = new JobConf(ItemCFJob.class);
  conf.setJobName("ItemCF" + System.currentTimeMillis());
  conf.setNumMapTasks(10);
  conf.set(
      "io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
          + "org.apache.hadoop.io.serializer.WritableSerialization");

  StringBuilder sb = new StringBuilder();
  sb.append("--input ").append(input);
  sb.append(" --output ").append(output);
  if (flag) {
    sb.append(" --booleanData true");
  } else {
    sb.append(" --booleanData false");
  }
  sb.append(" --similarityClassname " + Constants.mahout_similarityclassname);
  sb.append(" --tempDir ").append(tmp);
  String[] args = sb.toString().split(" ");

  RecommenderJob job = new RecommenderJob();
  job.setConf(conf);
  job.run(args);

  long endTime = System.currentTimeMillis();
  logger.info(
      "recommendation job ["
          + conf.getJobName()
          + "] finished. It took "
          + (endTime - startTime) / 1000
          + "s.");
}
@Test
public void testConfigureAccumuloInputFormatWithIterators() throws Exception {
  AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(conf);
  ColumnMapper columnMapper =
      new ColumnMapper(
          conf.get(AccumuloSerDeParameters.COLUMN_MAPPINGS),
          conf.get(AccumuloSerDeParameters.DEFAULT_STORAGE_TYPE),
          columnNames,
          columnTypes);
  Set<Pair<Text, Text>> cfCqPairs =
      inputformat.getPairCollection(columnMapper.getColumnMappings());
  List<IteratorSetting> iterators = new ArrayList<IteratorSetting>();
  Set<Range> ranges = Collections.singleton(new Range());
  String instanceName = "realInstance";
  String zookeepers = "host1:2181,host2:2181,host3:2181";

  IteratorSetting cfg = new IteratorSetting(50, PrimitiveComparisonFilter.class);
  cfg.addOption(PrimitiveComparisonFilter.P_COMPARE_CLASS, StringCompare.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.COMPARE_OPT_CLASS, Equal.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.CONST_VAL, "dave");
  cfg.addOption(PrimitiveComparisonFilter.COLUMN, "person:name");
  iterators.add(cfg);

  cfg = new IteratorSetting(50, PrimitiveComparisonFilter.class);
  cfg.addOption(PrimitiveComparisonFilter.P_COMPARE_CLASS, IntCompare.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.COMPARE_OPT_CLASS, Equal.class.getName());
  cfg.addOption(PrimitiveComparisonFilter.CONST_VAL, "50");
  cfg.addOption(PrimitiveComparisonFilter.COLUMN, "person:age");
  iterators.add(cfg);

  ZooKeeperInstance zkInstance = Mockito.mock(ZooKeeperInstance.class);
  HiveAccumuloTableInputFormat mockInputFormat =
      Mockito.mock(HiveAccumuloTableInputFormat.class);

  // Stub out the ZKI mock
  Mockito.when(zkInstance.getInstanceName()).thenReturn(instanceName);
  Mockito.when(zkInstance.getZooKeepers()).thenReturn(zookeepers);

  // Call out to the real configure method
  Mockito.doCallRealMethod()
      .when(mockInputFormat)
      .configure(conf, zkInstance, con, accumuloParams, columnMapper, iterators, ranges);

  // Also compute the correct cf:cq pairs so we can assert the right argument was passed
  Mockito.doCallRealMethod()
      .when(mockInputFormat)
      .getPairCollection(columnMapper.getColumnMappings());

  mockInputFormat.configure(
      conf, zkInstance, con, accumuloParams, columnMapper, iterators, ranges);

  // Verify that the correct methods are invoked on AccumuloInputFormat
  Mockito.verify(mockInputFormat).setZooKeeperInstance(conf, instanceName, zookeepers, false);
  Mockito.verify(mockInputFormat).setConnectorInfo(conf, USER, new PasswordToken(PASS));
  Mockito.verify(mockInputFormat).setInputTableName(conf, TEST_TABLE);
  Mockito.verify(mockInputFormat)
      .setScanAuthorizations(conf, con.securityOperations().getUserAuthorizations(USER));
  Mockito.verify(mockInputFormat).addIterators(conf, iterators);
  Mockito.verify(mockInputFormat).setRanges(conf, ranges);
  Mockito.verify(mockInputFormat).fetchColumns(conf, cfCqPairs);
}
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  JobConf conf = new JobConf();
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(5);

  FileSystem fs = FileSystem.get(conf);
  Path dir = new Path(args[0]);
  FileStatus[] stats = fs.listStatus(dir);
  numFiles = stats.length;

  Job job = new Job(conf);
  job.setJarByClass(FileCombiner.class);
  job.setJobName("File Combiner");

  job.setMapperClass(FileCombinerMapper.class);
  job.setReducerClass(FileCombinerReducer.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
public void configure(JobConf job) {
  sLogger.setLevel(Level.INFO);

  srcLang = job.get("fLang");
  mJob = job;
  pwsimMapping = new HMapIV<ArrayListOfIntsWritable>();

  valOut = new PairOfIntString();
  keyOut = new PairOfInts();

  // read doc ids of sample into vectors
  String samplesFile = job.get("Ivory.SampleFile");
  if (samplesFile != null) {
    try {
      samplesMap = readSamplesFromCache(getFilename(samplesFile), job);
    } catch (NumberFormatException e) {
      e.printStackTrace();
      throw new RuntimeException("Incorrect format in " + samplesFile);
    } catch (IOException e) {
      e.printStackTrace();
      throw new RuntimeException("I/O error in " + samplesFile);
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException("Error reading sample file: " + samplesFile);
    }
  }
}
@Override
public void map(
    WritableComparable key,
    CompactorInputSplit split,
    OutputCollector<NullWritable, NullWritable> nullWritableVOutputCollector,
    Reporter reporter)
    throws IOException {
  // This will only get called once, since CompactRecordReader only returns one record,
  // the input split.
  // Based on the split we're passed we go instantiate the real reader and then iterate on it
  // until it finishes.
  @SuppressWarnings("unchecked") // since there is no way to parametrize instance of Class
  AcidInputFormat<WritableComparable, V> aif =
      instantiate(AcidInputFormat.class, jobConf.get(INPUT_FORMAT_CLASS_NAME));
  ValidTxnList txnList = new ValidReadTxnList(jobConf.get(ValidTxnList.VALID_TXNS_KEY));

  boolean isMajor = jobConf.getBoolean(IS_MAJOR, false);
  AcidInputFormat.RawReader<V> reader =
      aif.getRawReader(
          jobConf,
          isMajor,
          split.getBucket(),
          txnList,
          split.getBaseDir(),
          split.getDeltaDirs());
  RecordIdentifier identifier = reader.createKey();
  V value = reader.createValue();
  getWriter(reporter, reader.getObjectInspector(), split.getBucket());
  while (reader.next(identifier, value)) {
    if (isMajor && reader.isDelete(value)) {
      continue;
    }
    writer.write(value);
    reporter.progress();
  }
}
@SuppressWarnings({"deprecation", "unchecked"}) public void testMergeShouldReturnProperProgress(List<Segment<Text, Text>> segments) throws IOException { Path tmpDir = new Path("localpath"); Class<Text> keyClass = (Class<Text>) jobConf.getMapOutputKeyClass(); Class<Text> valueClass = (Class<Text>) jobConf.getMapOutputValueClass(); RawComparator<Text> comparator = jobConf.getOutputKeyComparator(); Counter readsCounter = new Counter(); Counter writesCounter = new Counter(); Progress mergePhase = new Progress(); RawKeyValueIterator mergeQueue = Merger.merge( conf, fs, keyClass, valueClass, segments, 2, tmpDir, comparator, getReporter(), readsCounter, writesCounter, mergePhase); Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), 0.0f); }
@Override
public void configure(JobConf conf) {
  this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
  int reducerID = conf.getInt("mapred.task.partition", -1);
  int max = conf.getInt(PARAM_APS_MAXKEY, 0);
  int nstripes = conf.getInt(PARAM_APS_STRIPES, 1);
  int spread = conf.getInt(PARAM_APS_REDUCER_PER_STRIPE, 1);
  if (reducerID < 0 || max == 0) {
    LOG.error("Could not find stripe ID, reverting to whole rest file loading");
    LOG.debug("reducer = " + reducerID + "\t max = " + max + "\t nstripes = " + nstripes);
    // open the pruned part file in the DistributedCache
    haspruned = FileUtils.readRestFile(conf, pruned);
  } else {
    int stripe = GenericKey.StripePartitioner.findStripe(reducerID, spread);
    int from = GenericKey.StripePartitioner.minKeyInStripe(stripe, nstripes, max);
    int to = from + GenericKey.StripePartitioner.numKeysInStripe(stripe, nstripes, max);
    // read from 'from' included, to 'to' excluded
    LOG.info(
        "Reducer "
            + reducerID
            + " loading stripe "
            + stripe
            + " of "
            + nstripes
            + " ("
            + from
            + ","
            + (to - 1)
            + ")");
    haspruned = FileUtils.readRestFile(conf, pruned, from, to);
  }
  if (!haspruned) {
    LOG.warn("No pruned file provided in DistributedCache");
  } else {
    LOG.info("Read " + pruned.size() + " entries from pruned file");
  }
}
public void configure(JobConf conf) {
  numberOfCenters = Integer.valueOf(conf.get("numberOfCenters"));
  centersDirectory = conf.get("centersReadDirectory");

  try {
    Configuration c = new Configuration();
    FileSystem fs = FileSystem.get(c);

    for (int index = 0; index < numberOfCenters; ++index) {
      SequenceFile.Reader reader =
          new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c);

      LongWritable key = new LongWritable();
      Point value = new Point();

      reader.next(key, value);
      Point center = (Point) value;

      centers.add(center);

      reader.close();
    }
  } catch (IOException e) {
    // Reading the centers is expected to succeed; log the failure rather than swallow it
    System.out.println("Failed to read cluster centers from " + centersDirectory);
    e.printStackTrace();
  }
}
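A sketch of how the per-index center files read above could be produced, assuming the same Point writable and one (LongWritable, Point) record per file; the initialCenters list and the paths are illustrative assumptions.

// Hypothetical writer-side counterpart: one SequenceFile per center under <centersDirectory>/centers/.
Configuration c = new Configuration();
FileSystem fs = FileSystem.get(c);
for (int index = 0; index < initialCenters.size(); ++index) {  // initialCenters: List<Point>, assumed
  SequenceFile.Writer writer =
      SequenceFile.createWriter(
          fs, c, new Path(centersDirectory + "/centers/" + index), LongWritable.class, Point.class);
  writer.append(new LongWritable(index), initialCenters.get(index));
  writer.close();
}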
public RunningJob run(String inputPath, String outputPath) throws Exception {
  sLogger.info("Tool name: BuildGraph");
  sLogger.info(" - input: " + inputPath);
  sLogger.info(" - output: " + outputPath);

  JobConf conf = new JobConf(BuildGraph.class);
  conf.setJobName("BuildGraph " + inputPath + " " + ContrailConfig.K);

  ContrailConfig.initializeConfiguration(conf);

  FileInputFormat.addInputPath(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(BuildGraphMapper.class);
  conf.setReducerClass(BuildGraphReducer.class);

  // delete the output directory if it exists already
  FileSystem.get(conf).delete(new Path(outputPath), true);

  return JobClient.runJob(conf);
}
@Override
public void close() {
  System.err.println(
      "Target: " + vocE.size() + " types. Writing to " + job_.get("root", null) + "/vocab.E");
  System.err.println(
      "Source: " + vocF.size() + " types. Writing to " + job_.get("root", null) + "/vocab.F");

  // write out vocabulary to file
  try {
    FileSystem fs = FileSystem.get(job_);
    DataOutputStream dos =
        new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.E"))));
    ((VocabularyWritable) vocE).write(dos);
    dos.close();
    DataOutputStream dos2 =
        new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.F"))));
    ((VocabularyWritable) vocF).write(dos2);
    dos2.close();
  } catch (IOException e) {
    throw new RuntimeException("Vocab couldn't be written to disk.\n" + e.toString());
  }
}
/**
 * {@inheritDoc}
 *
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
  JobConf configuration = new JobConf(getConf(), WordCountExtended.class);
  configuration.setJobName(JOB_NAME);

  configuration.setOutputKeyClass(Text.class);
  configuration.setOutputValueClass(IntWritable.class);

  configuration.setMapperClass(Map.class);
  configuration.setCombinerClass(Reduce.class);
  configuration.setReducerClass(Reduce.class);

  configuration.setInputFormat(TextInputFormat.class);
  configuration.setOutputFormat(TextOutputFormat.class);

  List<String> otherArgs = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    if (JOB_SKIP_ARGUMENT.equals(args[i])) {
      DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration);
      configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true);
    } else {
      otherArgs.add(args[i]);
    }
  }

  FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0)));
  FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1)));

  JobClient.runJob(configuration);
  return 0;
}
private void setPageRankLinksOptions(JobConf job) throws URISyntaxException {
  job.setLong("pages", options.getNumPages());
  job.setLong("slotpages", options.getNumSlotPages());
  job.set("delimiter", cdelim);
  Utils.shareLinkZipfCore(options, job);
}
public RunningJob run(String inputPath, String outputPath) throws Exception {
  sLogger.info("Tool name: Compressible");
  sLogger.info(" - input: " + inputPath);
  sLogger.info(" - output: " + outputPath);

  // JobConf conf = new JobConf(Stats.class);
  JobConf conf = new JobConf(Compressible.class);
  conf.setJobName("Compressible " + inputPath);

  BrushConfig.initializeConfiguration(conf);

  FileInputFormat.addInputPath(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(CompressibleMapper.class);
  conf.setReducerClass(CompressibleReducer.class);

  // delete the output directory if it exists already
  FileSystem.get(conf).delete(new Path(outputPath), true);

  return JobClient.runJob(conf);
}
/**
 * Configure the job.
 *
 * @param conf Job to configure
 * @param rules classification rules to evaluate
 * @param target label value to evaluate the rules for
 * @param inpath input path (the dataset)
 * @param outpath output <code>Path</code>
 * @param split DatasetSplit used to separate training and testing input
 */
private static void configureJob(
    JobConf conf,
    List<? extends Rule> rules,
    int target,
    Path inpath,
    Path outpath,
    DatasetSplit split) {
  split.storeJobParameters(conf);

  FileInputFormat.setInputPaths(conf, inpath);
  FileOutputFormat.setOutputPath(conf, outpath);

  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(CDFitness.class);

  conf.setMapperClass(CDMapper.class);
  conf.setCombinerClass(CDReducer.class);
  conf.setReducerClass(CDReducer.class);

  conf.setInputFormat(DatasetTextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  // store the parameters
  conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
  conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
  conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}