@Override public int run(String[] args) throws Exception { final int ret = parseArgs(args); if (ret < 0) { return ret; } Job job = Job.getInstance(getConf()); job.setJarByClass(GreeDiFirst.class); job.setJobName(String.format("Coverage-GreeDiFirst[%s %s]", partitionCount, selectCount)); job.getConfiguration().setInt(Fields.PARTITION_COUNT.get(), partitionCount); job.getConfiguration().setInt(Fields.SELECT_COUNT.get(), selectCount); job.setNumReduceTasks(partitionCount); SetupHelper.getInstance().setSequenceInput(job, inputPath).setTextOutput(job, outputPath); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DocumentWithVectorWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(Map.class); job.setReducerClass(GreeDiReducer.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
/**
 * Assigns the incoming record to a partition and emits it under that partition's key.
 *
 * <p>The key text is parsed as a decimal integer and reduced modulo the configured partition
 * count. The emitted key is always in {@code [0, partitionCount)}, also for negative ids.
 *
 * @param key record key; its text must be parseable by {@link Integer#parseInt(String)}
 * @param value vector associated with the key
 * @param context task context, used to read the partition count and to emit the output
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 * @throws NumberFormatException if the key text is not a valid integer
 */
@Override
public void map(Text key, VectorWritable value, Context context)
    throws IOException, InterruptedException {
  final int partitionCount =
      context
          .getConfiguration()
          .getInt(Fields.PARTITION_COUNT.get(), Defaults.PARTITION_COUNT.get());

  final int id = Integer.parseInt(key.toString());

  // Java's % keeps the sign of the dividend; shift negative remainders into range so that
  // no more than partitionCount distinct keys are ever produced.
  int partition = id % partitionCount;
  if (partition < 0) {
    partition += partitionCount;
  }

  final IntWritable outKey = new IntWritable(partition);
  final DocumentWithVectorWritable outValue = new DocumentWithVectorWritable(key, value);
  context.write(outKey, outValue);
}
/**
 * Parses the command-line arguments and stores them in {@code inputPath}, {@code outputPath},
 * {@code partitionCount} and {@code selectCount}.
 *
 * <p>The input path, output path and select count options are required; when any of them is
 * missing the usage text is printed. The partition count is optional and falls back to
 * {@code Defaults.PARTITION_COUNT}; when given it must be a positive integer.
 *
 * @param args raw command-line arguments
 * @return {@code 0} on success, {@code -1} if the arguments are missing or malformed
 */
@SuppressWarnings("static-access")
private int parseArgs(String[] args) {
  Options options = new Options();
  options.addOption(
      OptionBuilder.withArgName("path")
          .hasArg()
          .withDescription("Tfidf vectors")
          .create(Fields.INPUT.get()));
  options.addOption(
      OptionBuilder.withArgName("path")
          .hasArg()
          .withDescription("Selected articles")
          .create(Fields.OUTPUT.get()));
  options.addOption(
      OptionBuilder.withArgName("integer")
          .hasArg()
          .withDescription("Partition count")
          .create(Fields.PARTITION_COUNT.get()));
  options.addOption(
      OptionBuilder.withArgName("integer")
          .hasArg()
          .withDescription("Select count")
          .create(Fields.SELECT_COUNT.get()));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    logger.fatal("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  // The select count has no default, so it is required just like the two paths; without this
  // check a missing value would surface as an exception from Integer.parseInt(null).
  if (!cmdline.hasOption(Fields.INPUT.get())
      || !cmdline.hasOption(Fields.OUTPUT.get())
      || !cmdline.hasOption(Fields.SELECT_COUNT.get())) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  inputPath = cmdline.getOptionValue(Fields.INPUT.get());
  outputPath = cmdline.getOptionValue(Fields.OUTPUT.get());

  partitionCount = Defaults.PARTITION_COUNT.get();
  try {
    if (cmdline.hasOption(Fields.PARTITION_COUNT.get())) {
      partitionCount = Integer.parseInt(cmdline.getOptionValue(Fields.PARTITION_COUNT.get()));
      if (partitionCount <= 0) {
        System.err.println("Error: \"" + partitionCount + "\" has to be positive!");
        return -1;
      }
    }
    selectCount = Integer.parseInt(cmdline.getOptionValue(Fields.SELECT_COUNT.get()));
  } catch (NumberFormatException exp) {
    // Report non-numeric values the same way as other malformed command lines.
    logger.fatal("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  logger.info("Tool name: " + this.getClass().getName());
  logger.info(" - input: " + inputPath);
  logger.info(" - output: " + outputPath);
  logger.info(" - partitions: " + partitionCount);
  logger.info(" - select: " + selectCount);

  return 0;
}