@Override
public void stage(MultiStagedJob mjob) {
    mjob.removeIntermediate(true);

    // Stage 1: collate word counts, applying the word-count and time thresholds
    SequenceFileStage<Text, BytesWritable, Text, LongWritable, LongWritable, Text> collateWords =
            new SequenceFileStage<Text, BytesWritable, Text, LongWritable, LongWritable, Text>() {
                @Override
                public void setup(Job job) {
                    job.getConfiguration().setInt(WORDCOUNT_THRESH, wordCountThreshold);
                    job.getConfiguration().setInt(WORDCOUNT_TIMETHRESH, wordTimeThreshold);
                    job.setNumReduceTasks(1);
                }

                @Override
                public Class<? extends Mapper<Text, BytesWritable, Text, LongWritable>> mapper() {
                    return WordIndex.Map.class;
                }

                @Override
                public Class<? extends Reducer<Text, LongWritable, LongWritable, Text>> reducer() {
                    return WordIndex.Reduce.class;
                }

                @Override
                public String outname() {
                    return "words-collated";
                }
            };

    // Stage 2: sort the collated words by count in descending order; WORDCOUNT_TOPN limits the output
    SequenceFileTextStage<LongWritable, Text, LongWritable, Text, NullWritable, Text> sortedWords =
            new SequenceFileTextStage<LongWritable, Text, LongWritable, Text, NullWritable, Text>() {
                @Override
                public void setup(Job job) {
                    job.getConfiguration().setInt(WORDCOUNT_TOPN, topNWords);
                    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
                    job.setNumReduceTasks(1);
                }

                @Override
                public Class<? extends Reducer<LongWritable, Text, NullWritable, Text>> reducer() {
                    return WordIndexSort.Reduce.class;
                }

                @Override
                public String outname() {
                    return "words";
                }
            };

    mjob.queueStage(collateWords);
    mjob.queueStage(sortedWords);
}
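For orientation, here is a minimal, hypothetical sketch of the map/reduce shape the collateWords stage implies: word-keyed counts in, count-keyed words out, with a count threshold applied in the reducer. It is not the actual WordIndex.Map/WordIndex.Reduce code; the real mapper consumes BytesWritable word records and the real configuration keys are the WORDCOUNT_* constants above.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical sketch only: not the OpenIMAJ WordIndex implementation.
public class CollateSketch {
    // Simplified mapper: passes word/count pairs straight through.
    public static class SketchMap extends Mapper<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void map(Text word, LongWritable count, Context context)
                throws IOException, InterruptedException {
            context.write(word, count);
        }
    }

    // Simplified reducer: sums counts per word, drops rare words, and keys the
    // output on the count so a later stage can sort by it.
    public static class SketchReduce extends Reducer<Text, LongWritable, LongWritable, Text> {
        private long threshold;

        @Override
        protected void setup(Context context) {
            // "sketch.wordcount.thresh" is a made-up key standing in for WORDCOUNT_THRESH
            threshold = context.getConfiguration().getLong("sketch.wordcount.thresh", 0);
        }

        @Override
        protected void reduce(Text word, Iterable<LongWritable> counts, Context context)
                throws IOException, InterruptedException {
            long total = 0;
            for (LongWritable c : counts)
                total += c.get();
            if (total >= threshold)
                context.write(new LongWritable(total), word);
        }
    }
}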
@Override
public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception {
    // Single stage: compute the time/word Jaccard index over the word-count output
    MultiStagedJob stages = new MultiStagedJob(
            HadoopToolsUtil.getInputPaths(
                    completedMode.finalOutput(opts),
                    CountWordsAcrossTimeperiod.WORDCOUNT_DIR),
            HadoopToolsUtil.getOutputPath(outputPath),
            opts.getArgs());
    stages.queueStage(new TimeWordJacardIndex());
    stages.runAll();
}
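The only stage queued here is TimeWordJacardIndex. As a point of reference, the Jaccard index it is named after can be computed over the word sets of two time periods as in the standalone helper below; this is an illustration of the measure, not the stage's actual implementation.

import java.util.HashSet;
import java.util.Set;

// Hypothetical helper, not part of the tool: J(A, B) = |A ∩ B| / |A ∪ B|
// applied to the vocabularies of two time periods.
public class JaccardSketch {
    public static double jaccard(Set<String> wordsAtTimeA, Set<String> wordsAtTimeB) {
        Set<String> intersection = new HashSet<String>(wordsAtTimeA);
        intersection.retainAll(wordsAtTimeB);
        Set<String> union = new HashSet<String>(wordsAtTimeA);
        union.addAll(wordsAtTimeB);
        return union.isEmpty() ? 0.0 : (double) intersection.size() / union.size();
    }
}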
@Override
public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception {
    this.stages = new MultiStagedJob(
            HadoopToolsUtil.getInputPaths(
                    completedMode.finalOutput(opts),
                    CountWordsAcrossTimeperiod.WORDCOUNT_DIR),
            HadoopToolsUtil.getOutputPath(outputPath),
            opts.getArgs());
    // Three stage process
    // 1a. Write all the words (word per line)
    new WordIndex().stage(stages);
    final Path wordIndex = stages.runAll();

    // Read back the word/count lines and accumulate per-word statistics
    HashMap<String, IndependentPair<Long, Long>> wordCountLines =
            WordIndex.readWordCountLines(wordIndex.toString(), "");
    StatsWordMatch matches = new StatsWordMatch();
    for (Entry<String, IndependentPair<Long, Long>> entry : wordCountLines.entrySet()) {
        String word = entry.getKey();
        IndependentPair<Long, Long> countLine = entry.getValue();
        Long count = countLine.firstObject();
        matches.updateStats(word, count);
    }
    System.out.println(matches);
}
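StatsWordMatch accumulates the per-word counts fed to updateStats and its toString() is printed at the end. The hypothetical accumulator below only illustrates the kind of summary such a class might maintain (count, total, min/max, mean); the real StatsWordMatch may track different quantities.

// Hypothetical stand-in for StatsWordMatch, for illustration only.
public class WordStatsSketch {
    private long words = 0, total = 0;
    private long max = Long.MIN_VALUE, min = Long.MAX_VALUE;

    // Record one word's count in the running summary.
    public void updateStats(String word, long count) {
        words++;
        total += count;
        max = Math.max(max, count);
        min = Math.min(min, count);
    }

    @Override
    public String toString() {
        double mean = words == 0 ? 0.0 : (double) total / words;
        return String.format("words=%d total=%d min=%d max=%d mean=%.2f",
                words, total, min, max, mean);
    }
}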