@Override public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception { this.stages = new MultiStagedJob( HadoopToolsUtil.getInputPaths( completedMode.finalOutput(opts), CountWordsAcrossTimeperiod.WORDCOUNT_DIR), HadoopToolsUtil.getOutputPath(outputPath), opts.getArgs()); // Three stage process // 1a. Write all the words (word per line) new WordIndex().stage(stages); final Path wordIndex = stages.runAll(); HashMap<String, IndependentPair<Long, Long>> wordCountLines = WordIndex.readWordCountLines(wordIndex.toString(), ""); StatsWordMatch matches = new StatsWordMatch(); for (Entry<String, IndependentPair<Long, Long>> entry : wordCountLines.entrySet()) { String word = entry.getKey(); IndependentPair<Long, Long> countLine = entry.getValue(); Long count = countLine.firstObject(); matches.updateStats(word, count); } System.out.println(matches); }
/**
 * Queues a single {@link TimeWordJacardIndex} stage over the word-count
 * output of the completed mode and runs it.
 *
 * @param opts          tool options; supplies the raw arguments and is used
 *                      to resolve the completed mode's final output
 * @param completedMode the mode whose final output holds the word counts
 * @throws Exception if building or running the job fails
 */
@Override
public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception {
    // Input: the word-count directory beneath the completed mode's output.
    final MultiStagedJob job = new MultiStagedJob(
            HadoopToolsUtil.getInputPaths(completedMode.finalOutput(opts), CountWordsAcrossTimeperiod.WORDCOUNT_DIR),
            HadoopToolsUtil.getOutputPath(outputPath),
            opts.getArgs());

    job.queueStage(new TimeWordJacardIndex());
    job.runAll();
}
/**
 * Reads word/count lines from a report output location.
 * <p>
 * The file at {@code path + ext} is read as UTF-8 CSV. Each well-formed
 * line has exactly two fields: the word and its count. Lines with any other
 * number of fields are reported on standard output and skipped; they do not
 * advance the line index. Reading stops at the end of the input or at the
 * first line that yields no fields.
 * <p>
 * The underlying stream is always closed before this method returns,
 * whether it completes normally or throws.
 *
 * @param path report output path
 * @param ext  where the words are in the path (appended to {@code path})
 * @return map, in file order, from each word to a pair of (count, index of
 *         the line among the well-formed lines, starting at 0)
 * @throws IOException           if the file cannot be opened or read
 * @throws NumberFormatException if a count field is not a valid long
 */
public static LinkedHashMap<String, IndependentPair<Long, Long>> readWordCountLines(
        String path, String ext) throws IOException {
    String wordPath = path + ext;
    Path p = HadoopToolsUtil.getInputPaths(wordPath)[0];
    FileSystem fs = HadoopToolsUtil.getFileSystem(p);
    FSDataInputStream toRead = fs.open(p);
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(toRead, "UTF-8"));
        CSVParser csvreader = new CSVParser(reader);
        long lineN = 0;
        String[] next = null;
        LinkedHashMap<String, IndependentPair<Long, Long>> toRet =
                new LinkedHashMap<String, IndependentPair<Long, Long>>();
        while ((next = csvreader.getLine()) != null && next.length > 0) {
            if (next.length != 2) {
                // Malformed line: report it and carry on without consuming an index.
                System.out.println("PROBLEM READLINE LINE: " + Arrays.toString(next));
                continue;
            }
            toRet.put(next[0], IndependentPair.pair(Long.parseLong(next[1]), lineN));
            lineN++;
        }
        return toRet;
    } finally {
        // Release the file handle on every exit path; the readers above only
        // wrap this stream, so closing it is sufficient.
        toRead.close();
    }
}