@Override
public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception {
    this.stages = new MultiStagedJob(
            HadoopToolsUtil.getInputPaths(completedMode.finalOutput(opts), CountWordsAcrossTimeperiod.WORDCOUNT_DIR),
            HadoopToolsUtil.getOutputPath(outputPath),
            opts.getArgs());
    // Three stage process
    // 1a. Write all the words (word per line)
    new WordIndex().stage(stages);
    final Path wordIndex = stages.runAll();

    // Read the word/count lines back and aggregate per-word statistics
    HashMap<String, IndependentPair<Long, Long>> wordCountLines = WordIndex.readWordCountLines(wordIndex.toString(), "");
    StatsWordMatch matches = new StatsWordMatch();
    for (Entry<String, IndependentPair<Long, Long>> entry : wordCountLines.entrySet()) {
        String word = entry.getKey();
        IndependentPair<Long, Long> countLine = entry.getValue();
        Long count = countLine.firstObject();
        matches.updateStats(word, count);
    }
    System.out.println(matches);
}
@Override
public void write(HadoopTwitterTokenToolOptions opts, TwitterTokenMode completedMode) throws Exception {
    MultiStagedJob stages = new MultiStagedJob(
            HadoopToolsUtil.getInputPaths(completedMode.finalOutput(opts), CountWordsAcrossTimeperiod.WORDCOUNT_DIR),
            HadoopToolsUtil.getOutputPath(outputPath),
            opts.getArgs());
    // Queue and run the single stage that computes the time/word Jaccard index
    stages.queueStage(new TimeWordJacardIndex());
    stages.runAll();
}
/**
 * Read the words from a report output path.
 *
 * @param path the report output path
 * @param ext the part of the path in which the words are found
 * @return map of word to (count, line index), in file order
 * @throws IOException
 */
public static LinkedHashMap<String, IndependentPair<Long, Long>> readWordCountLines(String path, String ext)
        throws IOException
{
    String wordPath = path + ext;
    Path p = HadoopToolsUtil.getInputPaths(wordPath)[0];
    FileSystem fs = HadoopToolsUtil.getFileSystem(p);
    FSDataInputStream toRead = fs.open(p);
    BufferedReader reader = new BufferedReader(new InputStreamReader(toRead, "UTF-8"));
    CSVParser csvreader = new CSVParser(reader);
    long lineN = 0;
    String[] next = null;
    LinkedHashMap<String, IndependentPair<Long, Long>> toRet = new LinkedHashMap<String, IndependentPair<Long, Long>>();
    while ((next = csvreader.getLine()) != null && next.length > 0) {
        if (next.length != 2) {
            System.out.println("PROBLEM READING LINE: " + Arrays.toString(next));
            continue;
        }
        toRet.put(next[0], IndependentPair.pair(Long.parseLong(next[1]), lineN));
        lineN++;
    }
    reader.close();
    return toRet;
}
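/**
 * A minimal usage sketch (illustrative, not part of the original tool): dump the
 * word, count and index columns produced by readWordCountLines. The report path
 * and empty extension passed in are assumptions for the example.
 *
 * @param reportPath a completed word-count report directory
 * @throws IOException
 */
public static void printWordCounts(String reportPath) throws IOException {
    LinkedHashMap<String, IndependentPair<Long, Long>> words = readWordCountLines(reportPath, "");
    for (Entry<String, IndependentPair<Long, Long>> e : words.entrySet()) {
        long count = e.getValue().firstObject();  // total occurrences of the word
        long index = e.getValue().secondObject(); // position of the word in the report file
        System.out.println(e.getKey() + "\t" + count + "\t" + index);
    }
}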
/**
 * Write a CSV wordIndex to a {@link MLCell} written to a .mat data file.
 *
 * @param path the report output path
 * @throws IOException
 */
public static void writeToMatlab(String path) throws IOException {
    Path wordMatPath = new Path(path + "/words/wordIndex.mat");
    FileSystem fs = HadoopToolsUtil.getFileSystem(wordMatPath);
    // single-argument overload (not shown above) supplies the default words location
    LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = readWordCountLines(path);
    MLCell wordCell = new MLCell("words", new int[] { wordIndex.size(), 2 });
    System.out.println("... reading words");
    for (Entry<String, IndependentPair<Long, Long>> ent : wordIndex.entrySet()) {
        String word = ent.getKey();
        int wordCellIndex = (int) (long) ent.getValue().secondObject();
        long count = ent.getValue().firstObject();
        // column 0 holds the word, column 1 its count, at the row given by the word's index
        wordCell.set(new MLChar(null, word), wordCellIndex, 0);
        wordCell.set(new MLDouble(null, new double[][] { new double[] { count } }), wordCellIndex, 1);
    }
    ArrayList<MLArray> list = new ArrayList<MLArray>();
    list.add(wordCell);
    // MatFileWriter takes a WritableByteChannel, so wrap the HDFS output stream
    new MatFileWriter(Channels.newChannel(fs.create(wordMatPath)), list);
}
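/**
 * A minimal usage sketch (illustrative, not part of the original tool): export a
 * report directory to MATLAB. The report path below is hypothetical; the call
 * writes words/wordIndex.mat under that path, containing an N x 2 cell array
 * named "words" (column 0: word, column 1: count).
 *
 * @throws IOException
 */
public static void exportExampleReport() throws IOException {
    String reportPath = "hdfs:///user/twitter/wordcount-report"; // hypothetical report location
    writeToMatlab(reportPath);
}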