/** * Check duplicated tweet IDs in <b>tweetIdDir</b>, and output the duplicates to stdout. * * @param tweetIdDir * @throws Exception */ public static void checkTidDuplicates(String tweetIdDir) throws Exception { // First change path strings to URI strings starting with 'file:' or 'hdfs:' tweetIdDir = MultiFileFolderWriter.getUriStrForPath(tweetIdDir); Set<String> tidSet = new HashSet<String>(); Configuration conf = HBaseConfiguration.create(); FileSystem fs = FileSystem.get(new URI(tweetIdDir), conf); int dupCount = 0; for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDir))) { String srcFileName = srcFileStatus.getPath().getName(); if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) { BufferedReader brTid = new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath()))); String tid = brTid.readLine(); while (tid != null) { if (tidSet.contains(tid)) { System.out.println("Duplicated tweet ID: " + tid); dupCount++; } else { tidSet.add(tid); } tid = brTid.readLine(); } brTid.close(); } } System.out.println( "Number of unique tweet IDs: " + tidSet.size() + ", number of duplicates: " + dupCount); }
/** * Calculate the union of all tweet IDs contained in the tweet ID files under <b>srcTidDirs</b>, * and write the results to tweet ID files under <b>dstTidDir</b>. * * @param srcTidDirs Paths to source directories containing tweet ID files. * @param dstTidDir Path to the destination directory for the result tweet ID files. * @param nTidPerResultFile Number of tweet IDs per file in the destination directory. * @throws Exception */ public static void getTidFileUnion(String[] srcTidDirs, String dstTidDir, int nTidPerResultFile) throws Exception { // First change path strings to URI strings starting with 'file:' or 'hdfs:' for (int i = 0; i < srcTidDirs.length; i++) { srcTidDirs[i] = MultiFileFolderWriter.getUriStrForPath(srcTidDirs[i]); } dstTidDir = MultiFileFolderWriter.getUriStrForPath(dstTidDir); if (!MultiFileFolderWriter.deleteIfExist(dstTidDir)) { throw new Exception("Failed to delete result directory " + dstTidDir); } // group source tweet ID files by month Configuration conf = HBaseConfiguration.create(); FileSystem fs = FileSystem.get(new URI(srcTidDirs[0]), conf); HashMap<String, List<Path>> monthTidFileMap = new HashMap<String, List<Path>>(); int idx = -1; for (String srcTidDirUri : srcTidDirs) { for (FileStatus srcFileStatus : fs.listStatus(new Path(srcTidDirUri))) { String srcFileName = srcFileStatus.getPath().getName(); if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) { idx = srcFileName.indexOf('_'); String month = srcFileName.substring(0, idx); List<Path> paths = monthTidFileMap.get(month); if (paths != null) { paths.add(srcFileStatus.getPath()); } else { paths = new LinkedList<Path>(); paths.add(srcFileStatus.getPath()); monthTidFileMap.put(month, paths); } } } } // calculate the tweet ID union by month int count = 0; for (Map.Entry<String, List<Path>> e : monthTidFileMap.entrySet()) { String month = e.getKey(); boolean useBigInt = TruthyHelpers.checkIfb4June2015(month); List<Path> srcTidPaths = e.getValue(); System.out.println("Calculating union for " + month + "..."); PriorityQueue<TweetIdHeapEntry> tidHeap = new PriorityQueue<TweetIdHeapEntry>(srcTidPaths.size()); for (Path p : srcTidPaths) { BufferedReader brTid = new BufferedReader(new InputStreamReader(fs.open(p))); String tid = brTid.readLine(); if (tid == null) { brTid.close(); continue; } tidHeap.offer(new TweetIdHeapEntry(tid, brTid, useBigInt)); count++; } MultiFileFolderWriter resWriter = new MultiFileFolderWriter(dstTidDir, month + "_tweetIds", false, nTidPerResultFile); TweetIdHeapEntry he = null; byte[] lastTidBytes = null; while (!tidHeap.isEmpty()) { he = tidHeap.remove(); if (lastTidBytes == null || Bytes.BYTES_COMPARATOR.compare(lastTidBytes, he.tweetIdBytes) != 0) { resWriter.writeln(he.tweetIdStr); lastTidBytes = he.tweetIdBytes; } if (he.moveToNextId()) { tidHeap.offer(he); count++; if (count % 10000 == 0) { System.out.println("Processed " + count + " source tweet IDs."); } } } resWriter.close(); } System.out.println("Done. Total number of IDs processed: " + count); }