示例#1
0
  /**
   * Checks for duplicated tweet IDs in <b>tweetIdDir</b>, printing every duplicate occurrence and a
   * final summary to stdout.
   *
   * <p>Only files directly under the directory whose names end with ".txt" and contain "tweetIds"
   * are read. Each line of such a file is treated as one tweet ID and compared, as a plain string,
   * against all lines seen so far across all files. Every distinct ID is kept in memory until the
   * method returns.
   *
   * @param tweetIdDir Path to the directory containing the tweet ID files. It is converted with
   *     {@code MultiFileFolderWriter.getUriStrForPath} before being used.
   * @throws Exception If the file system cannot be accessed or a tweet ID file cannot be read.
   */
  public static void checkTidDuplicates(String tweetIdDir) throws Exception {
    // First change path strings to URI strings starting with 'file:' or 'hdfs:'
    String tweetIdDirUri = MultiFileFolderWriter.getUriStrForPath(tweetIdDir);

    Set<String> seenTids = new HashSet<String>();
    Configuration conf = HBaseConfiguration.create();
    FileSystem fs = FileSystem.get(new URI(tweetIdDirUri), conf);
    int dupCount = 0;
    for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDirUri))) {
      String srcFileName = srcFileStatus.getPath().getName();
      if (!srcFileName.endsWith(".txt") || !srcFileName.contains("tweetIds")) {
        // Not a tweet ID file.
        continue;
      }
      BufferedReader brTid =
          new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath())));
      try {
        String tid;
        while ((tid = brTid.readLine()) != null) {
          // Set.add returns false when the ID is already present, so a single lookup both
          // records a new ID and detects a duplicate.
          if (!seenTids.add(tid)) {
            System.out.println("Duplicated tweet ID: " + tid);
            dupCount++;
          }
        }
      } finally {
        // Release the underlying stream even when reading fails part-way through the file.
        brTid.close();
      }
    }
    System.out.println(
        "Number of unique tweet IDs: " + seenTids.size() + ", number of duplicates: " + dupCount);
  }
示例#2
0
  /**
   * Calculates the union of all tweet IDs contained in the tweet ID files under <b>srcTidDirs</b>,
   * and writes the results to tweet ID files under <b>dstTidDir</b>.
   *
   * <p>Source files are the entries directly under each source directory whose names end with
   * ".txt" and contain "tweetIds". They are grouped by the part of the file name before the first
   * '_' (referred to as the month), and each group is merged separately into result files named
   * with the prefix {@code <month>_tweetIds}. Within a group the files are merged through a
   * priority queue of {@code TweetIdHeapEntry} objects, and an ID is written only when its bytes
   * differ from the previously written ID.
   *
   * <p>NOTE(review): dropping duplicates this way presumably relies on every source file already
   * being sorted in the order defined by {@code TweetIdHeapEntry} - confirm against that class.
   *
   * <p>Any existing <b>dstTidDir</b> is removed through {@code MultiFileFolderWriter.deleteIfExist}
   * before results are written. The <b>srcTidDirs</b> array itself is not modified.
   *
   * @param srcTidDirs Paths to source directories containing tweet ID files. Must contain at least
   *     one directory.
   * @param dstTidDir Path to the destination directory for the result tweet ID files.
   * @param nTidPerResultFile Number of tweet IDs per file in the destination directory.
   * @throws IllegalArgumentException If <b>srcTidDirs</b> is null or empty, or if a source tweet ID
   *     file name contains no '_' to delimit the month.
   * @throws Exception If the destination directory cannot be deleted, or if reading the source
   *     files or writing the result files fails.
   */
  public static void getTidFileUnion(String[] srcTidDirs, String dstTidDir, int nTidPerResultFile)
      throws Exception {
    // Validate before touching the file system, so that a bad call never deletes dstTidDir.
    if (srcTidDirs == null || srcTidDirs.length == 0) {
      throw new IllegalArgumentException("At least one source tweet ID directory is required");
    }

    // First change path strings to URI strings starting with 'file:' or 'hdfs:'. The converted
    // values go into a local array so that the caller's array is left untouched.
    String[] srcTidDirUris = new String[srcTidDirs.length];
    for (int i = 0; i < srcTidDirs.length; i++) {
      srcTidDirUris[i] = MultiFileFolderWriter.getUriStrForPath(srcTidDirs[i]);
    }
    String dstTidDirUri = MultiFileFolderWriter.getUriStrForPath(dstTidDir);
    if (!MultiFileFolderWriter.deleteIfExist(dstTidDirUri)) {
      throw new Exception("Failed to delete result directory " + dstTidDirUri);
    }

    // group source tweet ID files by month
    Configuration conf = HBaseConfiguration.create();
    Map<String, List<Path>> monthTidFileMap = new HashMap<String, List<Path>>();
    for (String srcTidDirUri : srcTidDirUris) {
      Path srcDirPath = new Path(srcTidDirUri);
      // Resolve the file system per directory, so that source directories living on different
      // file systems (e.g. 'file:' and 'hdfs:') can be combined in one call.
      FileSystem srcFs = srcDirPath.getFileSystem(conf);
      for (FileStatus srcFileStatus : srcFs.listStatus(srcDirPath)) {
        Path srcFilePath = srcFileStatus.getPath();
        String srcFileName = srcFilePath.getName();
        if (!srcFileName.endsWith(".txt") || !srcFileName.contains("tweetIds")) {
          // Not a tweet ID file.
          continue;
        }
        int idx = srcFileName.indexOf('_');
        if (idx < 0) {
          // Without this check substring(0, -1) would fail with an unhelpful index error.
          throw new IllegalArgumentException(
              "Cannot determine the month of tweet ID file " + srcFilePath
                  + ": the file name contains no '_'");
        }
        String month = srcFileName.substring(0, idx);
        List<Path> paths = monthTidFileMap.get(month);
        if (paths == null) {
          paths = new LinkedList<Path>();
          monthTidFileMap.put(month, paths);
        }
        paths.add(srcFilePath);
      }
    }

    // calculate the tweet ID union by month
    int count = 0;
    for (Map.Entry<String, List<Path>> e : monthTidFileMap.entrySet()) {
      String month = e.getKey();
      boolean useBigInt = TruthyHelpers.checkIfb4June2015(month);
      List<Path> srcTidPaths = e.getValue();
      System.out.println("Calculating union for " + month + "...");

      // Every reader opened for this month is remembered here so that all of them are released
      // in the finally block, even when the merge fails part-way through.
      List<BufferedReader> openedReaders = new LinkedList<BufferedReader>();
      MultiFileFolderWriter resWriter = null;
      try {
        PriorityQueue<TweetIdHeapEntry> tidHeap =
            new PriorityQueue<TweetIdHeapEntry>(srcTidPaths.size());
        for (Path p : srcTidPaths) {
          BufferedReader brTid =
              new BufferedReader(new InputStreamReader(p.getFileSystem(conf).open(p)));
          openedReaders.add(brTid);
          String tid = brTid.readLine();
          if (tid == null) {
            // Empty file: it contributes nothing to the union.
            continue;
          }
          tidHeap.offer(new TweetIdHeapEntry(tid, brTid, useBigInt));
          count++;
        }

        resWriter =
            new MultiFileFolderWriter(dstTidDirUri, month + "_tweetIds", false, nTidPerResultFile);
        byte[] lastTidBytes = null;
        while (!tidHeap.isEmpty()) {
          TweetIdHeapEntry he = tidHeap.remove();
          // Write an ID only when it differs from the one written just before it.
          if (lastTidBytes == null
              || Bytes.BYTES_COMPARATOR.compare(lastTidBytes, he.tweetIdBytes) != 0) {
            resWriter.writeln(he.tweetIdStr);
            lastTidBytes = he.tweetIdBytes;
          }
          // Put the entry back only while its source file still has IDs left.
          if (he.moveToNextId()) {
            tidHeap.offer(he);
            count++;
            if (count % 10000 == 0) {
              System.out.println("Processed " + count + " source tweet IDs.");
            }
          }
        }
      } finally {
        try {
          if (resWriter != null) {
            resWriter.close();
          }
        } finally {
          for (BufferedReader openedReader : openedReaders) {
            try {
              // Closing a reader that is already closed has no effect, so this is safe whether
              // or not the heap entry closed it on reaching the end of its file.
              openedReader.close();
            } catch (Exception ignored) {
              // Best effort: a failure to close a read-only stream must not prevent the
              // remaining readers from being closed or hide the outcome of the merge.
            }
          }
        }
      }
    }
    System.out.println("Done. Total number of IDs processed: " + count);
  }