/**
  * Evaluates the recall of a blocking result against a standard answer set.
  *
  * <p>Each line of {@code blockFile} is parsed into document numbers with
  * {@code Common.getNumsInLineSorted} and treated as one block. For every pair of positions
  * {@code j < i} in a block the key {@code docNums[i] + " " + docNums[j]} is built. Two figures
  * are collected: the number of distinct answer pairs that occur in at least one block, and the
  * number of distinct candidate pairs over all blocks. Both are handed to
  * {@code Common.printResult}.
  *
  * <p>Note: all distinct candidate pairs are kept in memory in order to count them.
  *
  * @param blockFile path of the block file, one block per line
  * @param stdAns path of the standard answer file; loaded with {@code Common.getStringSet} and
  *     also passed on unchanged to {@code Common.printResult}
  * @param output passed on to {@code Common.printResult} (presumably the destination of the
  *     report - confirm against that helper)
  * @throws Exception if reading the inputs or reporting the result fails
  */
 public static void evaluate(String blockFile, String stdAns, String output) throws Exception {
   HashSet<String> stdSet = Common.getStringSet(stdAns);
   HashSet<String> resSet = new HashSet<String>();
   int overlap = 0;
   BufferedReader br = IOFactory.getBufferedReader(blockFile);
   try {
     for (String line = br.readLine(); line != null; line = br.readLine()) {
       int[] docNums = Common.getNumsInLineSorted(line);
       for (int i = 0; i < docNums.length; i++) {
         for (int j = 0; j < i; j++) {
           String toTest = docNums[i] + " " + docNums[j];
           // remove() returns true only while the pair is still in the answer set, so a pair
           // that shows up in several blocks is counted exactly once.
           if (stdSet.remove(toTest)) {
             overlap++;
           }
           resSet.add(toTest);
         }
       }
     }
   } finally {
     // Release the reader even when reading or parsing fails.
     br.close();
   }
   Common.printResult(overlap, stdAns, resSet.size(), output);
 }
 /**
  * Dumps the candidate pairs contained in the blocks of a block file.
  *
  * <p>Each line of {@code blockFile} is parsed into document numbers with
  * {@code Common.getNumsInLineSorted} and treated as one block. For every pair of positions
  * {@code j < i} in a block, the line {@code docNums[i] + " " + docNums[j]} is written to
  * {@code output}. Pairs are not de-duplicated: a pair that occurs in several blocks is written
  * once per occurrence.
  *
  * @param blockFile path of the block file, one block per line
  * @param output path of the file the candidate pairs are written to, one pair per line
  * @throws Exception if opening or reading the files fails
  */
 public static void dumpCanPairs(String blockFile, String output) throws Exception {
   BufferedReader br = IOFactory.getBufferedReader(blockFile);
   try {
     PrintWriter pw = IOFactory.getPrintWriter(output);
     try {
       for (String line = br.readLine(); line != null; line = br.readLine()) {
         int[] docNums = Common.getNumsInLineSorted(line);
         for (int i = 0; i < docNums.length; i++) {
           for (int j = 0; j < i; j++) {
             pw.println(docNums[i] + " " + docNums[j]);
           }
         }
       }
     } finally {
       // Flush and release the writer even when reading or parsing fails.
       pw.close();
     }
   } finally {
     // Also covers the case where the writer could not be opened.
     br.close();
   }
 }
 /**
  * Writes the answer pairs that are not covered by any block.
  *
  * <p>The pair set is loaded from {@code sameAsFile} with {@code Common.getStringSet}. Each line
  * of {@code blockFile} is parsed into document numbers with
  * {@code Common.getNumsInLineSorted}; for every pair of positions {@code j < i} the key
  * {@code docNums[i] + " " + docNums[j]} is removed from the set. Whatever remains afterwards
  * is written to {@code output}, one entry per line, in the set's iteration order.
  *
  * <p>The number of processed block lines is printed to standard output every 1000 lines as a
  * progress indicator. The output file is only opened after the block file has been read
  * completely.
  *
  * @param blockFile path of the block file, one block per line
  * @param sameAsFile path of the file holding the expected pairs
  * @param output path of the file the uncovered pairs are written to
  * @throws Exception if opening or reading the files fails
  */
 public static void getMissingPairs(String blockFile, String sameAsFile, String output)
     throws Exception {
   HashSet<String> stdSet = Common.getStringSet(sameAsFile);
   BufferedReader br = IOFactory.getBufferedReader(blockFile);
   try {
     int lineCount = 0;
     for (String line = br.readLine(); line != null; line = br.readLine()) {
       int[] docNums = Common.getNumsInLineSorted(line);
       for (int i = 0; i < docNums.length; i++) {
         for (int j = 0; j < i; j++) {
           // remove() is a no-op for absent keys, so no contains() check is needed.
           stdSet.remove(docNums[i] + " " + docNums[j]);
         }
       }
       lineCount++;
       if (lineCount % 1000 == 0) {
         System.out.println(lineCount);
       }
     }
   } finally {
     // Release the reader even when reading or parsing fails.
     br.close();
   }
   PrintWriter pw = IOFactory.getPrintWriter(output);
   try {
     for (String s : stdSet) {
       pw.println(s);
     }
   } finally {
     pw.close();
   }
 }
 /**
  * Computes the recall of a blocking result against a standard answer set.
  *
  * <p>The answer set is loaded from {@code stdAns} with {@code Common.getStringSet}. Each line
  * of {@code blockFile} is parsed into document numbers with
  * {@code Common.getNumsInLineSorted}; for every pair of positions {@code j < i} the key
  * {@code docNums[i] + " " + docNums[j]} is looked up in the answer set. Every answer pair is
  * counted at most once, no matter how many blocks contain it.
  *
  * <p>The answer size, a progress line every 10000 matched pairs, and the final number of
  * matched pairs are printed to standard output.
  *
  * @param blockFile path of the block file, one block per line
  * @param stdAns path of the standard answer file
  * @return the number of matched answer pairs divided by the size of the answer set; the result
  *     is {@code NaN} when the answer set is empty (floating-point {@code 0 / 0})
  * @throws Exception if opening or reading the files fails
  */
 public static float getRecall(String blockFile, String stdAns) throws Exception {
   HashSet<String> stdSet = Common.getStringSet(stdAns);
   // Remember the size up front: matched pairs are removed from the set below.
   int ansSize = stdSet.size();
   System.out.println("answer size: " + ansSize);
   int overlap = 0;
   BufferedReader br = IOFactory.getBufferedReader(blockFile);
   try {
     for (String line = br.readLine(); line != null; line = br.readLine()) {
       int[] docNums = Common.getNumsInLineSorted(line);
       for (int i = 0; i < docNums.length; i++) {
         for (int j = 0; j < i; j++) {
           String toTest = docNums[i] + " " + docNums[j];
           // remove() returns true only while the pair is still in the answer set, which
           // avoids counting a pair twice when it occurs in several blocks.
           if (stdSet.remove(toTest)) {
             overlap++;
             if (overlap % 10000 == 0) {
               System.out.println(new Date().toString() + " : " + overlap + " overlaps");
             }
           }
         }
       }
     }
   } finally {
     // Release the reader even when reading or parsing fails.
     br.close();
   }
   System.out.println("overlap: " + overlap);
   return (overlap + 0.0f) / ansSize;
 }