/** * evaluate recall of blocking * * @param blockFile * @param stdAns * @throws Exception */ public static void evaluate(String blockFile, String stdAns, String output) throws Exception { HashSet<String> stdSet = Common.getStringSet(stdAns); HashSet<String> resSet = new HashSet<String>(); BufferedReader br = IOFactory.getBufferedReader(blockFile); int overlap = 0; int maxBlockSize = 0; int blockNum = 0; for (String line = br.readLine(); line != null; line = br.readLine()) { int[] docNums = Common.getNumsInLineSorted(line); if (docNums.length > maxBlockSize) maxBlockSize = docNums.length; // System.out.println(docNums.length); blockNum++; for (int i = 0; i < docNums.length; i++) for (int j = 0; j < i; j++) { String toTest = docNums[i] + " " + docNums[j]; if (stdSet.contains(toTest)) { overlap++; stdSet.remove(toTest); // to avoid duplicate counting } resSet.add(toTest); } } br.close(); Common.printResult(overlap, stdAns, resSet.size(), output); }
/**
 * Writes the answer pairs that are not covered by any block to {@code output}.
 *
 * <p>The answer set is loaded from {@code sameAsFile} via {@code Common.getStringSet}. Each line
 * of {@code blockFile} is parsed with {@code Common.getNumsInLineSorted}, and every pair of
 * numbers on the same line, keyed as {@code docNums[i] + " " + docNums[j]} with {@code j < i},
 * is removed from the answer set. Whatever remains afterwards is written to {@code output},
 * one entry per line. The number of processed lines is printed to standard output every
 * 1000 lines as a progress indicator.
 *
 * @param blockFile  path of the blocking result; one block per line
 * @param sameAsFile path of the file holding the expected pairs
 * @param output     path of the file that receives the uncovered pairs
 * @throws Exception if loading the answer set, reading the block file or writing the output fails
 */
public static void getMissingPairs(String blockFile, String sameAsFile, String output) throws Exception {
    HashSet<String> stdSet = Common.getStringSet(sameAsFile);

    BufferedReader br = IOFactory.getBufferedReader(blockFile);
    try {
        int lineCount = 0;
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            int[] docNums = Common.getNumsInLineSorted(line);
            for (int i = 0; i < docNums.length; i++) {
                for (int j = 0; j < i; j++) {
                    // remove() is a no-op for absent keys, so no contains() check is needed.
                    stdSet.remove(docNums[i] + " " + docNums[j]);
                }
            }
            lineCount++;
            if (lineCount % 1000 == 0) {
                System.out.println(lineCount);
            }
        }
    } finally {
        // Release the reader even when reading or parsing fails part-way through.
        br.close();
    }

    // The output is opened only after the whole block file has been processed.
    PrintWriter pw = IOFactory.getPrintWriter(output);
    try {
        for (String s : stdSet) {
            pw.println(s);
        }
    } finally {
        pw.close();
    }
}
public static float getRecall(String blockFile, String stdAns) throws Exception { HashSet<String> stdSet = Common.getStringSet(stdAns); int ansSize = stdSet.size(); System.out.println("answer size: " + ansSize); BufferedReader br = IOFactory.getBufferedReader(blockFile); int overlap = 0; for (String line = br.readLine(); line != null; line = br.readLine()) { int[] docNums = Common.getNumsInLineSorted(line); for (int i = 0; i < docNums.length; i++) for (int j = 0; j < i; j++) { String toTest = docNums[i] + " " + docNums[j]; if (stdSet.contains(toTest)) { overlap++; if (overlap % 10000 == 0) System.out.println(new Date().toString() + " : " + overlap + " overlaps"); stdSet.remove(toTest); // to avoid duplicate counting } } } br.close(); System.out.println("overlap: " + overlap); return (overlap + 0.0f) / ansSize; }