/** Compute number of correct pairs betwn src1 and src2, where src2>src1 */ protected int countCorrectPairs(MatchData data) { // count the number of times each id appears in each source */ Map counter = new HashMap(); for (int i = 0; i < data.numSources(); i++) { String src = data.getSource(i); for (int j = 0; j < data.numInstances(src); j++) { String id = data.getInstance(src, j).getId(); if (id != null) { IdKey key = new IdKey(id, src); Integer c = (Integer) counter.get(key); counter.put(key, (c == null ? new Integer(1) : new Integer(c.intValue() + 1))); } } } /* // show the counter for (Iterator i=counter.keySet().iterator(); i.hasNext(); ) { IdKey key = (IdKey) i.next(); System.out.println( key.src+"#"+key.id+" = "+counter.get(key) ); } */ // count the number of correct pairs int numCorrectPairs = 0; Set idsInSrc1 = new HashSet(); for (int i = 0; i < data.numSources(); i++) { String src1 = data.getSource(i); idsInSrc1.clear(); for (int j = 0; j < data.numInstances(src1); j++) { String id = data.getInstance(src1, j).getId(); idsInSrc1.add(id); for (int k = i + 1; k < data.numSources(); k++) { String src2 = data.getSource(k); Integer cInteger = (Integer) counter.get(new IdKey(id, src2)); if (cInteger != null) { numCorrectPairs += cInteger.intValue(); } // System.out.println( "src1:"+src1+" id:"+id+" src2:"+src2+" c:"+cInteger); } } if (clusterMode) { // count how often something in src1 can be matched correctly with something // else in src1 for (Iterator j = idsInSrc1.iterator(); j.hasNext(); ) { String id = (String) j.next(); Integer cInteger = (Integer) counter.get(new IdKey(id, src1)); int c = cInteger.intValue(); numCorrectPairs += c * (c - 1) / 2; } } } return numCorrectPairs; }
/** Accumulate statistics on how often each token value occurs */ public void train(StringWrapperIterator i) { Set seenTokens = new HashSet(); while (i.hasNext()) { BagOfTokens bag = asBagOfTokens(i.nextStringWrapper()); seenTokens.clear(); for (Iterator j = bag.tokenIterator(); j.hasNext(); ) { totalTokenCount++; Token tokj = (Token) j.next(); if (!seenTokens.contains(tokj)) { seenTokens.add(tokj); // increment documentFrequency counts Integer df = (Integer) documentFrequency.get(tokj); if (df == null) documentFrequency.put(tokj, ONE); else if (df == ONE) documentFrequency.put(tokj, TWO); else if (df == TWO) documentFrequency.put(tokj, THREE); else documentFrequency.put(tokj, new Integer(df.intValue() + 1)); } } collectionSize++; } }