/**
 * Adds a single instance, built from the given source name, id and text, to
 * the datafile.
 *
 * <p>Instances are grouped per source in {@code sourceLists}. The first time a
 * source name is seen, an empty list is registered for it and the name is
 * appended to {@code sourceNames}.
 *
 * @param src  name of the source the new instance belongs to
 * @param id   id passed to the new instance
 * @param text text passed to the new instance
 */
public void addInstance(String src, String id, String text) {
    Instance added = new Instance(src, id, text);
    ArrayList bucket = (ArrayList) sourceLists.get(src);
    if (bucket != null) {
        bucket.add(added);
        return;
    }
    // First instance for this source: register the source before storing it.
    bucket = new ArrayList();
    sourceLists.put(src, bucket);
    sourceNames.add(src);
    bucket.add(added);
}
/** Compute number of correct pairs betwn src1 and src2, where src2>src1 */ protected int countCorrectPairs(MatchData data) { // count the number of times each id appears in each source */ Map counter = new HashMap(); for (int i = 0; i < data.numSources(); i++) { String src = data.getSource(i); for (int j = 0; j < data.numInstances(src); j++) { String id = data.getInstance(src, j).getId(); if (id != null) { IdKey key = new IdKey(id, src); Integer c = (Integer) counter.get(key); counter.put(key, (c == null ? new Integer(1) : new Integer(c.intValue() + 1))); } } } /* // show the counter for (Iterator i=counter.keySet().iterator(); i.hasNext(); ) { IdKey key = (IdKey) i.next(); System.out.println( key.src+"#"+key.id+" = "+counter.get(key) ); } */ // count the number of correct pairs int numCorrectPairs = 0; Set idsInSrc1 = new HashSet(); for (int i = 0; i < data.numSources(); i++) { String src1 = data.getSource(i); idsInSrc1.clear(); for (int j = 0; j < data.numInstances(src1); j++) { String id = data.getInstance(src1, j).getId(); idsInSrc1.add(id); for (int k = i + 1; k < data.numSources(); k++) { String src2 = data.getSource(k); Integer cInteger = (Integer) counter.get(new IdKey(id, src2)); if (cInteger != null) { numCorrectPairs += cInteger.intValue(); } // System.out.println( "src1:"+src1+" id:"+id+" src2:"+src2+" c:"+cInteger); } } if (clusterMode) { // count how often something in src1 can be matched correctly with something // else in src1 for (Iterator j = idsInSrc1.iterator(); j.hasNext(); ) { String id = (String) j.next(); Integer cInteger = (Integer) counter.get(new IdKey(id, src1)); int c = cInteger.intValue(); numCorrectPairs += c * (c - 1) / 2; } } } return numCorrectPairs; }
/** Accumulate statistics on how often each token value occurs */ public void train(StringWrapperIterator i) { Set seenTokens = new HashSet(); while (i.hasNext()) { BagOfTokens bag = asBagOfTokens(i.nextStringWrapper()); seenTokens.clear(); for (Iterator j = bag.tokenIterator(); j.hasNext(); ) { totalTokenCount++; Token tokj = (Token) j.next(); if (!seenTokens.contains(tokj)) { seenTokens.add(tokj); // increment documentFrequency counts Integer df = (Integer) documentFrequency.get(tokj); if (df == null) documentFrequency.put(tokj, ONE); else if (df == ONE) documentFrequency.put(tokj, TWO); else if (df == TWO) documentFrequency.put(tokj, THREE); else documentFrequency.put(tokj, new Integer(df.intValue() + 1)); } } collectionSize++; } }
/**
 * Returns an iterator over the keys of {@code documentFrequency}, i.e. over
 * every entry that currently has a document-frequency count.
 *
 * @return iterator over the key set of {@code documentFrequency}
 */
public Iterator tokenIterator() {
    Iterator knownTokens = documentFrequency.keySet().iterator();
    return knownTokens;
}
/**
 * Looks up the document frequency stored for a token.
 *
 * @param tok the token to look up
 * @return the count stored in {@code documentFrequency} for {@code tok}, or
 *         0 when there is no entry for it
 */
public int getDocumentFrequency(Token tok) {
    Integer stored = (Integer) documentFrequency.get(tok);
    return stored != null ? stored.intValue() : 0;
}
/**
 * Gets the j-th record for the named source.
 *
 * @param src name of the source
 * @param j   zero-based position of the record within that source's list
 * @return the instance stored at position {@code j}
 * @throws NullPointerException      if no list is stored under {@code src}
 * @throws IndexOutOfBoundsException if {@code j} is outside the list
 */
public Instance getInstance(String src, int j) {
    ArrayList instancesForSrc = (ArrayList) sourceLists.get(src);
    return (Instance) instancesForSrc.get(j);
}
/**
 * Returns the number of records for the source with the given name.
 *
 * @param src name of the source
 * @return size of the list stored under {@code src}
 * @throws NullPointerException if no list is stored under {@code src}
 */
public int numInstances(String src) {
    ArrayList instancesForSrc = (ArrayList) sourceLists.get(src);
    return instancesForSrc.size();
}