Beispiel #1
0
 /**
  * Append one instance to the datafile under the given source.
  *
  * <p>The first time a source is seen, a new list is registered for it in
  * {@code sourceLists} and the source is appended to {@code sourceNames}.
  *
  * @param src source the instance belongs to
  * @param id id of the instance
  * @param text text of the instance
  */
 public void addInstance(String src, String id, String text) {
   Instance added = new Instance(src, id, text);
   ArrayList existing = (ArrayList) sourceLists.get(src);
   if (existing != null) {
     // source already registered: just append
     existing.add(added);
     return;
   }
   // first instance for this source: register the source, then append
   ArrayList fresh = new ArrayList();
   sourceLists.put(src, fresh);
   sourceNames.add(src);
   fresh.add(added);
 }
Beispiel #2
0
  /**
   * Compute the number of correct pairs in the data, i.e. pairs of instances that share
   * the same non-null id.
   *
   * <p>Pairs are counted between each source src1 and every later source src2 (src2 &gt; src1
   * in source-index order). When {@code clusterMode} is set, pairs of distinct instances
   * inside the same source are counted as well.
   *
   * <p>Instances whose id is null are ignored: they are never entered into the per-source
   * id counts, so they cannot take part in any correct pair.
   *
   * @param data the sources and instances to examine
   * @return the number of correct pairs
   */
  protected int countCorrectPairs(MatchData data) {
    // count the number of times each (id, source) combination appears
    Map counter = new HashMap();
    for (int i = 0; i < data.numSources(); i++) {
      String src = data.getSource(i);
      for (int j = 0; j < data.numInstances(src); j++) {
        String id = data.getInstance(src, j).getId();
        if (id != null) {
          IdKey key = new IdKey(id, src);
          Integer c = (Integer) counter.get(key);
          counter.put(key, (c == null ? new Integer(1) : new Integer(c.intValue() + 1)));
        }
      }
    }

    // count the number of correct pairs
    int numCorrectPairs = 0;
    Set idsInSrc1 = new HashSet();
    for (int i = 0; i < data.numSources(); i++) {
      String src1 = data.getSource(i);
      idsInSrc1.clear();
      for (int j = 0; j < data.numInstances(src1); j++) {
        String id = data.getInstance(src1, j).getId();
        if (id == null) {
          // null ids were never counted above, so they cannot match anything; skipping
          // them keeps null out of idsInSrc1 and out of the IdKey lookups below
          continue;
        }
        idsInSrc1.add(id);
        // every occurrence of this id in a later source forms one correct pair
        for (int k = i + 1; k < data.numSources(); k++) {
          String src2 = data.getSource(k);
          Integer cInteger = (Integer) counter.get(new IdKey(id, src2));
          if (cInteger != null) {
            numCorrectPairs += cInteger.intValue();
          }
        }
      }
      if (clusterMode) {
        // count how often something in src1 can be matched correctly with something
        // else in src1: an id occurring c times yields c*(c-1)/2 unordered pairs
        for (Iterator j = idsInSrc1.iterator(); j.hasNext(); ) {
          String id = (String) j.next();
          Integer cInteger = (Integer) counter.get(new IdKey(id, src1));
          if (cInteger != null) {
            int c = cInteger.intValue();
            numCorrectPairs += c * (c - 1) / 2;
          }
        }
      }
    }
    return numCorrectPairs;
  }
 /**
  * Gather token statistics from every string wrapper supplied by the iterator.
  *
  * <p>For each wrapper, {@code totalTokenCount} grows by one per token yielded by the bag's
  * token iterator, the document frequency of each distinct token in that bag grows by
  * one, and {@code collectionSize} grows by one.
  */
 public void train(StringWrapperIterator i) {
   Set tokensInThisBag = new HashSet();
   while (i.hasNext()) {
     BagOfTokens bag = asBagOfTokens(i.nextStringWrapper());
     tokensInThisBag.clear();
     Iterator tokens = bag.tokenIterator();
     while (tokens.hasNext()) {
       totalTokenCount++;
       Token tok = (Token) tokens.next();
       // add() reports false when the token was already seen in this bag
       if (!tokensInThisBag.add(tok)) {
         continue;
       }
       // bump the document frequency, reusing the shared constants for small counts
       Integer previous = (Integer) documentFrequency.get(tok);
       Integer updated;
       if (previous == null) {
         updated = ONE;
       } else if (previous == ONE) {
         updated = TWO;
       } else if (previous == TWO) {
         updated = THREE;
       } else {
         updated = new Integer(previous.intValue() + 1);
       }
       documentFrequency.put(tok, updated);
     }
     collectionSize++;
   }
 }