コード例 #1
0
ファイル: Blocker.java プロジェクト: IsaacHaze/secondstring
  /**
   * Compute the number of correct pairs between src1 and src2, where src2 comes after src1 in
   * the source order of {@code data}.
   *
   * <p>An instance of src1 is paired with every instance of a later source that has the same id
   * (as keyed by {@code IdKey}). When {@code clusterMode} is set, pairs of same-id instances
   * inside a single source are counted as well. Instances whose id is {@code null} never take
   * part in a pair.
   *
   * @param data the match data whose sources and instances are scanned
   * @return the total number of correct pairs found
   */
  protected int countCorrectPairs(MatchData data) {
    // count the number of times each id appears in each source
    Map counter = new HashMap();
    for (int i = 0; i < data.numSources(); i++) {
      String src = data.getSource(i);
      for (int j = 0; j < data.numInstances(src); j++) {
        String id = data.getInstance(src, j).getId();
        if (id != null) {
          IdKey key = new IdKey(id, src);
          Integer c = (Integer) counter.get(key);
          counter.put(key, (c == null ? new Integer(1) : new Integer(c.intValue() + 1)));
        }
      }
    }

    // count the number of correct pairs
    int numCorrectPairs = 0;
    Set idsInSrc1 = new HashSet();
    for (int i = 0; i < data.numSources(); i++) {
      String src1 = data.getSource(i);
      idsInSrc1.clear();
      for (int j = 0; j < data.numInstances(src1); j++) {
        String id = data.getInstance(src1, j).getId();
        if (id == null) {
          // Instances without an id were not counted in the first pass, so they cannot match
          // anything. Skipping them here also keeps null out of idsInSrc1, which previously
          // led to a NullPointerException in the clusterMode branch below.
          continue;
        }
        idsInSrc1.add(id);
        // every same-id instance in a later source forms one correct pair with this instance
        for (int k = i + 1; k < data.numSources(); k++) {
          String src2 = data.getSource(k);
          Integer cInteger = (Integer) counter.get(new IdKey(id, src2));
          if (cInteger != null) {
            numCorrectPairs += cInteger.intValue();
          }
        }
      }
      if (clusterMode) {
        // count how often something in src1 can be matched correctly with something
        // else in src1: c instances sharing an id give c*(c-1)/2 unordered pairs
        for (Iterator j = idsInSrc1.iterator(); j.hasNext(); ) {
          String id = (String) j.next();
          Integer cInteger = (Integer) counter.get(new IdKey(id, src1));
          if (cInteger != null) {
            int c = cInteger.intValue();
            numCorrectPairs += c * (c - 1) / 2;
          }
        }
      }
    }
    return numCorrectPairs;
  }
コード例 #2
0
ファイル: MatchData.java プロジェクト: RenatoEyllo/Eyllo-IR
 /**
  * Command-line driver: builds a {@code MatchData} from the first argument, prints its string
  * form under a "Dump:" heading, then prints each element returned by its iterator under an
  * "Iteration:" heading. Any exception raised along the way is reported as a stack trace.
  *
  * @param argv command-line arguments; {@code argv[0]} is handed to the MatchData constructor
  */
 public static void main(String[] argv) {
   try {
     MatchData matchData = new MatchData(argv[0]);

     System.out.println("Dump:");
     String dump = matchData.toString();
     System.out.println(dump);
     System.out.println();

     System.out.println("Iteration:");
     Iterator elements = matchData.getIterator();
     while (elements.hasNext()) {
       Object element = elements.next();
       System.out.println(element.toString());
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
 }
コード例 #3
0
 /**
  * Accumulate statistics on how often each token value occurs.
  *
  * <p>For every string wrapper supplied by the iterator, each token occurrence increments
  * {@code totalTokenCount}, the first occurrence of a token within that wrapper's bag increments
  * the token's entry in {@code documentFrequency}, and {@code collectionSize} grows by one per
  * wrapper.
  *
  * @param i iterator over the string wrappers to train on
  */
 public void train(StringWrapperIterator i) {
   // tokens already met in the bag currently being processed
   Set tokensInBag = new HashSet();
   while (i.hasNext()) {
     BagOfTokens bag = asBagOfTokens(i.nextStringWrapper());
     tokensInBag.clear();
     Iterator tokens = bag.tokenIterator();
     while (tokens.hasNext()) {
       totalTokenCount++;
       Token token = (Token) tokens.next();
       if (!tokensInBag.add(token)) {
         // repeated token within this bag: it only counts once towards document frequency
         continue;
       }
       // increment documentFrequency counts; ONE and TWO are matched by identity, so
       // presumably they are shared constant instances (confirm at their declaration)
       Integer previous = (Integer) documentFrequency.get(token);
       if (previous == null) {
         documentFrequency.put(token, ONE);
       } else if (previous == ONE) {
         documentFrequency.put(token, TWO);
       } else if (previous == TWO) {
         documentFrequency.put(token, THREE);
       } else {
         documentFrequency.put(token, new Integer(previous.intValue() + 1));
       }
     }
     collectionSize++;
   }
 }