Beispiel #1
0
  /**
   * Finds the large item sets of the requested length and records them in {@code m_Ls}.
   *
   * <p>For {@code index == 1} the candidates are the singletons of {@code m_instances}. For
   * {@code index > 1} the candidates are obtained by merging the most recently stored item sets
   * (or an empty vector if nothing is stored); every previously stored entry of {@code m_Ls} is
   * removed first, and a hashtable built from the previous sets is appended to
   * {@code m_hashtables} and used to prune the candidates. For any other {@code index} the
   * method does nothing.
   *
   * <p>In both active cases the candidates' counters are updated against {@code m_instances},
   * the candidates are filtered through {@code ItemSet.deleteItemSets} using
   * {@code m_premiseCount} and {@code Integer.MAX_VALUE} as bounds, and the result is appended to
   * {@code m_Ls} only when it is non-empty.
   *
   * @param index the length of the item sets to look for
   * @exception Exception if one of the underlying item set operations fails
   */
  private void findLargeItemSets(int index) throws Exception {

    if (index < 1) {
      return;
    }

    FastVector candidates;
    if (index == 1) {
      // item sets of length 1
      candidates = ItemSet.singletons(m_instances);
    } else {
      // item sets of length > 1: grow them from the sets found last time
      FastVector previous = new FastVector();
      if (m_Ls.size() > 0) {
        previous = (FastVector) m_Ls.lastElement();
      }
      m_Ls.removeAllElements();

      candidates = ItemSet.mergeAllItemSets(previous, index - 2, m_instances.numInstances());
      Hashtable previousTable = ItemSet.getHashtable(previous, previous.size());
      m_hashtables.addElement(previousTable);
      candidates = ItemSet.pruneItemSets(candidates, previousTable);
    }

    // Shared tail: count, filter, and keep the survivors (if any)
    ItemSet.upDateCounters(candidates, m_instances);
    candidates = ItemSet.deleteItemSets(candidates, m_premiseCount, Integer.MAX_VALUE);
    if (candidates.size() > 0) {
      m_Ls.addElement(candidates);
    }
  }
  /**
   * Builds a fresh {@code Instances} object named "semantic-space".
   *
   * <p>The attribute list comes from {@code buildCosineAttributes()}; its last attribute is
   * registered as the class attribute. The value 1000 is passed as the third constructor
   * argument (the capacity argument of the Weka {@code Instances} constructor).
   *
   * @return the newly created dataset with its class attribute set
   */
  public Instances initializeInstances() {

    FastVector attributes = buildCosineAttributes();
    // The class label is expected to be the last attribute in the list
    Attribute classAttribute = (Attribute) attributes.lastElement();

    Instances dataset = new Instances("semantic-space", attributes, 1000);
    dataset.setClass(classAttribute);
    return dataset;
  }
Beispiel #3
0
  /**
   * Appends a rule to the ruleset and records its statistics.
   *
   * <p>The statistics are computed by {@code computeSimpleStats} on {@code m_Data} when no
   * filtered data has been stored yet, otherwise on element 1 of the most recently stored
   * filtered pair. The returned pair, the six-element stats array and the per-class counts are
   * appended to {@code m_Filtered}, {@code m_SimpleStats} and {@code m_Distributions}
   * respectively; each of these vectors (and {@code m_Ruleset}) is created on first use.
   *
   * @param lastRule the rule to be added
   */
  public void addAndUpdate(Rule lastRule) {
    if (m_Ruleset == null) {
      m_Ruleset = new FastVector();
    }
    m_Ruleset.addElement(lastRule);

    // Pick the data the new rule is evaluated on
    Instances remaining;
    if (m_Filtered == null) {
      remaining = m_Data;
    } else {
      Instances[] previousSplit = (Instances[]) m_Filtered.lastElement();
      remaining = previousSplit[1];
    }

    double[] ruleStats = new double[6];
    double[] distribution = new double[m_Data.classAttribute().numValues()];
    int newRuleIndex = m_Ruleset.size() - 1;
    Instances[] split = computeSimpleStats(newRuleIndex, remaining, ruleStats, distribution);

    if (m_Filtered == null) {
      m_Filtered = new FastVector();
    }
    m_Filtered.addElement(split);

    if (m_SimpleStats == null) {
      m_SimpleStats = new FastVector();
    }
    m_SimpleStats.addElement(ruleStats);

    if (m_Distributions == null) {
      m_Distributions = new FastVector();
    }
    m_Distributions.addElement(distribution);
  }
  /**
   * Converts a question/answer pair into a single Weka instance.
   *
   * <p>The pair's query and answer lists are transformed by {@code projector}, and the value
   * returned by {@code projector.computeCosignSimilarity} becomes the first attribute value; the
   * second value is initialised to 0.0. The instance is attached to a one-off dataset named
   * "test" whose attributes come from {@code buildCosineAttributes()} and whose class attribute
   * is the last of them. The class value is set from the pair's label, or marked missing when
   * the label equals "-1".
   *
   * @param pair the question/answer pair to convert
   * @return the instance, with the "test" dataset set as its dataset
   */
  public Instance buildWekaInstance(QAPair pair) {

    double[] queryVector = projector.transform(pair.getQueryList());
    double[] answerVector = projector.transform(pair.getAnswerList());
    double[] values = {projector.computeCosignSimilarity(queryVector, answerVector), 0.0};

    FastVector attributes = buildCosineAttributes();
    Attribute classAttribute = (Attribute) attributes.lastElement();

    Instances holder = new Instances("test", attributes, 1);
    holder.setClass(classAttribute);

    Instance example = new Instance(1, values);
    holder.add(example);
    example.setDataset(holder);

    // A label of "-1" marks a pair whose class is not known
    if (pair.getLabel().equals("-1")) {
      example.setClassMissing();
    } else {
      example.setClassValue(pair.getLabel());
    }
    return example;
  }
Beispiel #5
0
  /**
   * Compute the combined DL of the ruleset in this class, i.e. theory DL and data DL. Note this
   * procedure computes the combined DL according to the current status of the ruleset in this
   * class.
   *
   * <p>The stored per-rule statistics are not modified by this call.
   *
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param predicted the default classification if ruleset covers null
   * @return the combined DL (data DL plus the theory DL of every rule)
   */
  public double combinedDL(double expFPRate, double predicted) {
    double rt = 0;

    if (getRulesetSize() > 0) {
      // Work on a copy: accumulating directly into the array held by m_SimpleStats would
      // permanently alter the last rule's stored stats and change the result of later calls.
      double[] lastStats = (double[]) m_SimpleStats.lastElement();
      double[] stats = (double[]) lastStats.clone();
      for (int j = getRulesetSize() - 2; j >= 0; j--) {
        double[] earlier = getSimpleStats(j);
        stats[0] += earlier[0];
        stats[2] += earlier[2];
        stats[4] += earlier[4];
      }
      rt += dataDL(expFPRate, stats[0], stats[1], stats[4], stats[5]); // Data DL
    } else { // Null coverage ruleset
      double fn = 0.0;
      for (int j = 0; j < m_Data.numInstances(); j++) {
        if ((int) m_Data.instance(j).classValue() == (int) predicted) {
          fn += m_Data.instance(j).weight();
        }
      }
      rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn);
    }

    for (int i = 0; i < getRulesetSize(); i++) { // Theory DL
      rt += theoryDL(i);
    }

    return rt;
  }
Beispiel #6
0
  /**
   * Method that generates all large itemsets with a minimum support, and from these all
   * association rules.
   *
   * <p>The last attribute of {@code instances} is set as the class index. For each item set
   * length from 1 up to (number of attributes - 1) the large item sets are found and rules are
   * generated; whenever the best rules changed, {@code m_premiseCount} is raised until the
   * expectation exceeds {@code m_expectation}. Once {@code m_premiseCount} exceeds the number of
   * instances, or the loop finishes, the best rules are moved into {@code m_allTheRules}.
   *
   * @param instances the instances to be used for generating the associations
   * @exception Exception if the data contains string attributes or rules can't be built
   *     successfully
   */
  public void buildAssociations(Instances instances) throws Exception {

    int temp = m_premiseCount;
    // NOTE(review): the offset of 5 presumably undoes padding applied where m_numRules is
    // set - confirm against the setter.
    int exactNumber = m_numRules - 5;

    if (instances.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }
    m_instances = instances;
    m_instances.setClassIndex(m_instances.numAttributes() - 1);

    // prior estimation
    m_priorEstimator = new PriorEstimation(m_instances, m_numRandRules, m_numIntervals, false);
    m_priors = m_priorEstimator.estimatePrior();
    m_midPoints = m_priorEstimator.getMidPoints();

    m_Ls = new FastVector();
    m_hashtables = new FastVector();

    for (int i = 1; i < m_instances.numAttributes(); i++) {
      m_bestChanged = false;

      // find large item sets
      findLargeItemSets(i);

      // find association rules (rule generation procedure)
      findRulesQuickly();

      if (m_bestChanged) {
        temp = m_premiseCount;
        while (RuleGeneration.expectation(m_premiseCount, m_premiseCount, m_midPoints, m_priors)
            <= m_expectation) {
          m_premiseCount++;
          if (m_premiseCount > m_instances.numInstances()) {
            break;
          }
        }
      }
      if (m_premiseCount > m_instances.numInstances()) {
        // No item set can reach the required premise count any more: stop early
        moveBestRulesToResult(exactNumber);
        return;
      }

      // The premise count was raised: re-filter the most recent item sets against it
      if (temp != m_premiseCount && m_Ls.size() > 0) {
        FastVector kSets = (FastVector) m_Ls.lastElement();
        m_Ls.removeElementAt(m_Ls.size() - 1);
        kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE);
        m_Ls.addElement(kSets);
      }
    }

    moveBestRulesToResult(exactNumber);
  }

  /**
   * Resets {@code m_allTheRules} and fills it by repeatedly removing the last element of
   * {@code m_best}: slot 0 receives the premise, slot 1 the consequence and slot 2 the accuracy.
   * Stops when {@code m_best} is empty or {@code maxRules} rules have been moved.
   *
   * @param maxRules the maximum number of rules to move; nothing is moved if it is not positive
   */
  private void moveBestRulesToResult(int maxRules) {
    // Reserve space for variables
    m_allTheRules = new FastVector[3];
    m_allTheRules[0] = new FastVector();
    m_allTheRules[1] = new FastVector();
    m_allTheRules[2] = new FastVector();

    int k = 0;
    while (m_best.size() > 0 && k < maxRules) {
      RuleItem best = (RuleItem) m_best.last();
      m_allTheRules[0].insertElementAt((ItemSet) best.premise(), k);
      m_allTheRules[1].insertElementAt((ItemSet) best.consequence(), k);
      m_allTheRules[2].insertElementAt(new Double(best.accuracy()), k);
      m_best.remove(best);
      k++;
    }
  }
Beispiel #7
0
  /**
   * Compute the minimal data description length of the ruleset if the rule in the given position
   * is deleted.<br>
   * The min_data_DL_if_deleted = data_DL_if_deleted - potential
   *
   * <p>Stats of the rules before {@code index} are reused as stored; stats of the rules after
   * it are recomputed with {@code computeSimpleStats} on the data that was available before the
   * rule at {@code index} was applied.
   *
   * @param index the index of the rule in question
   * @param expFPRate expected FP/(FP+FN), used in dataDL calculation
   * @param checkErr whether check if error rate >= 0.5
   * @return the minDataDL
   */
  public double minDataDLIfDeleted(int index, double expFPRate, boolean checkErr) {
    double[] totals = new double[6]; // Stats of the ruleset without the rule
    int following = m_Ruleset.size() - 1 - index; // Number of rules after the deleted one
    FastVector recounted = new FastVector(following); // Recomputed stats of those rules

    // Rules 0...(index-1) are unaffected; covered stats (slots 0, 2, 4) are cumulative
    for (int before = 0; before < index; before++) {
      double[] kept = (double[]) m_SimpleStats.elementAt(before);
      totals[0] += kept[0];
      totals[2] += kept[2];
      totals[4] += kept[4];
    }

    // Data to recount from: what was left over before the rule at index was applied
    Instances remaining;
    if (index == 0) {
      remaining = m_Data;
    } else {
      remaining = ((Instances[]) m_Filtered.elementAt(index - 1))[1];
    }

    // Recount every rule after index on the data it now gets to see
    for (int later = index + 1; later < m_Ruleset.size(); later++) {
      double[] fresh = new double[6];
      Instances[] split = computeSimpleStats(later, remaining, fresh, null);
      recounted.addElement(fresh);
      totals[0] += fresh[0];
      totals[2] += fresh[2];
      totals[4] += fresh[4];
      remaining = split[1];
    }

    // Uncovered stats (slots 1, 3, 5) are those of the last remaining rule
    if (following > 0) {
      double[] tail = (double[]) recounted.lastElement();
      totals[1] = tail[1];
      totals[3] = tail[3];
      totals[5] = tail[5];
    } else if (index > 0) {
      double[] previous = (double[]) m_SimpleStats.elementAt(index - 1);
      totals[1] = previous[1];
      totals[3] = previous[3];
      totals[5] = previous[5];
    } else { // Null coverage: derive the uncovered stats from the first rule's stored stats
      double[] first = (double[]) m_SimpleStats.elementAt(0);
      totals[1] = first[0] + first[1];
      totals[3] = first[3] + first[4];
      totals[5] = first[2] + first[5];
    }

    // Potential: summed over the rules after index, skipping NaN results
    double potentialSum = 0;
    for (int later = index + 1; later < m_Ruleset.size(); later++) {
      double[] laterStats = (double[]) recounted.elementAt(later - index - 1);
      double ifDeleted = potential(later, expFPRate, totals, laterStats, checkErr);
      if (!Double.isNaN(ifDeleted)) {
        potentialSum += ifDeleted;
      }
    }

    // Data DL of the ruleset without the rule.
    // Note that the ruleset stats have already been updated to reflect
    // deletion if any potential
    double dataDLWithout = dataDL(expFPRate, totals[0], totals[1], totals[4], totals[5]);

    // Why subtract potential again?  To reflect change of theory DL??
    return dataDLWithout - potentialSum;
  }