/** * Method that finds all large itemsets for the given set of instances. * * @param the instances to be used * @exception Exception if an attribute is numeric */ private void findLargeItemSets(int index) throws Exception { FastVector kMinusOneSets, kSets = new FastVector(); Hashtable hashtable; int i = 0; // Find large itemsets // of length 1 if (index == 1) { kSets = ItemSet.singletons(m_instances); ItemSet.upDateCounters(kSets, m_instances); kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE); if (kSets.size() == 0) return; m_Ls.addElement(kSets); } // of length > 1 if (index > 1) { if (m_Ls.size() > 0) kSets = (FastVector) m_Ls.lastElement(); m_Ls.removeAllElements(); i = index - 2; kMinusOneSets = kSets; kSets = ItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances()); hashtable = ItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size()); m_hashtables.addElement(hashtable); kSets = ItemSet.pruneItemSets(kSets, hashtable); ItemSet.upDateCounters(kSets, m_instances); kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE); if (kSets.size() == 0) return; m_Ls.addElement(kSets); } }
/**
 * Creates a new, empty {@code Instances} object named "semantic-space" over the attributes
 * returned by {@code buildCosineAttributes()}, using the last of those attributes as the class.
 *
 * @return the freshly created dataset (1000 is passed as the third constructor argument,
 *     presumably the initial capacity -- confirm against the Instances API)
 */
public Instances initializeInstances() {
  FastVector cosineAttributes = buildCosineAttributes();
  // The class attribute is, by construction here, the last entry of the attribute list.
  Attribute classAttribute = (Attribute) cosineAttributes.lastElement();

  Instances semanticSpace = new Instances("semantic-space", cosineAttributes, 1000);
  semanticSpace.setClass(classAttribute);
  return semanticSpace;
}
/**
 * Appends a rule to the ruleset and records its statistics.
 *
 * <p>The rule is evaluated (via {@code computeSimpleStats}) on the second element of the most
 * recently stored split in {@code m_Filtered}, or on the full data when no rule has been added
 * before. The resulting split, the six simple statistics and the class counts are appended to
 * {@code m_Filtered}, {@code m_SimpleStats} and {@code m_Distributions} respectively; each of
 * these containers is created on first use.
 *
 * @param lastRule the rule to be added
 */
public void addAndUpdate(Rule lastRule) {
  if (m_Ruleset == null) {
    m_Ruleset = new FastVector();
  }
  m_Ruleset.addElement(lastRule);

  // Decide which data the new rule sees; this must happen before m_Filtered is created below.
  Instances remaining;
  if (m_Filtered != null) {
    Instances[] previousSplit = (Instances[]) m_Filtered.lastElement();
    remaining = previousSplit[1];
  } else {
    remaining = m_Data;
  }

  double[] ruleStats = new double[6];
  double[] distribution = new double[m_Data.classAttribute().numValues()];
  int newRuleIndex = m_Ruleset.size() - 1;
  Instances[] split = computeSimpleStats(newRuleIndex, remaining, ruleStats, distribution);

  if (m_Filtered == null) {
    m_Filtered = new FastVector();
  }
  m_Filtered.addElement(split);

  if (m_SimpleStats == null) {
    m_SimpleStats = new FastVector();
  }
  m_SimpleStats.addElement(ruleStats);

  if (m_Distributions == null) {
    m_Distributions = new FastVector();
  }
  m_Distributions.addElement(distribution);
}
/**
 * Converts a question/answer pair into a single instance.
 *
 * <p>The query and answer lists are projected with {@code projector}, their similarity is
 * computed, and a two-value instance (similarity, 0.0) is created and attached to a one-off
 * dataset named "test" whose class is the last cosine attribute. The class value is set from
 * the pair's label, or marked missing when the label equals {@code "-1"}.
 *
 * @param pair the question/answer pair to convert
 * @return the instance, with its dataset reference set to the temporary "test" dataset
 */
public Instance buildWekaInstance(QAPair pair) {
  double[] queryVector = projector.transform(pair.getQueryList());
  double[] answerVector = projector.transform(pair.getAnswerList());
  double similarity = projector.computeCosignSimilarity(queryVector, answerVector);
  // Second slot starts at 0.0; presumably the class slot, which is set or cleared below.
  double[] values = new double[] {similarity, 0.0};

  FastVector schema = buildCosineAttributes();
  Attribute classAttribute = (Attribute) schema.lastElement();
  Instances holder = new Instances("test", schema, 1);
  holder.setClass(classAttribute);

  Instance result = new Instance(1, values);
  holder.add(result);
  result.setDataset(holder);

  if (pair.getLabel().equals("-1")) {
    // "-1" marks an unlabeled pair.
    result.setClassMissing();
  } else {
    result.setClassValue(pair.getLabel());
  }
  return result;
}
/** * Compute the combined DL of the ruleset in this class, i.e. theory DL and data DL. Note this * procedure computes the combined DL according to the current status of the ruleset in this class * * @param expFPRate expected FP/(FP+FN), used in dataDL calculation * @param predicted the default classification if ruleset covers null * @return the combined class */ public double combinedDL(double expFPRate, double predicted) { double rt = 0; if (getRulesetSize() > 0) { double[] stats = (double[]) m_SimpleStats.lastElement(); for (int j = getRulesetSize() - 2; j >= 0; j--) { stats[0] += getSimpleStats(j)[0]; stats[2] += getSimpleStats(j)[2]; stats[4] += getSimpleStats(j)[4]; } rt += dataDL(expFPRate, stats[0], stats[1], stats[4], stats[5]); // Data DL } else { // Null coverage ruleset double fn = 0.0; for (int j = 0; j < m_Data.numInstances(); j++) if ((int) m_Data.instance(j).classValue() == (int) predicted) fn += m_Data.instance(j).weight(); rt += dataDL(expFPRate, 0.0, m_Data.sumOfWeights(), 0.0, fn); } for (int i = 0; i < getRulesetSize(); i++) // Theory DL rt += theoryDL(i); return rt; }
/** * Method that generates all large itemsets with a minimum support, and from these all association * rules. * * @param instances the instances to be used for generating the associations * @exception Exception if rules can't be built successfully */ public void buildAssociations(Instances instances) throws Exception { int temp = m_premiseCount, exactNumber = m_numRules - 5; if (instances.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } m_instances = instances; m_instances.setClassIndex(m_instances.numAttributes() - 1); // prior estimation m_priorEstimator = new PriorEstimation(m_instances, m_numRandRules, m_numIntervals, false); m_priors = m_priorEstimator.estimatePrior(); m_midPoints = m_priorEstimator.getMidPoints(); m_Ls = new FastVector(); m_hashtables = new FastVector(); for (int i = 1; i < m_instances.numAttributes(); i++) { m_bestChanged = false; // find large item sets findLargeItemSets(i); // find association rules (rule generation procedure) findRulesQuickly(); if (m_bestChanged) { temp = m_premiseCount; while (RuleGeneration.expectation(m_premiseCount, m_premiseCount, m_midPoints, m_priors) <= m_expectation) { m_premiseCount++; if (m_premiseCount > m_instances.numInstances()) break; } } if (m_premiseCount > m_instances.numInstances()) { // Reserve space for variables m_allTheRules = new FastVector[3]; m_allTheRules[0] = new FastVector(); m_allTheRules[1] = new FastVector(); m_allTheRules[2] = new FastVector(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].insertElementAt((ItemSet) ((RuleItem) m_best.last()).premise(), k); m_allTheRules[1].insertElementAt((ItemSet) ((RuleItem) m_best.last()).consequence(), k); m_allTheRules[2].insertElementAt(new Double(((RuleItem) m_best.last()).accuracy()), k); boolean remove = m_best.remove(m_best.last()); k++; exactNumber--; } return; } if (temp != m_premiseCount && m_Ls.size() > 0) { FastVector kSets = (FastVector) m_Ls.lastElement(); 
m_Ls.removeElementAt(m_Ls.size() - 1); kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE); m_Ls.addElement(kSets); } } // Reserve space for variables m_allTheRules = new FastVector[3]; m_allTheRules[0] = new FastVector(); m_allTheRules[1] = new FastVector(); m_allTheRules[2] = new FastVector(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].insertElementAt((ItemSet) ((RuleItem) m_best.last()).premise(), k); m_allTheRules[1].insertElementAt((ItemSet) ((RuleItem) m_best.last()).consequence(), k); m_allTheRules[2].insertElementAt(new Double(((RuleItem) m_best.last()).accuracy()), k); boolean remove = m_best.remove(m_best.last()); k++; exactNumber--; } }
/** * Compute the minimal data description length of the ruleset if the rule in the given position is * deleted.<br> * The min_data_DL_if_deleted = data_DL_if_deleted - potential * * @param index the index of the rule in question * @param expFPRate expected FP/(FP+FN), used in dataDL calculation * @param checkErr whether check if error rate >= 0.5 * @return the minDataDL */ public double minDataDLIfDeleted(int index, double expFPRate, boolean checkErr) { // System.out.println("!!!Enter without: "); double[] rulesetStat = new double[6]; // Stats of ruleset if deleted int more = m_Ruleset.size() - 1 - index; // How many rules after? FastVector indexPlus = new FastVector(more); // Their stats // 0...(index-1) are OK for (int j = 0; j < index; j++) { // Covered stats are cumulative rulesetStat[0] += ((double[]) m_SimpleStats.elementAt(j))[0]; rulesetStat[2] += ((double[]) m_SimpleStats.elementAt(j))[2]; rulesetStat[4] += ((double[]) m_SimpleStats.elementAt(j))[4]; } // Recount data from index+1 Instances data = (index == 0) ? 
m_Data : ((Instances[]) m_Filtered.elementAt(index - 1))[1]; // System.out.println("!!!without: " + data.sumOfWeights()); for (int j = (index + 1); j < m_Ruleset.size(); j++) { double[] stats = new double[6]; Instances[] split = computeSimpleStats(j, data, stats, null); indexPlus.addElement(stats); rulesetStat[0] += stats[0]; rulesetStat[2] += stats[2]; rulesetStat[4] += stats[4]; data = split[1]; } // Uncovered stats are those of the last rule if (more > 0) { rulesetStat[1] = ((double[]) indexPlus.lastElement())[1]; rulesetStat[3] = ((double[]) indexPlus.lastElement())[3]; rulesetStat[5] = ((double[]) indexPlus.lastElement())[5]; } else if (index > 0) { rulesetStat[1] = ((double[]) m_SimpleStats.elementAt(index - 1))[1]; rulesetStat[3] = ((double[]) m_SimpleStats.elementAt(index - 1))[3]; rulesetStat[5] = ((double[]) m_SimpleStats.elementAt(index - 1))[5]; } else { // Null coverage rulesetStat[1] = ((double[]) m_SimpleStats.elementAt(0))[0] + ((double[]) m_SimpleStats.elementAt(0))[1]; rulesetStat[3] = ((double[]) m_SimpleStats.elementAt(0))[3] + ((double[]) m_SimpleStats.elementAt(0))[4]; rulesetStat[5] = ((double[]) m_SimpleStats.elementAt(0))[2] + ((double[]) m_SimpleStats.elementAt(0))[5]; } // Potential double potential = 0; for (int k = index + 1; k < m_Ruleset.size(); k++) { double[] ruleStat = (double[]) indexPlus.elementAt(k - index - 1); double ifDeleted = potential(k, expFPRate, rulesetStat, ruleStat, checkErr); if (!Double.isNaN(ifDeleted)) potential += ifDeleted; } // Data DL of the ruleset without the rule // Note that ruleset stats has already been updated to reflect // deletion if any potential double dataDLWithout = dataDL(expFPRate, rulesetStat[0], rulesetStat[1], rulesetStat[4], rulesetStat[5]); // System.out.println("!!!without: "+dataDLWithout + " |potential: "+ // potential); // Why subtract potential again? To reflect change of theory DL?? return (dataDLWithout - potential); }