/** * Method to check if all the subsets of size k-1 of a candidate of size k are freuqnet * * @param candidate a candidate itemset of size k * @param levelK_1 the frequent itemsets of size k-1 * @return true if all the subsets are frequet */ protected boolean allSubsetsOfSizeK_1AreFrequent(int[] candidate, List<Itemset> levelK_1) { // generate all subsets by always each item from the candidate, one by one for (int posRemoved = 0; posRemoved < candidate.length; posRemoved++) { // perform a binary search to check if the subset appears in level k-1. int first = 0; int last = levelK_1.size() - 1; // variable to remember if we found the subset boolean found = false; // the binary search while (first <= last) { int middle = (first + last) >> 1; // >>1 means to divide by 2 int comparison = ArraysAlgos.sameAs(levelK_1.get(middle).getItems(), candidate, posRemoved); if (comparison < 0) { first = middle + 1; // the itemset compared is larger than the subset according to the lexical // order } else if (comparison > 0) { last = middle - 1; // the itemset compared is smaller than the subset is smaller according to // the lexical order } else { found = true; // we have found it so we stop break; } } if (found == false) { // if we did not find it, that means that candidate is not a frequent itemset // because // at least one of its subsets does not appear in level k-1. return false; } } return true; }
/** * The ApGenRules as described in p.14 of the paper by Agrawal. (see the Agrawal paper for more * details). * * @param k the size of the first itemset used to generate rules * @param m the recursive depth of the call to this method (first time 1, then 2...) * @param lk the itemset that is used to generate rules * @param Hm a set of itemsets that can be used with lk to generate rules * @throws IOException exception if error while writing output file */ private void apGenrules(int k, int m, Itemset lk, List<int[]> Hm) throws IOException { // if the itemset "lk" that is used to generate rules is larger than the size of itemsets in // "Hm" if (k > m + 1) { // Create a list that we will be used to store itemsets for the recursive call List<int[]> Hm_plus_1_for_recursion = new ArrayList<int[]>(); // generate candidates using Hm List<int[]> Hm_plus_1 = generateCandidateSizeK(Hm); // for each such candidates for (int[] hm_P_1 : Hm_plus_1) { // We subtract the candidate from the itemset "lk" int[] itemset_Lk_minus_hm_P_1 = ArraysAlgos.cloneItemSetMinusAnItemset(lk.getItems(), hm_P_1); // We will now calculate the support of the rule Lk/(hm_P_1) ==> hm_P_1 // we need it to calculate the confidence int support = calculateSupport(itemset_Lk_minus_hm_P_1); double supportAsDouble = (double) support; // calculate the confidence of the rule Lk/(hm_P_1) ==> hm_P_1 double conf = lk.getAbsoluteSupport() / supportAsDouble; // if the confidence is not enough than we don't need to consider // the rule Lk/(hm_P_1) ==> hm_P_1 anymore so we continue if (conf < minconf || Double.isInfinite(conf)) { continue; } double lift = 0; int supportHm_P_1 = 0; // if the user is using the minlift threshold, then we will need to calculate the lift of // the // rule as well and check if the lift is higher or equal to minlift. if (usingLift) { // if we want to calculate the lift, we need the support of Hm+1 supportHm_P_1 = calculateSupport(hm_P_1); // calculate the lift of the rule: Lk/(hm_P_1) ==> hm_P_1 double term1 = ((double) lk.getAbsoluteSupport()) / databaseSize; double term2 = (supportAsDouble) / databaseSize; lift = term1 / (term2 * ((double) supportHm_P_1 / databaseSize)); // if the lift is not enough if (lift < minlift) { continue; } } // The rule has passed the confidence and lift threshold requirements, // so we can output it saveRule( itemset_Lk_minus_hm_P_1, support, hm_P_1, supportHm_P_1, lk.getAbsoluteSupport(), conf, lift); // if k == m+1, then we cannot explore further rules using Lk since Lk will be too small. if (k != m + 1) { Hm_plus_1_for_recursion.add(hm_P_1); } } // recursive call to apGenRules to find more rules using "lk" apGenrules(k, m + 1, lk, Hm_plus_1_for_recursion); } }
/** * Run the algorithm for generating association rules from a set of itemsets. * * @param patterns the set of itemsets * @param output the output file path. If null the result is saved in memory and returned by the * method. * @param databaseSize the number of transactions in the original database * @return the set of rules found if the user chose to save the result to memory * @throws IOException exception if error while writting to file */ private AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize) throws IOException { // if the user want to keep the result into memory if (output == null) { writer = null; rules = new AssocRules("ASSOCIATION RULES"); } else { // if the user want to save the result to a file rules = null; writer = new BufferedWriter(new FileWriter(output)); } this.databaseSize = databaseSize; // record the time when the algorithm starts startTimestamp = System.currentTimeMillis(); // initialize variable to count the number of rules found ruleCount = 0; // save itemsets in a member variable this.patterns = patterns; // SORTING // First, we sort all itemsets having the same size by lexical order // We do this for optimization purposes. If the itemsets are sorted, it allows to // perform two optimizations: // 1) When we need to calculate the support of an itemset (in the method // "calculateSupport()") we can use a binary search instead of browsing the whole list. // 2) When combining itemsets to generate candidate, we can use the // lexical order to avoid comparisons (in the method "generateCandidates()"). // For itemsets of the same size for (List<Itemset> itemsetsSameSize : patterns.getLevels()) { // Sort by lexicographical order using a Comparator Collections.sort( itemsetsSameSize, new Comparator<Itemset>() { @Override public int compare(Itemset o1, Itemset o2) { // The following code assume that itemsets are the same size return ArraysAlgos.comparatorItemsetSameSize.compare(o1.getItems(), o2.getItems()); } }); } // END OF SORTING // Now we will generate the rules. // For each frequent itemset of size >=2 that we will name "lk" for (int k = 2; k < patterns.getLevels().size(); k++) { for (Itemset lk : patterns.getLevels().get(k)) { // create a variable H1 for recursion List<int[]> H1_for_recursion = new ArrayList<int[]>(); // For each itemset "itemsetSize1" of size 1 that is member of lk for (int item : lk.getItems()) { int itemsetHm_P_1[] = new int[] {item}; // make a copy of lk without items from hm_P_1 int[] itemset_Lk_minus_hm_P_1 = ArraysAlgos.cloneItemSetMinusOneItem(lk.getItems(), item); // Now we will calculate the support and confidence // of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1 int support = calculateSupport(itemset_Lk_minus_hm_P_1); // THIS COULD BE // OPTIMIZED ? double supportAsDouble = (double) support; // calculate the confidence of the rule : itemset_Lk_minus_hm_P_1 ==> hm_P_1 double conf = lk.getAbsoluteSupport() / supportAsDouble; // if the confidence is lower than minconf if (conf < minconf || Double.isInfinite(conf)) { continue; } double lift = 0; int supportHm_P_1 = 0; // if the user is using the minlift threshold, we will need // to also calculate the lift of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1 if (usingLift) { // if we want to calculate the lift, we need the support of hm_P_1 supportHm_P_1 = calculateSupport( itemsetHm_P_1); // if we want to calculate the lift, we need to add this. // calculate the lift double term1 = ((double) lk.getAbsoluteSupport()) / databaseSize; double term2 = supportAsDouble / databaseSize; double term3 = ((double) supportHm_P_1 / databaseSize); lift = term1 / (term2 * term3); // if the lift is not enough if (lift < minlift) { continue; } } // If we are here, it means that the rule respect the minconf and minlift parameters. // Therefore, we output the rule. saveRule( itemset_Lk_minus_hm_P_1, support, itemsetHm_P_1, supportHm_P_1, lk.getAbsoluteSupport(), conf, lift); // Then we keep the itemset hm_P_1 to find more rules using this itemset and lk. H1_for_recursion.add(itemsetHm_P_1); // ================ END OF WHAT I HAVE ADDED } // Finally, we make a recursive call to continue explores rules that can be made with "lk" apGenrules(k, 1, lk, H1_for_recursion); } } // close the file if we saved the result to a file if (writer != null) { writer.close(); } // record the end time of the algorithm execution endTimeStamp = System.currentTimeMillis(); // Return the rules found if the user chose to save the result to memory rather than a file. // Otherwise, null will be returned return rules; }
/** * Try to expand a rule by right expansion only. * * @param ruleG the rule */ private void expandR(RuleG ruleG) { // map to record the potential item to expand the right side of the rule // Key: item Value: bitset indicating the IDs of the transaction containing the item // from the transactions containing the rule. Map<Integer, BitSet> mapCountRight = new HashMap<Integer, BitSet>(); // for each transaction containing the rule for (int tid = ruleG.common.nextSetBit(0); tid >= 0; tid = ruleG.common.nextSetBit(tid + 1)) { // iterate over the items in this transaction Iterator<Integer> iter = database.getTransactions().get(tid).getItems().iterator(); while (iter.hasNext()) { Integer item = iter.next(); // if that item is not frequent, then remove it from the transaction if (tableItemCount[item] < minsuppRelative) { iter.remove(); continue; } // If the item is smaller than the largest item in the right side // of the rule, we can stop this loop because items // are sorted in lexicographical order. if (item < ruleG.maxRight) { break; } // if the item is larger than the maximum item in the right side // and is not contained in the left side of the rule if (item > ruleG.maxRight && !ArraysAlgos.containsLEX(ruleG.getItemset1(), item, ruleG.maxLeft)) { // update the tidset of the item BitSet tidsItem = mapCountRight.get(item); if (tidsItem == null) { tidsItem = new BitSet(); mapCountRight.put(item, tidsItem); } tidsItem.set(tid); } } } // for each item c found in the previous step, we create a rule // I ==> J U {c} if the support is enough for (Entry<Integer, BitSet> entry : mapCountRight.entrySet()) { BitSet tidsRule = entry.getValue(); int ruleSupport = tidsRule.cardinality(); // if the support is enough if (ruleSupport >= minsuppRelative) { Integer itemC = entry.getKey(); // create new right part of rule Integer[] newRightItemset = new Integer[ruleG.getItemset2().length + 1]; System.arraycopy(ruleG.getItemset2(), 0, newRightItemset, 0, ruleG.getItemset2().length); newRightItemset[ruleG.getItemset2().length] = itemC; // recompute maxRight int maxRight = itemC >= ruleG.maxRight ? itemC : ruleG.maxRight; // calculate confidence double confidence = ((double) ruleSupport) / ruleG.tids1.cardinality(); // create the rule RuleG candidate = new RuleG( ruleG.getItemset1(), newRightItemset, ruleSupport, ruleG.tids1, tidsRule, ruleG.maxLeft, maxRight); // if the confidence is enough if (confidence >= minConfidence) { // save the rule to the current top-k rules save(candidate, ruleSupport); } // register the rule as a candidate for future expansion(s) registerAsCandidate(false, candidate); } } }
/** * Try to expand a rule by left and right expansions. * * @param ruleG the rule */ private void expandLR(RuleG ruleG) { // Maps to record the potential item to expand the left/right sides of the rule // Key: item Value: bitset indicating the IDs of the transaction containing the item // from the transactions containing the rule. Map<Integer, BitSet> mapCountLeft = new HashMap<Integer, BitSet>(); Map<Integer, BitSet> mapCountRight = new HashMap<Integer, BitSet>(); for (int tid = ruleG.common.nextSetBit(0); tid >= 0; tid = ruleG.common.nextSetBit(tid + 1)) { Iterator<Integer> iter = database.getTransactions().get(tid).getItems().iterator(); while (iter.hasNext()) { Integer item = iter.next(); // CAN DO THIS BECAUSE TRANSACTIONS ARE SORTED BY DESCENDING // ITEM IDS (see Database.Java) if (item < ruleG.maxLeft && item < ruleG.maxRight) { // break; } if (tableItemCount[item] < minsuppRelative) { iter.remove(); continue; } if (item > ruleG.maxLeft && !ArraysAlgos.containsLEX(ruleG.getItemset2(), item, ruleG.maxRight)) { BitSet tidsItem = mapCountLeft.get(item); if (tidsItem == null) { tidsItem = new BitSet(); mapCountLeft.put(item, tidsItem); } tidsItem.set(tid); } if (item > ruleG.maxRight && !ArraysAlgos.containsLEX(ruleG.getItemset1(), item, ruleG.maxLeft)) { BitSet tidsItem = mapCountRight.get(item); if (tidsItem == null) { tidsItem = new BitSet(); mapCountRight.put(item, tidsItem); } tidsItem.set(tid); } } } // for each item c found in the previous step, we create a rule // I ==> J U {c} if the support is enough for (Entry<Integer, BitSet> entry : mapCountRight.entrySet()) { BitSet tidsRule = entry.getValue(); int ruleSupport = tidsRule.cardinality(); // if the support is enough if (ruleSupport >= minsuppRelative) { Integer itemC = entry.getKey(); // create new right part of rule Integer[] newRightItemset = new Integer[ruleG.getItemset2().length + 1]; System.arraycopy(ruleG.getItemset2(), 0, newRightItemset, 0, ruleG.getItemset2().length); newRightItemset[ruleG.getItemset2().length] = itemC; // recompute maxRight int maxRight = (itemC >= ruleG.maxRight) ? itemC : ruleG.maxRight; // calculate the confidence of the rule double confidence = ((double) ruleSupport) / ruleG.tids1.cardinality(); // create the rule RuleG candidate = new RuleG( ruleG.getItemset1(), newRightItemset, ruleSupport, ruleG.tids1, tidsRule, ruleG.maxLeft, maxRight); // if the confidence is enough if (confidence >= minConfidence) { // save the rule in current top-k rules save(candidate, ruleSupport); } // register the rule as a candidate for future expansion registerAsCandidate(false, candidate); } } // for each item c found in the previous step, we create a rule // I U {c} ==> J if the support is enough for (Entry<Integer, BitSet> entry : mapCountLeft.entrySet()) { BitSet tidsRule = entry.getValue(); int ruleSupport = tidsRule.cardinality(); // if the support is enough if (ruleSupport >= minsuppRelative) { Integer itemC = entry.getKey(); // The tidset of the left itemset is calculated BitSet tidsLeft = (BitSet) ruleG.tids1.clone(); tidsLeft.and(tableItemTids[itemC]); // create new left part of rule Integer[] newLeftItemset = new Integer[ruleG.getItemset1().length + 1]; System.arraycopy(ruleG.getItemset1(), 0, newLeftItemset, 0, ruleG.getItemset1().length); newLeftItemset[ruleG.getItemset1().length] = itemC; // recompute maxLeft int maxLeft = itemC >= ruleG.maxLeft ? itemC : ruleG.maxLeft; // calculate the confidence of the rule double confidence = ((double) ruleSupport) / tidsLeft.cardinality(); // create the rule RuleG candidate = new RuleG( newLeftItemset, ruleG.getItemset2(), ruleSupport, tidsLeft, tidsRule, maxLeft, ruleG.maxRight); // if the confidence is high enough if (confidence >= minConfidence) { // save the rule to the top-k rules save(candidate, ruleSupport); } // register the rule as a candidate for further expansions registerAsCandidate(true, candidate); } } }