Beispiel #1
0
  /**
   * Checks whether every subset of size k-1 of a candidate of size k is frequent, i.e. appears in
   * the given list of frequent itemsets of size k-1.
   *
   * <p>Each subset is obtained by leaving out one position of {@code candidate}. It is looked up in
   * {@code levelK_1} with a binary search driven by {@code ArraysAlgos.sameAs}, so {@code levelK_1}
   * must already be sorted consistently with that comparison (NOTE(review): presumably lexical
   * order - confirm against the caller that sorts the levels).
   *
   * @param candidate a candidate itemset of size k
   * @param levelK_1 the frequent itemsets of size k-1
   * @return true if all the subsets are frequent; false as soon as one subset is not found
   */
  protected boolean allSubsetsOfSizeK_1AreFrequent(int[] candidate, List<Itemset> levelK_1) {
    // generate all subsets by always removing one item from the candidate, one by one
    for (int posRemoved = 0; posRemoved < candidate.length; posRemoved++) {

      // binary search bounds over level k-1
      int first = 0;
      int last = levelK_1.size() - 1;

      // remembers whether the current subset was found
      boolean found = false;

      while (first <= last) {
        // unsigned shift: the midpoint stays correct even if (first + last) overflows int
        int middle = (first + last) >>> 1;

        // compares the stored itemset with the candidate minus the item at posRemoved
        int comparison = ArraysAlgos.sameAs(levelK_1.get(middle).getItems(), candidate, posRemoved);
        if (comparison < 0) {
          // negative result: continue the search in the upper half
          first = middle + 1;
        } else if (comparison > 0) {
          // positive result: continue the search in the lower half
          last = middle - 1;
        } else {
          // exact match, no need to search further
          found = true;
          break;
        }
      }

      // if one subset is missing from level k-1, the candidate cannot be frequent
      if (!found) {
        return false;
      }
    }
    return true;
  }
  /**
   * The ApGenRules procedure as described in p.14 of the paper by Agrawal (see the Agrawal paper
   * for more details).
   *
   * <p>From the consequents in {@code Hm}, larger candidate consequents are generated. For each
   * candidate, the rule (lk minus candidate) ==> candidate is evaluated against the confidence
   * threshold and, when lift is in use, the lift threshold. Rules that pass are saved and their
   * consequents are handed to the next recursion level.
   *
   * @param k the size of the first itemset used to generate rules
   * @param m the recursive depth of the call to this method (first time 1, then 2...)
   * @param lk the itemset that is used to generate rules
   * @param Hm a set of itemsets that can be used with lk to generate rules
   * @throws IOException exception if error while writing output file
   */
  private void apGenrules(int k, int m, Itemset lk, List<int[]> Hm) throws IOException {
    // nothing to do unless "lk" is larger than the consequents that would be generated from "Hm"
    if (k <= m + 1) {
      return;
    }

    // consequents whose rule was accepted; they seed the recursive call
    List<int[]> acceptedConsequents = new ArrayList<int[]>();

    // every candidate consequent generated from Hm
    for (int[] consequent : generateCandidateSizeK(Hm)) {

      // antecedent = lk without the items of the consequent
      int[] antecedent = ArraysAlgos.cloneItemSetMinusAnItemset(lk.getItems(), consequent);

      // support of the antecedent, needed for the confidence
      int antecedentSupport = calculateSupport(antecedent);
      double antecedentSupportAsDouble = (double) antecedentSupport;

      // confidence of the rule: antecedent ==> consequent
      double confidence = lk.getAbsoluteSupport() / antecedentSupportAsDouble;

      // discard the rule when the confidence is below minconf or infinite
      boolean confidenceAccepted = !(confidence < minconf) && !Double.isInfinite(confidence);
      if (!confidenceAccepted) {
        continue;
      }

      // lift and consequent support stay at 0 when the minlift threshold is not in use
      double lift = 0;
      int consequentSupport = 0;
      if (usingLift) {
        // the lift additionally needs the support of the consequent
        consequentSupport = calculateSupport(consequent);

        double ruleFrequency = ((double) lk.getAbsoluteSupport()) / databaseSize;
        double antecedentFrequency = antecedentSupportAsDouble / databaseSize;
        double consequentFrequency = (double) consequentSupport / databaseSize;
        lift = ruleFrequency / (antecedentFrequency * consequentFrequency);

        // discard the rule when the lift is not high enough
        if (lift < minlift) {
          continue;
        }
      }

      // the rule satisfies the thresholds, so it is output
      saveRule(
          antecedent,
          antecedentSupport,
          consequent,
          consequentSupport,
          lk.getAbsoluteSupport(),
          confidence,
          lift);

      // k > m + 1 holds here (see the guard above), so the consequent is always kept
      acceptedConsequents.add(consequent);
    }

    // recursive call to find more rules using "lk" with the larger consequents
    apGenrules(k, m + 1, lk, acceptedConsequents);
  }
  /**
   * Run the algorithm for generating association rules from a set of itemsets.
   *
   * <p>The itemsets of each level are first sorted with {@code
   * ArraysAlgos.comparatorItemsetSameSize}. Then, for every itemset "lk" found in levels 2 and
   * above, the rules with a single-item consequent are evaluated and {@code apGenrules} is called
   * to explore larger consequents.
   *
   * <p>When the result is written to a file, the writer is closed in a {@code finally} block so
   * that the file handle is released even if rule generation throws.
   *
   * @param patterns the set of itemsets
   * @param output the output file path. If null the result is saved in memory and returned by the
   *     method.
   * @param databaseSize the number of transactions in the original database
   * @return the set of rules found if the user chose to save the result to memory, otherwise null
   * @throws IOException exception if error while writing to file
   */
  private AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize)
      throws IOException {

    // if the user wants to keep the result in memory
    if (output == null) {
      writer = null;
      rules = new AssocRules("ASSOCIATION RULES");
    } else {
      // if the user wants to save the result to a file
      rules = null;
      writer = new BufferedWriter(new FileWriter(output));
    }

    try {
      this.databaseSize = databaseSize;

      // record the time when the algorithm starts
      startTimestamp = System.currentTimeMillis();
      // initialize variable to count the number of rules found
      ruleCount = 0;
      // save itemsets in a member variable
      this.patterns = patterns;

      // SORTING
      // First, we sort all itemsets having the same size by lexical order.
      // We do this for optimization purposes. If the itemsets are sorted, it allows to
      // perform two optimizations:
      // 1) When we need to calculate the support of an itemset (in the method
      // "calculateSupport()") we can use a binary search instead of browsing the whole list.
      // 2) When combining itemsets to generate candidates, we can use the
      //    lexical order to avoid comparisons (in the method "generateCandidates()").

      // For itemsets of the same size
      for (List<Itemset> itemsetsSameSize : patterns.getLevels()) {
        // Sort by lexicographical order using a Comparator
        Collections.sort(
            itemsetsSameSize,
            new Comparator<Itemset>() {
              @Override
              public int compare(Itemset o1, Itemset o2) {
                // The following code assumes that itemsets are the same size
                return ArraysAlgos.comparatorItemsetSameSize.compare(
                    o1.getItems(), o2.getItems());
              }
            });
      }
      // END OF SORTING

      // Now we will generate the rules.

      // For each frequent itemset of size >=2 that we will name "lk"
      for (int k = 2; k < patterns.getLevels().size(); k++) {
        for (Itemset lk : patterns.getLevels().get(k)) {

          // consequents of size 1 that produced a valid rule, used for the recursion
          List<int[]> H1_for_recursion = new ArrayList<int[]>();

          // For each item that is member of lk, taken as a consequent of size 1
          for (int item : lk.getItems()) {
            int[] itemsetHm_P_1 = new int[] {item};

            // make a copy of lk without the item of hm_P_1
            int[] itemset_Lk_minus_hm_P_1 =
                ArraysAlgos.cloneItemSetMinusOneItem(lk.getItems(), item);

            // Now we will calculate the support and confidence
            // of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1
            // (this lookup could possibly be optimized)
            int support = calculateSupport(itemset_Lk_minus_hm_P_1);
            double supportAsDouble = (double) support;

            // calculate the confidence of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1
            double conf = lk.getAbsoluteSupport() / supportAsDouble;

            // skip the rule if the confidence is lower than minconf or infinite
            if (conf < minconf || Double.isInfinite(conf)) {
              continue;
            }

            double lift = 0;
            int supportHm_P_1 = 0;
            // if the user is using the minlift threshold, we will need
            // to also calculate the lift of the rule: itemset_Lk_minus_hm_P_1 ==> hm_P_1
            if (usingLift) {
              // the lift needs the support of hm_P_1
              supportHm_P_1 = calculateSupport(itemsetHm_P_1);
              // calculate the lift
              double term1 = ((double) lk.getAbsoluteSupport()) / databaseSize;
              double term2 = supportAsDouble / databaseSize;
              double term3 = ((double) supportHm_P_1 / databaseSize);
              lift = term1 / (term2 * term3);

              // if the lift is not enough
              if (lift < minlift) {
                continue;
              }
            }

            // If we are here, the rule respects the minconf and minlift parameters.
            // Therefore, we output the rule.
            saveRule(
                itemset_Lk_minus_hm_P_1,
                support,
                itemsetHm_P_1,
                supportHm_P_1,
                lk.getAbsoluteSupport(),
                conf,
                lift);

            // Then we keep the itemset hm_P_1 to find more rules using this itemset and lk.
            H1_for_recursion.add(itemsetHm_P_1);
          }

          // Finally, we make a recursive call to continue exploring rules that can be made with
          // "lk"
          apGenrules(k, 1, lk, H1_for_recursion);
        }
      }
    } finally {
      // close the file if we saved the result to a file, even when rule generation failed
      if (writer != null) {
        writer.close();
      }
    }
    // record the end time of the algorithm execution
    endTimeStamp = System.currentTimeMillis();

    // Return the rules found if the user chose to save the result to memory rather than a file.
    // Otherwise, null will be returned
    return rules;
  }
  /**
   * Try to expand a rule by right expansion only.
   *
   * <p>First, the transactions containing the rule are scanned to collect, for each item that may
   * be appended to the right side, the IDs of the transactions containing it. Then, for each such
   * item c with enough support, the rule I ==> J U {c} is created, saved if its confidence is
   * high enough, and registered as a candidate for later expansions.
   *
   * @param ruleG the rule
   */
  private void expandR(RuleG ruleG) {
    // Key: item   Value: IDs of the transactions (among those containing the rule) with that item
    Map<Integer, BitSet> tidsByRightItem = new HashMap<Integer, BitSet>();

    // visit each transaction containing the rule
    int tid = ruleG.common.nextSetBit(0);
    while (tid >= 0) {
      // an explicit iterator is used because infrequent items are removed while scanning
      for (Iterator<Integer> it = database.getTransactions().get(tid).getItems().iterator();
          it.hasNext(); ) {
        Integer item = it.next();

        // an item that is not frequent is removed through the iterator
        if (tableItemCount[item] < minsuppRelative) {
          it.remove();
          continue;
        }

        // Items below the largest item of the right side end the scan of this transaction
        // (NOTE(review): relies on the items being visited in descending order - confirm).
        if (item < ruleG.maxRight) {
          break;
        }

        // only items larger than the maximum right item and absent from the left side qualify
        if (item > ruleG.maxRight) {
          if (!ArraysAlgos.containsLEX(ruleG.getItemset1(), item, ruleG.maxLeft)) {
            // add this transaction to the tidset of the item, creating the tidset if needed
            BitSet tidsOfItem = tidsByRightItem.get(item);
            if (tidsOfItem != null) {
              tidsOfItem.set(tid);
            } else {
              BitSet freshTids = new BitSet();
              freshTids.set(tid);
              tidsByRightItem.put(item, freshTids);
            }
          }
        }
      }
      tid = ruleG.common.nextSetBit(tid + 1);
    }

    // for each item c collected above, build the rule I ==> J U {c} if the support is enough
    for (Entry<Integer, BitSet> rightEntry : tidsByRightItem.entrySet()) {
      BitSet tidsRule = rightEntry.getValue();
      int ruleSupport = tidsRule.cardinality();

      // not enough support: skip this item
      if (ruleSupport < minsuppRelative) {
        continue;
      }
      Integer itemC = rightEntry.getKey();

      // new right side = old right side followed by c
      int oldRightLength = ruleG.getItemset2().length;
      Integer[] newRightItemset = new Integer[oldRightLength + 1];
      System.arraycopy(ruleG.getItemset2(), 0, newRightItemset, 0, oldRightLength);
      newRightItemset[oldRightLength] = itemC;

      // largest item of the new right side
      int maxRight;
      if (itemC >= ruleG.maxRight) {
        maxRight = itemC;
      } else {
        maxRight = ruleG.maxRight;
      }

      // confidence = support of the rule / number of transactions containing the left side
      double confidence = ((double) ruleSupport) / ruleG.tids1.cardinality();

      RuleG candidate =
          new RuleG(
              ruleG.getItemset1(),
              newRightItemset,
              ruleSupport,
              ruleG.tids1,
              tidsRule,
              ruleG.maxLeft,
              maxRight);

      // a rule with enough confidence goes into the current top-k rules
      if (confidence >= minConfidence) {
        save(candidate, ruleSupport);
      }
      // in every case the rule is registered as a candidate for future expansion(s)
      registerAsCandidate(false, candidate);
    }
  }
  /**
   * Try to expand a rule by left and right expansions.
   *
   * <p>First, the transactions containing the rule are scanned to collect, separately for the left
   * and the right side, the items that may be appended together with the IDs of the transactions
   * containing them. Then each item c with enough support yields a rule I ==> J U {c} (right
   * expansion) or I U {c} ==> J (left expansion). Such a rule is saved if its confidence is high
   * enough and is always registered as a candidate for later expansions.
   *
   * @param ruleG the rule
   */
  private void expandLR(RuleG ruleG) {
    // Key: item   Value: IDs of the transactions (among those containing the rule) with that item
    Map<Integer, BitSet> tidsByLeftItem = new HashMap<Integer, BitSet>();
    Map<Integer, BitSet> tidsByRightItem = new HashMap<Integer, BitSet>();

    // visit each transaction containing the rule
    int tid = ruleG.common.nextSetBit(0);
    while (tid >= 0) {
      // an explicit iterator is used because infrequent items are removed while scanning
      for (Iterator<Integer> it = database.getTransactions().get(tid).getItems().iterator();
          it.hasNext(); ) {
        Integer item = it.next();

        // Stop scanning this transaction once the item is below both maxima; the original note
        // says this is valid because transactions are sorted by descending item IDs
        // (see Database.java).
        if (item < ruleG.maxLeft && item < ruleG.maxRight) {
          break;
        }

        // an item that is not frequent is removed through the iterator
        if (tableItemCount[item] < minsuppRelative) {
          it.remove();
          continue;
        }

        // candidate for the left side: larger than maxLeft and absent from the right side
        if (item > ruleG.maxLeft
            && !ArraysAlgos.containsLEX(ruleG.getItemset2(), item, ruleG.maxRight)) {
          BitSet leftTids = tidsByLeftItem.get(item);
          if (leftTids != null) {
            leftTids.set(tid);
          } else {
            BitSet freshTids = new BitSet();
            freshTids.set(tid);
            tidsByLeftItem.put(item, freshTids);
          }
        }

        // candidate for the right side: larger than maxRight and absent from the left side
        if (item > ruleG.maxRight
            && !ArraysAlgos.containsLEX(ruleG.getItemset1(), item, ruleG.maxLeft)) {
          BitSet rightTids = tidsByRightItem.get(item);
          if (rightTids != null) {
            rightTids.set(tid);
          } else {
            BitSet freshTids = new BitSet();
            freshTids.set(tid);
            tidsByRightItem.put(item, freshTids);
          }
        }
      }
      tid = ruleG.common.nextSetBit(tid + 1);
    }

    // right expansions: for each item c, build the rule I ==> J U {c} if the support is enough
    for (Entry<Integer, BitSet> rightEntry : tidsByRightItem.entrySet()) {
      BitSet tidsRule = rightEntry.getValue();
      int ruleSupport = tidsRule.cardinality();

      // not enough support: skip this item
      if (ruleSupport < minsuppRelative) {
        continue;
      }
      Integer itemC = rightEntry.getKey();

      // new right side = old right side followed by c
      int oldRightLength = ruleG.getItemset2().length;
      Integer[] newRightItemset = new Integer[oldRightLength + 1];
      System.arraycopy(ruleG.getItemset2(), 0, newRightItemset, 0, oldRightLength);
      newRightItemset[oldRightLength] = itemC;

      // largest item of the new right side
      int maxRight;
      if (itemC >= ruleG.maxRight) {
        maxRight = itemC;
      } else {
        maxRight = ruleG.maxRight;
      }

      // confidence = support of the rule / number of transactions containing the left side
      double confidence = ((double) ruleSupport) / ruleG.tids1.cardinality();

      RuleG candidate =
          new RuleG(
              ruleG.getItemset1(),
              newRightItemset,
              ruleSupport,
              ruleG.tids1,
              tidsRule,
              ruleG.maxLeft,
              maxRight);

      // a rule with enough confidence goes into the current top-k rules
      if (confidence >= minConfidence) {
        save(candidate, ruleSupport);
      }
      // in every case the rule is registered as a candidate for future expansion
      registerAsCandidate(false, candidate);
    }

    // left expansions: for each item c, build the rule I U {c} ==> J if the support is enough
    for (Entry<Integer, BitSet> leftEntry : tidsByLeftItem.entrySet()) {
      BitSet tidsRule = leftEntry.getValue();
      int ruleSupport = tidsRule.cardinality();

      // not enough support: skip this item
      if (ruleSupport < minsuppRelative) {
        continue;
      }
      Integer itemC = leftEntry.getKey();

      // tidset of the new left side = tidset of the old left side intersected with that of c
      BitSet tidsLeft = (BitSet) ruleG.tids1.clone();
      tidsLeft.and(tableItemTids[itemC]);

      // new left side = old left side followed by c
      int oldLeftLength = ruleG.getItemset1().length;
      Integer[] newLeftItemset = new Integer[oldLeftLength + 1];
      System.arraycopy(ruleG.getItemset1(), 0, newLeftItemset, 0, oldLeftLength);
      newLeftItemset[oldLeftLength] = itemC;

      // largest item of the new left side
      int maxLeft;
      if (itemC >= ruleG.maxLeft) {
        maxLeft = itemC;
      } else {
        maxLeft = ruleG.maxLeft;
      }

      // confidence = support of the rule / number of transactions containing the new left side
      double confidence = ((double) ruleSupport) / tidsLeft.cardinality();

      RuleG candidate =
          new RuleG(
              newLeftItemset,
              ruleG.getItemset2(),
              ruleSupport,
              tidsLeft,
              tidsRule,
              maxLeft,
              ruleG.maxRight);

      // a rule with enough confidence goes into the top-k rules
      if (confidence >= minConfidence) {
        save(candidate, ruleSupport);
      }
      // in every case the rule is registered as a candidate for further expansions
      registerAsCandidate(true, candidate);
    }
  }