Exemplo n.º 1
0
  /**
   * Recursive rule-generation step (the "ap-genrules" procedure described on p.14 of the paper by
   * Agrawal). Each call enlarges the consequents received in "Hm" by one item, outputs every rule
   * built from "lk" that satisfies the thresholds, and recurses with the consequents that were
   * kept.
   *
   * @param k the size of the itemset "lk"
   * @param m the size of the consequents in "Hm" (1 on the first call, then 2, ...)
   * @param lk the itemset whose items are split between the antecedent and the consequent
   * @param Hm consequents of size m that can be combined with "lk" to generate rules
   * @throws IOException exception if error while writing output file
   */
  private void apGenrules(int k, int m, Itemset lk, List<int[]> Hm) throws IOException {

    // A consequent of size m+1 must leave at least one item of "lk" for the antecedent;
    // otherwise there is nothing left to generate.
    if (k <= m + 1) {
      return;
    }

    // Consequents of size m+1 that produced a valid rule; they feed the next recursion level.
    List<int[]> retainedConsequents = new ArrayList<int[]>();

    // Try every candidate consequent of size m+1 derived from Hm.
    for (int[] consequent : generateCandidateSizeK(Hm)) {

      // The antecedent is "lk" without the items of the consequent.
      int[] antecedent = ArraysAlgos.cloneItemSetMinusAnItemset(lk.getItems(), consequent);

      // The support of the antecedent is required to compute the confidence.
      int antecedentSupport = calculateSupport(antecedent);
      double antecedentSupportAsDouble = (double) antecedentSupport;
      int ruleSupport = lk.getAbsoluteSupport();

      // confidence(antecedent ==> consequent) = support(lk) / support(antecedent)
      double confidence = ruleSupport / antecedentSupportAsDouble;

      // Drop the rule when the confidence is too low, or infinite (antecedent support of 0).
      if (confidence < minconf || Double.isInfinite(confidence)) {
        continue;
      }

      double lift = 0;
      int consequentSupport = 0;
      // The lift (and the consequent support it needs) is only computed when the user
      // supplied a minlift threshold.
      if (usingLift) {
        consequentSupport = calculateSupport(consequent);

        // Relative frequencies of the whole rule, of the antecedent and of the consequent.
        double ruleFrequency = ((double) ruleSupport) / databaseSize;
        double antecedentFrequency = antecedentSupportAsDouble / databaseSize;
        double consequentFrequency = (double) consequentSupport / databaseSize;

        lift = ruleFrequency / (antecedentFrequency * consequentFrequency);

        // Drop the rule when the lift is too low.
        if (lift < minlift) {
          continue;
        }
      }

      // The rule satisfies every active threshold: output it.
      saveRule(
          antecedent, antecedentSupport, consequent, consequentSupport, ruleSupport, confidence,
          lift);

      // k > m + 1 holds for this whole call (see the guard above), so every accepted
      // consequent is kept for the next level.
      retainedConsequents.add(consequent);
    }

    // Look for more rules over "lk" with consequents that are one item larger.
    apGenrules(k, m + 1, lk, retainedConsequents);
  }
Exemplo n.º 2
0
  /**
   * Calculate the support of an itemset by looking it up in the stored frequent patterns.
   *
   * <p>The patterns having the same size as the itemset are searched first. Because each level is
   * sorted by lexical order (see the sorting step of "runAlgorithm"), a binary search is used,
   * which is much more efficient than browsing the full list. If the itemset is not stored at its
   * own size, the larger levels are scanned by increasing size and the support of the first
   * pattern that contains all items of the itemset is returned.
   *
   * <p>The containment test stops as soon as a pattern item is larger than the item being looked
   * for, so it relies on the items of both arrays being in ascending order.
   *
   * @param itemset the itemset (expected to contain at least one item)
   * @return the support of the itemset
   * @throws IllegalStateException if no stored pattern is equal to or contains the itemset
   */
  private int calculateSupport(int[] itemset) {
    // Number of levels available; level i holds the patterns of size i.
    final int levelCount = patterns.getLevels().size();

    // STEP 1: binary search among the patterns having the same size as "itemset".
    if (itemset.length < levelCount) {
      List<Itemset> patternsSameSize = patterns.getLevels().get(itemset.length);
      int first = 0;
      int last = patternsSameSize.size() - 1;

      while (first <= last) {
        // Unsigned shift so that the midpoint stays correct even if first + last overflows.
        int middle = (first + last) >>> 1;
        Itemset patternMiddle = patternsSameSize.get(middle);

        int comparison =
            ArraysAlgos.comparatorItemsetSameSize.compare(itemset, patternMiddle.getItems());
        if (comparison > 0) {
          // "itemset" is after the middle pattern according to the lexical order
          first = middle + 1;
        } else if (comparison < 0) {
          // "itemset" is before the middle pattern according to the lexical order
          last = middle - 1;
        } else {
          // we have found the itemset, so we return its support.
          return patternMiddle.getAbsoluteSupport();
        }
      }
    }

    // STEP 2: the itemset is not stored at its own size (it is not closed), so we look for its
    // smallest superset (its closure), starting from size |itemset|+1 and increasing the size
    // until the last level.
    for (int size = itemset.length + 1; size < levelCount; size++) {
      for (Itemset pattern : patterns.getLevels().get(size)) {
        int[] patternArray = pattern.getItems();

        // Patterns are sorted: once the first item of a pattern is larger than the first item
        // of the itemset, no remaining pattern of this size can contain the itemset.
        if (patternArray[0] > itemset[0]) {
          break;
        }

        // Otherwise, we check if itemset is contained in pattern
        int posItemset = 0;
        for (int posPattern = 0; posPattern < patternArray.length; posPattern++) {
          if (patternArray[posPattern] == itemset[posItemset]) {
            posItemset++;
            // if it is contained completely
            if (posItemset == itemset.length) {
              return pattern.getAbsoluteSupport();
            }
          } else if (patternArray[posPattern] > itemset[posItemset]) {
            // the item we are looking for was skipped, so itemset cannot be contained in
            // this pattern and we stop considering it
            break;
          }
        }
      }
    }

    // Every level has been examined without success: fail with an explicit message rather than
    // running past the last level.
    throw new IllegalStateException(
        "No frequent pattern is equal to or contains the itemset "
            + java.util.Arrays.toString(itemset));
  }
Exemplo n.º 3
0
  /**
   * Run the algorithm for generating association rules from a set of itemsets.
   *
   * <p>The itemsets of each size are first sorted by lexical order (this sorts the lists of
   * "patterns" in place). Then, for each itemset "lk" of size >= 2, the rules having a single
   * item as consequent are evaluated, and "apGenrules" is called to explore larger consequents.
   *
   * <p>When the result is saved to a file, the file is closed before this method returns, even
   * if an exception interrupts the generation of rules.
   *
   * @param patterns the set of itemsets
   * @param output the output file path. If null the result is saved in memory and returned by the
   *     method.
   * @param databaseSize the number of transactions in the original database
   * @return the set of rules found if the user chose to save the result to memory, otherwise null
   * @throws IOException exception if error while writing to file
   */
  private AssocRules runAlgorithm(Itemsets patterns, String output, int databaseSize)
      throws IOException {

    // if the user want to keep the result into memory
    if (output == null) {
      writer = null;
      rules = new AssocRules("ASSOCIATION RULES");
    } else {
      // if the user want to save the result to a file
      rules = null;
      writer = new BufferedWriter(new FileWriter(output));
    }

    // From here on the output file may be open, so everything runs inside a try block whose
    // finally clause releases it whatever happens.
    try {
      this.databaseSize = databaseSize;

      // record the time when the algorithm starts
      startTimestamp = System.currentTimeMillis();
      // initialize variable to count the number of rules found
      ruleCount = 0;
      // save itemsets in a member variable
      this.patterns = patterns;

      // SORTING
      // First, we sort all itemsets having the same size by lexical order.
      // We do this for optimization purposes. If the itemsets are sorted, it allows to
      // perform two optimizations:
      // 1) When we need to calculate the support of an itemset (in the method
      // "calculateSupport()") we can use a binary search instead of browsing the whole list.
      // 2) When combining itemsets to generate candidates, we can use the
      //    lexical order to avoid comparisons.
      for (List<Itemset> itemsetsSameSize : patterns.getLevels()) {
        Collections.sort(
            itemsetsSameSize,
            new Comparator<Itemset>() {
              @Override
              public int compare(Itemset o1, Itemset o2) {
                // The following code assume that itemsets are the same size
                return ArraysAlgos.comparatorItemsetSameSize.compare(
                    o1.getItems(), o2.getItems());
              }
            });
      }
      // END OF SORTING

      // Now we will generate the rules.

      // For each frequent itemset of size >=2 that we will name "lk"
      for (int k = 2; k < patterns.getLevels().size(); k++) {
        for (Itemset lk : patterns.getLevels().get(k)) {

          // consequents of size 1 that produced a valid rule, kept for the recursion
          List<int[]> H1_for_recursion = new ArrayList<int[]>();

          // For each item of lk, used alone as the consequent "hm_P_1"
          for (int item : lk.getItems()) {
            int[] itemsetHm_P_1 = new int[] {item};

            // make a copy of lk without the item of hm_P_1 (the antecedent)
            int[] itemset_Lk_minus_hm_P_1 =
                ArraysAlgos.cloneItemSetMinusOneItem(lk.getItems(), item);

            // Now we will calculate the support and confidence
            // of the rule: itemset_Lk_minus_hm_P_1 ==>  hm_P_1
            int support = calculateSupport(itemset_Lk_minus_hm_P_1);
            double supportAsDouble = (double) support;

            // calculate the confidence of the rule : itemset_Lk_minus_hm_P_1 ==>  hm_P_1
            double conf = lk.getAbsoluteSupport() / supportAsDouble;

            // skip the rule if the confidence is lower than minconf, or infinite
            // (which happens when the support of the antecedent is 0)
            if (conf < minconf || Double.isInfinite(conf)) {
              continue;
            }

            double lift = 0;
            int supportHm_P_1 = 0;
            // if the user is using the minlift threshold, we will need
            // to also calculate the lift of the rule:  itemset_Lk_minus_hm_P_1 ==>  hm_P_1
            if (usingLift) {
              // to calculate the lift, we need the support of hm_P_1
              supportHm_P_1 = calculateSupport(itemsetHm_P_1);
              // calculate the lift
              double term1 = ((double) lk.getAbsoluteSupport()) / databaseSize;
              double term2 = supportAsDouble / databaseSize;
              double term3 = ((double) supportHm_P_1 / databaseSize);
              lift = term1 / (term2 * term3);

              // if the lift is not enough
              if (lift < minlift) {
                continue;
              }
            }

            // If we are here, it means that the rule respect the minconf and minlift
            // parameters. Therefore, we output the rule.
            saveRule(
                itemset_Lk_minus_hm_P_1,
                support,
                itemsetHm_P_1,
                supportHm_P_1,
                lk.getAbsoluteSupport(),
                conf,
                lift);

            // Then we keep the itemset hm_P_1 to find more rules using this itemset and lk.
            H1_for_recursion.add(itemsetHm_P_1);
          }

          // Finally, we make a recursive call to continue exploring the rules that can be made
          // with "lk"
          apGenrules(k, 1, lk, H1_for_recursion);
        }
      }
    } finally {
      // close the file if we saved the result to a file; doing it here guarantees that the
      // file is flushed and released even when the generation of rules fails
      if (writer != null) {
        writer.close();
      }
    }

    // record the end time of the algorithm execution
    endTimeStamp = System.currentTimeMillis();

    // Return the rules found if the user chose to save the result to memory rather than a file.
    // Otherwise, null will be returned
    return rules;
  }