コード例 #1
0
 /**
  * Runs the algorithm with a minimum support expressed as a fraction of the database size.
  *
  * <p>The input file is loaded into memory, the fractional support is converted into a number of
  * sequences (rounded up), and the work is then delegated to the overload that takes the support
  * as an integer.
  *
  * @param minSupport minimum support as a fraction of the number of sequences (ex: 0.05 = 5 %)
  * @param minConfidence minimum confidence (a value between 0 and 1).
  * @param input the input file path
  * @param output the output file path
  * @param windowSize a window size
  * @throws IOException if the input file cannot be loaded, or if there is an error
  *     reading/writing files
  */
 public void runAlgorithm(
     double minSupport, double minConfidence, String input, String output, int windowSize)
     throws IOException {
   // load the input file into memory
   try {
     this.database = new SequenceDatabase();
     database.loadFile(input);
   } catch (Exception e) {
     // A failed load must not be ignored: mining a missing or half-loaded database would
     // silently produce a wrong result. Forget the database and report the failure.
     this.database = null;
     if (e instanceof IOException) {
       throw (IOException) e;
     }
     throw new IOException("Unable to load the sequence database from " + input, e);
   }
   // convert the fractional minimum support into a number of sequences (rounded up)
   this.minsuppRelative = (int) Math.ceil(minSupport * database.size());
   // run the algorithm
   runAlgorithm(input, output, minsuppRelative, minConfidence, windowSize);
 }
コード例 #2
0
  /**
   * Indexes every item of the database in a single pass, then strips the infrequent items from
   * the sequences.
   *
   * <p>The index is stored in the {@code mapItemCount} field. An item's support is the number of
   * distinct sequences in which it appears, i.e. the size of its inner map. Items whose support is
   * below {@code minsuppRelative} are removed from the itemsets in place; their entries stay in
   * the index.
   *
   * @param database a sequence database; its itemsets are modified in place
   * @return the index: item -> (sequence id -> Occurence), where each Occurence has been given
   *     the itemset positions at which the item appears in that sequence
   */
  private Map<String, Map<Integer, Occurence>> removeItemsThatAreNotFrequent(
      SequenceDatabase database) {
    // Pass 1: build the index <item, <sequence id, occurrence>>.
    mapItemCount = new HashMap<String, Map<Integer, Occurence>>();

    for (Sequence seq : database.getSequences()) {
      final int seqId = seq.getId();
      final int itemsetCount = seq.getItemsets().size();
      // the position stays a short because that is what Occurence.add receives
      for (short pos = 0; pos < itemsetCount; pos++) {
        for (String item : seq.get(pos)) {
          // fetch (or lazily create) the per-sequence map of this item
          Map<Integer, Occurence> perSequence = mapItemCount.get(item);
          if (perSequence == null) {
            perSequence = new HashMap<Integer, Occurence>();
            mapItemCount.put(item, perSequence);
          }
          // fetch (or lazily create) the occurrence record for this sequence
          Occurence occ = perSequence.get(seqId);
          if (occ == null) {
            occ = new Occurence(seqId);
            perSequence.put(seqId, occ);
          }
          occ.add(pos);
        }
      }
    }

    // Pass 2: drop every item whose support is below the threshold.
    for (Sequence seq : database.getSequences()) {
      for (List<String> itemset : seq.getItemsets()) {
        // walk backwards so that a removal never shifts an index still to be visited
        for (int idx = itemset.size() - 1; idx >= 0; idx--) {
          if (mapItemCount.get(itemset.get(idx)).size() < minsuppRelative) {
            itemset.remove(idx);
          }
        }
      }
    }
    return mapItemCount;
  }
コード例 #3
0
 /**
  * Builds, for every item, the set of IDs of the sequences in which that item occurs.
  *
  * @param contexte the sequence database to scan
  * @return a map from each item to the IDs of the sequences that contain it
  */
 private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) {
   // Key: item   Value: IDs of the sequences in which the item has been seen so far
   Map<String, Set<Integer>> idsByItem = new HashMap<String, Set<Integer>>();

   for (Sequence sequence : contexte.getSequences()) {
     for (List<String> itemset : sequence.getItemsets()) {
       for (String item : itemset) {
         Set<Integer> ids = idsByItem.get(item);
         if (ids != null) {
           // item already known: just note the current sequence
           ids.add(sequence.getId());
           continue;
         }
         // first time this item is met: start its set with the current sequence
         ids = new HashSet<Integer>();
         idsByItem.put(item, ids);
         ids.add(sequence.getId());
       }
     }
   }
   return idsByItem;
 }
コード例 #4
0
  /**
   * Entry point of the PrefixSpan mining: finds the frequent single items, saves each of them as
   * a pattern, and starts the recursive search from each one.
   *
   * @param database a sequence database
   * @param outputFilePath the path of the file in which the result is written, or null to keep
   *     the result in memory
   * @throws IOException if the output file cannot be opened; presumably also when saving a
   *     pattern to the file fails
   */
  private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException {
    if (outputFilePath != null) {
      // file mode: patterns are written out, nothing is kept in memory
      patterns = null;
      writer = new BufferedWriter(new FileWriter(outputFilePath));
    } else {
      // memory mode: patterns are collected, no writer is needed
      writer = null;
      patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
    }

    // One database scan: for every item, the IDs of the sequences containing it.
    Map<String, Set<Integer>> sidsByItem = findSequencesContainingItems(database);

    // Build the initial pseudo-database from copies of the sequences without their infrequent
    // items, so that the search never considers those items again.
    List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>();
    for (Sequence sequence : database.getSequences()) {
      Sequence pruned = sequence.cloneSequenceMinusItems(sidsByItem, minsuppAbsolute);
      if (pruned.size() == 0) {
        // nothing is left of this sequence
        continue;
      }
      initialContext.add(new PseudoSequence(pruned, 0, 0));
    }

    // Every frequent item is a pattern of size 1 and the seed of a recursive search.
    for (Entry<String, Set<Integer>> entry : sidsByItem.entrySet()) {
      Set<Integer> sids = entry.getValue();
      if (sids.size() < minsuppAbsolute) {
        // support below minsup: the item is not frequent
        continue;
      }
      String item = entry.getKey();

      // project the initial context on the item
      List<PseudoSequence> projected = buildProjectedContext(item, initialContext, false);

      // the prefix made of this single item, with the sequences supporting it
      SequentialPattern prefix = new SequentialPattern(0);
      prefix.addItemset(new Itemset(item));
      prefix.setSequencesID(sids);

      // the prefix itself is a frequent sequential pattern: save it, then try to grow it
      savePattern(prefix);
      recursion(prefix, projected);
    }
  }
コード例 #5
0
  /**
   * Searches for items c that can extend the right side of a rule I --> J, giving candidate rules
   * of the form I --> J U {c}, and processes every candidate whose support reaches the minimum
   * support.
   *
   * <p>For each sequence listed in {@code tidsIJ}, a window of {@code windowSize} itemsets is
   * moved along the sequence. Whenever all items of I and of J are recorded inside the window,
   * the items of a range of itemsets of that window are collected as candidates, except those
   * rejected by {@code containsLEX(itemsetI, c)} or {@code containsLEXPlus(itemsetJ, c)}
   * (according to the original comments: c is already part of the rule, or the lexical order
   * would not be respected).
   *
   * <p>For every candidate with enough support, the rule I --> J U {c} is saved if its confidence
   * reaches {@code minconf}, and the rule is expanded further on both sides regardless of its
   * confidence.
   *
   * @param itemsetI the items of the left side I of the rule
   * @param itemsetJ the items of the right side J of the rule
   * @param tidsI sequence IDs whose count is the denominator of the confidence (presumably the
   *     sequences containing I)
   * @param tidsJ sequence IDs that are scanned to find the sequences containing J U {c} within a
   *     window (presumably the sequences containing J)
   * @param tidsIJ sequence IDs that are scanned for candidate items c (the sequences containing
   *     I --> J, according to the comment in the body)
   * @throws IOException presumably when writing a rule to the output fails, here or in the
   *     recursive calls
   */
  private void expandRight(
      String[] itemsetI,
      String[] itemsetJ,
      Set<Integer> tidsI,
      Collection<Integer> tidsJ,
      Collection<Integer> tidsIJ // the occurrence maps of I and J used to be passed as well
      ) throws IOException {

    // map-key: candidate item c   map-value: set of tids in which c was collected
    Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

    // for each sequence containing I-->J
    for (Integer tid : tidsIJ) {
      // NOTE(review): the tid is used as a list index, so this assumes that sequence IDs equal
      // positions in getSequences() - confirm.
      Sequence sequence = database.getSequences().get(tid);

      // item of I (resp. J) -> itemset index recorded for it inside the current window
      LinkedHashMap<String, Integer> mapMostRightFromI = new LinkedHashMap<String, Integer>();
      LinkedHashMap<String, Integer> mapMostRightFromJ = new LinkedHashMap<String, Integer>();
      // item of I -> all the itemset indexes at which it was seen, in the order they were seen
      LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI =
          new LinkedHashMap<String, LinkedList<Integer>>();

      // index of the last itemset already scanned for candidate items c (none yet)
      int lastItemsetScannedForC = Integer.MIN_VALUE;

      // For each itemset starting from the first...
      int k = 0;
      do {
        // the window covers the itemsets firstElementOfWindow..k
        final int firstElementOfWindow = k - windowSize + 1;
        int lastElementOfWindow = k;

        // remove items from I that fall outside the time window
        int previousISize = mapMostRightFromI.size();
        removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
        // important: if I was all there, but become smaller we need to clear the
        // hashmap for items of J.
        // NOTE(review): the test below compares with itemsetJ.length although the sentence above
        // is about I being complete (itemsetI.length) - confirm that this is intended.
        int currentISize = mapMostRightFromI.size();
        if (previousISize == itemsetJ.length && previousISize != currentISize) {
          mapMostRightFromJ.clear();
        }

        // remove items from J that fall outside the time window
        removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);

        // For each item of the current itemset
        for (String item : sequence.get(k)) {
          // an item of J is recorded only once every item of I is in the window;
          // otherwise an item of I is recorded, and position k is also appended to its list
          if (mapMostRightFromI.size() == itemsetI.length && contains(itemsetJ, item)) {
            addToLinked(mapMostRightFromJ, item, k);
          } else if (contains(itemsetI, item)) {
            addToLinked(mapMostRightFromI, item, k);
            LinkedList<Integer> list = mapMostLeftFromI.get(item);
            if (list == null) {
              list = new LinkedList<Integer>();
              addToLinked(mapMostLeftFromI, item, list);
            }
            list.add(k);
          }
        }

        // if all the items of IJ are in the current window
        if (mapMostRightFromI.size() == itemsetI.length
            && mapMostRightFromJ.size() == itemsetJ.length) {

          // Drop from the position lists of I the entries that fall outside the time window and
          // derive "minimum", the index from which the scan for items c starts.
          // NOTE(review): "minimum" starts at 1 and is only ever raised (to last + 1), so it is
          // not a minimum in the usual sense; also, getLast() throws if removeLast() has emptied
          // the list - confirm that neither is a problem.
          int minimum = 1;
          for (LinkedList<Integer> list : mapMostLeftFromI.values()) {
            while (true) {
              Integer last = list.getLast();
              if (last < firstElementOfWindow) {
                list.removeLast();
              } else {
                if (last > minimum) {
                  minimum = last + 1;
                }
                break;
              }
            }
          }

          // Scan for items c to extend the rule: start at "minimum", or just after the last
          // itemset already scanned when "minimum" lies before it, and go up to the end of the
          // window.
          int itemsetC = minimum;
          if (itemsetC < lastItemsetScannedForC) {
            itemsetC = lastItemsetScannedForC + 1;
          }

          for (; itemsetC <= lastElementOfWindow; itemsetC++) {
            for (String itemC : sequence.get(itemsetC)) {
              // skip c if the lexical order is not respected or c is already included in the
              // rule (as stated by the original comment; the helpers are defined elsewhere)
              if (containsLEX(itemsetI, itemC) || containsLEXPlus(itemsetJ, itemC)) {
                continue;
              }
              // note that c was collected in this sequence
              Set<Integer> tidsItemC = frequentItemsC.get(itemC);
              if (tidsItemC == null) {
                tidsItemC = new HashSet<Integer>();
                frequentItemsC.put(itemC, tidsItemC);
              }
              tidsItemC.add(tid);
            }
          }
          lastItemsetScannedForC = lastElementOfWindow;
        }
        k++;
        // stop at the end of the sequence, or once the last itemset has been scanned for c
      } while (k < sequence.size() && lastItemsetScannedForC < sequence.size() - 1);
    }

    // ---------------------------------------------------------------------
    // for each item c found, we create a rule
    for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
      Set<Integer> tidsI_JC = entry.getValue();

      // if the support is enough      Sup(R)  =  sup(I --> JC)
      if (tidsI_JC.size() >= minsuppRelative) {
        String itemC = entry.getKey();
        // JC = J with c appended
        String[] itemsetJC = new String[itemsetJ.length + 1];
        System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
        itemsetJC[itemsetJ.length] = itemC;

        // ---- CALCULATE ALL THE TIDS CONTAINING JC WITHIN A TIME WINDOW ---
        Set<Integer> tidsJC = new HashSet<Integer>();
        loop1:
        for (Integer tid : tidsJ) {
          Sequence sequence = database.getSequences().get(tid);
          // MAP: item : itemset index
          LinkedHashMap<String, Integer> mapAlreadySeenFromJC =
              new LinkedHashMap<String, Integer>();

          // For each itemset
          for (int k = 0; k < sequence.size(); k++) {
            // For each item
            for (String item : sequence.get(k)) {
              if (contains(itemsetJC, item)) { // record the last position of each item in JC
                addToLinked(mapAlreadySeenFromJC, item, k);
              }
            }
            // Remove items that fall outside the time window. The scan stops at the first entry
            // still inside the window, which presumably relies on addToLinked keeping the
            // entries ordered from the oldest position to the newest - confirm.
            Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
            while (iter.hasNext()) {
              Entry<String, Integer> entryMap = iter.next();
              if (entryMap.getValue() < k - windowSize + 1) {
                iter.remove();
              } else {
                break;
              }
            }
            // if all the items of JC are inside the current window, then record the tid
            // and move on to the next sequence
            if (mapAlreadySeenFromJC.keySet().size() == itemsetJC.length) {
              tidsJC.add(tid);
              continue loop1;
            }
          }
        }
        // ---- END OF THE CALCULATION OF tidsJC ----

        // Calculate the confidence of the rule:  Conf(r) = sup(I-->JC) /  sup(I)
        double confI_JC = ((double) tidsI_JC.size()) / tidsI.size();

        // if the confidence is enough
        if (confI_JC >= minconf) {
          saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);
        }

        // recursive call to expand the right side of the rule further (done whether or not the
        // rule was saved)
        expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC);

        // call to expand the left side of the rule
        expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC);
      }
    }
    // presumably takes a memory measurement for the statistics - see MemoryLogger
    MemoryLogger.getInstance().checkMemory();
  }
コード例 #6
0
  /**
   * Runs the algorithm with a minimum support given as a number of sequences.
   *
   * <p>The database is loaded from {@code input} unless one is already set. Infrequent items are
   * removed, every pair of frequent items is tried as a rule with one item on each side (in both
   * directions), and every such rule with enough support is expanded through {@code expandLeft}
   * and {@code expandRight}.
   *
   * <p>When the method exits, normally or with an exception, the output writer has been closed
   * and the database reference has been cleared, so that a later run always reloads its input.
   *
   * @param input the input file path
   * @param output the output file path
   * @param relativeMinSupport minimum support as a number of sequences (0 is treated as 1)
   * @param minConfidence minimum confidence (a value between 0 and 1).
   * @param windowSize a window size (the field used by the expansion steps is set to this value
   *     plus one)
   * @throws IOException if the input file cannot be loaded, or if there is an error
   *     reading/writing files
   */
  public void runAlgorithm(
      String input, String output, int relativeMinSupport, double minConfidence, int windowSize)
      throws IOException {
    this.minconf = minConfidence;

    // read the database into memory, unless it has already been loaded
    if (database == null) {
      try {
        this.database = new SequenceDatabase();
        database.loadFile(input);
      } catch (Exception e) {
        // A failed load must not be ignored: mining a missing or half-loaded database would
        // silently produce a wrong result. Forget the database and report the failure.
        this.database = null;
        if (e instanceof IOException) {
          throw (IOException) e;
        }
        throw new IOException("Unable to load the sequence database from " + input, e);
      }
    }

    // IMPORTANT : THIS IS A FIX SO THAT THE DEFINITION IS THE SAME AS IN THE ARTICLE!!
    this.windowSize = windowSize + 1;

    // if minsup is 0, set it to 1
    this.minsuppRelative = relativeMinSupport;
    if (this.minsuppRelative == 0) { // protection
      this.minsuppRelative = 1;
    }

    // reset the stats for memory usage
    MemoryLogger.getInstance().reset();

    try {
      // prepare the object for writing the output file
      writer = new BufferedWriter(new FileWriter(output));
      try {
        // save the start time
        timeStart = System.currentTimeMillis(); // for stats

        // remove infrequent items from the database (this also fills mapItemCount)
        removeItemsThatAreNotFrequent(database);

        // note frequent items in a list "listFrequents"
        List<String> listFrequents = new ArrayList<String>();
        for (Entry<String, Map<Integer, Occurence>> entry : mapItemCount.entrySet()) {
          // an item is frequent if it appears in at least minsup sequences
          if (entry.getValue().size() >= minsuppRelative) {
            listFrequents.add(entry.getKey());
          }
        }

        // FOR EACH FREQUENT ITEM WE COMPARE WITH EACH OTHER FREQUENT ITEM TO
        // TRY TO GENERATE A RULE 1-1.
        for (int i = 0; i < listFrequents.size(); i++) {
          String intI = listFrequents.get(i);
          Map<Integer, Occurence> occurencesI = mapItemCount.get(intI);
          for (int j = i + 1; j < listFrequents.size(); j++) {
            String intJ = listFrequents.get(j);
            Map<Integer, Occurence> occurencesJ = mapItemCount.get(intJ);

            // (1) Calculate tidsI, tidsI-->J and tidsJ-->I
            Set<Integer> tidsI = new HashSet<Integer>();
            Set<Integer> tidsIJ = new HashSet<Integer>();
            Set<Integer> tidsJI = new HashSet<Integer>();

            // for each sequence in which I occurs
            for (Occurence occI : occurencesI.values()) {
              tidsI.add(occI.sequenceID);

              // if J does not appear in that sequence, go to the next sequence
              Occurence occJ = occurencesJ.get(occI.sequenceID);
              if (occJ == null) {
                continue;
              }

              // compare every position of I with every position of J to find out
              // if I appears before J and/or J appears before I within the window
              boolean addedIJ = false;
              boolean addedJI = false;
              loopIJ:
              for (Short posI : occI.occurences) {
                for (Short posJ : occJ.occurences) {
                  // "windowSize" is the method parameter here (it shadows the field, which
                  // holds windowSize + 1)
                  if (!posI.equals(posJ) && Math.abs(posI - posJ) <= windowSize) {
                    if (posI <= posJ) {
                      // I is before J
                      tidsIJ.add(occI.sequenceID);
                      addedIJ = true;
                    } else {
                      // J is before I
                      tidsJI.add(occI.sequenceID);
                      addedJI = true;
                    }
                    // both orders have been found in this sequence: nothing more to learn
                    if (addedIJ && addedJI) {
                      break loopIJ;
                    }
                  }
                }
              }
            }

            // (2) if neither direction has enough support, there is no rule to generate
            if (tidsIJ.size() < minsuppRelative && tidsJI.size() < minsuppRelative) {
              continue;
            }

            // Calculate tidsJ once; it is needed by both directions.
            Set<Integer> tidsJ = new HashSet<Integer>();
            for (Occurence occJ : occurencesJ.values()) {
              tidsJ.add(occJ.sequenceID);
            }

            // rule I ==> J
            if (tidsIJ.size() >= minsuppRelative) {
              // calculate the confidence of I ==> J
              double confIJ = ((double) tidsIJ.size()) / occurencesI.size();

              // create the itemsets of the rule I ==> J
              String[] itemset1 = new String[] {intI};
              String[] itemset2 = new String[] {intJ};

              // if the confidence is high enough, save the rule
              if (confIJ >= minConfidence) {
                saveRule(tidsIJ, confIJ, itemset1, itemset2);
              }

              // try to expand the rule (done whether or not the rule was saved)
              expandLeft(itemset1, itemset2, tidsI, tidsIJ);
              expandRight(itemset1, itemset2, tidsI, tidsJ, tidsIJ);
            }

            // rule J ==> I
            if (tidsJI.size() >= minsuppRelative) {
              // calculate the confidence of J ==> I
              double confJI = ((double) tidsJI.size()) / occurencesJ.size();

              // create the itemsets of the rule J ==> I
              String[] itemset1 = new String[] {intI};
              String[] itemset2 = new String[] {intJ};

              // if the rule has enough confidence, save it
              if (confJI >= minConfidence) {
                saveRule(tidsJI, confJI, itemset2, itemset1);
              }

              // try to expand the rule (done whether or not the rule was saved)
              expandRight(itemset2, itemset1, tidsJ, tidsI, tidsJI);
              expandLeft(itemset2, itemset1, tidsJ, tidsJI);
            }
          }
        }

        // save the end time for the execution of the algorithm
        timeEnd = System.currentTimeMillis(); // for stats
      } finally {
        // close the file, even if the mining failed, so that the file handle is not leaked
        writer.close();
      }
    } finally {
      // always forget the database so that a later run never reuses stale data
      database = null;
    }
  }