/**
 * Run the algorithm.
 *
 * @param minSupport minsup as a percentage (e.g. 0.05 = 5 %)
 * @param minConfidence minimum confidence (a value between 0 and 1).
 * @param input the input file path
 * @param output the output file path
 * @param windowSize a window size
 * @throws IOException exception if there is an error reading/writing files
 */
public void runAlgorithm(
    double minSupport, double minConfidence, String input, String output, int windowSize)
    throws IOException {
  // load the input file into memory
  try {
    this.database = new SequenceDatabase();
    database.loadFile(input);
  } catch (Exception e) {
    e.printStackTrace();
  }
  // convert the minimum support to a relative minimum support (an integer)
  this.minsuppRelative = (int) Math.ceil(minSupport * database.size());
  // run the algorithm with the relative minimum support
  runAlgorithm(input, output, minsuppRelative, minConfidence, windowSize);
}
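// A minimal usage sketch (hedged: the enclosing class name "TRuleGrowth" and the
// file names below are assumptions for illustration, not taken from this file):
//
//   TRuleGrowth algo = new TRuleGrowth();
//   // mine rules with minsup = 5 %, minconf = 0.75 and a window size of 3
//   algo.runAlgorithm(0.05, 0.75, "input.txt", "output.txt", 3);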
/**
 * This method calculates the frequency of each item in one database pass, then removes all items
 * that are not frequent.
 *
 * @param database a sequence database
 * @return A map such that key = item and value = a map where key = tid and value = Occurence.
 *     This map allows knowing the frequency of each item and its first and last occurrence in
 *     each sequence.
 */
private Map<String, Map<Integer, Occurence>> removeItemsThatAreNotFrequent(
    SequenceDatabase database) {
  // (1) Count the support of each item in the database in one database pass
  mapItemCount = new HashMap<String, Map<Integer, Occurence>>(); // <item, Map<tid, occurence>>

  // for each sequence
  for (Sequence sequence : database.getSequences()) {
    // for each itemset
    for (short j = 0; j < sequence.getItemsets().size(); j++) {
      List<String> itemset = sequence.get(j);
      // for each item
      for (int i = 0; i < itemset.size(); i++) {
        String itemI = itemset.get(i);
        // get the map of occurrences of this item; create it if it does not exist yet
        Map<Integer, Occurence> occurences = mapItemCount.get(itemI);
        if (occurences == null) {
          occurences = new HashMap<Integer, Occurence>();
          mapItemCount.put(itemI, occurences);
        }
        // record that this item occurs in itemset j of this sequence
        Occurence occurence = occurences.get(sequence.getId());
        if (occurence == null) {
          occurence = new Occurence(sequence.getId());
          occurences.put(sequence.getId(), occurence);
        }
        occurence.add(j);
      }
    }
  }
  // System.out.println("NUMBER OF DIFFERENT ITEMS : " + mapItemCount.size());

  // (2) remove all items that are not frequent from the database
  for (Sequence sequence : database.getSequences()) {
    int i = 0;
    while (i < sequence.getItemsets().size()) {
      List<String> itemset = sequence.getItemsets().get(i);
      int j = 0;
      while (j < itemset.size()) {
        // the support of an item is the number of sequences containing it
        int count = mapItemCount.get(itemset.get(j)).size();
        if (count < minsuppRelative) {
          // the item is infrequent: remove it
          itemset.remove(j);
        } else {
          j++;
        }
      }
      i++;
    }
  }
  return mapItemCount;
}
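// Illustration of the structure built above (a sketch, assuming minsuppRelative = 2):
// for the database
//   tid 0 : {a}{b}{a}
//   tid 1 : {b}{c}
// mapItemCount would contain
//   "a" -> { 0 -> Occurence(itemsets 0, 2) }                         support = 1
//   "b" -> { 0 -> Occurence(itemset 1), 1 -> Occurence(itemset 0) }  support = 2
//   "c" -> { 1 -> Occurence(itemset 1) }                             support = 1
// so step (2) would remove "a" and "c" from the sequences, keeping only "b".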
/**
 * For each item, calculate the IDs of the sequences containing that item.
 *
 * @param contexte the current sequence database
 * @return a map of items to the set of IDs of the sequences that contain each item
 */
private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) {
  // We use a map to store the sequence IDs where an item appears.
  // Key: item  Value: a set of sequence IDs
  Map<String, Set<Integer>> mapSequenceID =
      new HashMap<String, Set<Integer>>(); // to keep the sequence IDs: <item, set of sequence IDs>
  // for each sequence in the current database
  for (Sequence sequence : contexte.getSequences()) {
    // for each itemset in this sequence
    for (List<String> itemset : sequence.getItemsets()) {
      // for each item
      for (String item : itemset) {
        // get the set of sequence IDs for this item until now
        Set<Integer> sequenceIDs = mapSequenceID.get(item);
        if (sequenceIDs == null) {
          // if the set does not exist, create one
          sequenceIDs = new HashSet<Integer>();
          mapSequenceID.put(item, sequenceIDs);
        }
        // add the ID of the current sequence to the set of sequence IDs of this item
        sequenceIDs.add(sequence.getId());
      }
    }
  }
  return mapSequenceID;
}
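// Illustration (a sketch): for the sequences tid 0 = {a}{b} and tid 1 = {b}{c},
// this method returns the inverted index
//   "a" -> {0},  "b" -> {0, 1},  "c" -> {1}
// so the support of an item is the size of its set of sequence IDs.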
/**
 * This is the main method of the PrefixSpan algorithm; it is called to start the algorithm.
 *
 * @param database a sequence database
 * @param outputFilePath an output file path if the result should be saved to a file, or null if
 *     the result should be kept in memory.
 * @throws IOException exception if an error occurs while writing the output file
 */
private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException {
  // if the user wants to keep the result in memory
  if (outputFilePath == null) {
    writer = null;
    patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
  } else { // if the user wants to save the result to a file
    patterns = null;
    writer = new BufferedWriter(new FileWriter(outputFilePath));
  }

  // We have to scan the database to find all frequent patterns of size 1.
  // We note the sequences in which these patterns appear.
  Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);

  // WE CONVERT THE DATABASE INTO A PSEUDO-DATABASE, AND REMOVE
  // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM
  // WILL NOT CONSIDER THEM ANYMORE. (OPTIMIZATION : OCTOBER-08 )

  // Create a list of pseudo-sequences
  List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>();
  // for each sequence in the database
  for (Sequence sequence : database.getSequences()) {
    // remove infrequent items
    Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
    if (optimizedSequence.size() != 0) {
      // if the size is > 0, create a pseudo-sequence with this sequence
      initialContext.add(new PseudoSequence(optimizedSequence, 0, 0));
    }
  }

  // For each item
  for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) {
    // if the item is frequent (has a support >= minsup)
    if (entry.getValue().size() >= minsuppAbsolute) {
      // build the projected context
      String item = entry.getKey();
      List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false);

      // Create the prefix for the projected context.
      SequentialPattern prefix = new SequentialPattern(0);
      prefix.addItemset(new Itemset(item));
      prefix.setSequencesID(entry.getValue());

      // The prefix is a frequent sequential pattern.
      // We save it in the result.
      savePattern(prefix);

      // Recursive call!
      recursion(prefix, projectedContext);
    }
  }
}
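// Note on the pseudo-projection used above (a sketch of the idea; the exact
// fields of PseudoSequence are an assumption): a PseudoSequence does not copy
// the sequence, it only points into it. For example, projecting {a}{b}{c} by
// the prefix <{a}> could be represented as (sequence, firstItemset = 1,
// firstItem = 0), which is the kind of object buildProjectedContext is
// expected to produce.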
/**
 * This method searches for items to expand the right side of a rule I --> J with an item c.
 * This results in rules of the form I --> J U {c}. The method makes sure that:
 * - c is not already included in I or J
 * - c appears at least minsup times in tidsIJ after the first occurrence of I
 * - c is lexically larger than all items in J
 *
 * @param itemsetI the left side of the rule
 * @param itemsetJ the right side of the rule
 * @param tidsI the set of IDs of sequences containing I
 * @param tidsJ the set of IDs of sequences containing J
 * @param tidsIJ the set of IDs of sequences containing the rule I --> J
 * @throws IOException exception if there is an error writing the output file
 */
private void expandRight(
    String[] itemsetI,
    String[] itemsetJ,
    Set<Integer> tidsI,
    Collection<Integer> tidsJ,
    Collection<Integer> tidsIJ)
    throws IOException {

  // map-key: item  map-value: set of tids containing the item
  Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

  // for each sequence containing I-->J
  for (Integer tid : tidsIJ) {
    Sequence sequence = database.getSequences().get(tid);

    LinkedHashMap<String, Integer> mapMostRightFromI = new LinkedHashMap<String, Integer>();
    LinkedHashMap<String, Integer> mapMostRightFromJ = new LinkedHashMap<String, Integer>();
    LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI =
        new LinkedHashMap<String, LinkedList<Integer>>();

    int lastItemsetScannedForC = Integer.MIN_VALUE;

    // For each itemset starting from the first...
    int k = 0;
    do {
      final int firstElementOfWindow = k - windowSize + 1;
      int lastElementOfWindow = k;

      // remove items from I that fall outside the time window
      int previousISize = mapMostRightFromI.size();
      removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
      // important: if all of I was in the window but the map became smaller,
      // we need to clear the hashmap for the items of J.
      int currentISize = mapMostRightFromI.size();
      if (previousISize == itemsetI.length && previousISize != currentISize) {
        mapMostRightFromJ.clear();
      }

      // remove items from J that fall outside the time window
      removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);

      // For each item of the current itemset
      for (String item : sequence.get(k)) {
        // record the most recent position seen so far of each item of I or J
        if (mapMostRightFromI.size() == itemsetI.length && contains(itemsetJ, item)) {
          addToLinked(mapMostRightFromJ, item, k);
        } else if (contains(itemsetI, item)) {
          addToLinked(mapMostRightFromI, item, k);
          LinkedList<Integer> list = mapMostLeftFromI.get(item);
          if (list == null) {
            list = new LinkedList<Integer>();
            addToLinked(mapMostLeftFromI, item, list);
          }
          list.add(k);
        }
      }

      // if all the items of I and J are in the current window
      if (mapMostRightFromI.size() == itemsetI.length
          && mapMostRightFromJ.size() == itemsetJ.length) {

        // remove the items from mostLeft that fall outside the time window;
        // at the same time, calculate the minimum index for the items of I.
        int minimum = 1;
        for (LinkedList<Integer> list : mapMostLeftFromI.values()) {
          while (true) {
            Integer last = list.getLast();
            if (last < firstElementOfWindow) {
              list.removeLast();
            } else {
              if (last > minimum) {
                minimum = last + 1;
              }
              break;
            }
          }
        }

        // We need to scan for items c to extend the rule. Such an item c has
        // to appear in the window after the occurrence of I (at or after
        // "minimum"), and an itemset that was scanned before should not be
        // scanned again.
        int itemsetC = minimum;
        if (itemsetC < lastItemsetScannedForC) {
          itemsetC = lastItemsetScannedForC + 1;
        }

        for (; itemsetC <= lastElementOfWindow; itemsetC++) {
          for (String itemC : sequence.get(itemsetC)) {
            // skip c if the lexical order is not respected or if c is already
            // included in the rule
            if (containsLEX(itemsetI, itemC) || containsLEXPlus(itemsetJ, itemC)) {
              continue;
            }
            Set<Integer> tidsItemC = frequentItemsC.get(itemC);
            if (tidsItemC == null) {
              tidsItemC = new HashSet<Integer>();
              frequentItemsC.put(itemC, tidsItemC);
            }
            tidsItemC.add(tid);
          }
        }
        lastItemsetScannedForC = lastElementOfWindow;
      }
      k++;
    } while (k < sequence.size() && lastItemsetScannedForC < sequence.size() - 1);
  }

  // for each item c found, we create a rule I --> J U {c}
  for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
    Set<Integer> tidsI_JC = entry.getValue();

    // if the support is high enough: sup(R) = sup(I --> JC)
    if (tidsI_JC.size() >= minsuppRelative) {
      String itemC = entry.getKey();
      // create the itemset JC
      String[] itemsetJC = new String[itemsetJ.length + 1];
      System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
      itemsetJC[itemsetJ.length] = itemC;

      // ---- CALCULATE ALL THE TIDS CONTAINING JC WITHIN A TIME WINDOW ----
      Set<Integer> tidsJC = new HashSet<Integer>();
      loop1:
      for (Integer tid : tidsJ) {
        Sequence sequence = database.getSequences().get(tid);
        // MAP: key: item  value: itemset index
        LinkedHashMap<String, Integer> mapAlreadySeenFromJC =
            new LinkedHashMap<String, Integer>();
        // For each itemset
        for (int k = 0; k < sequence.size(); k++) {
          // For each item
          for (String item : sequence.get(k)) {
            if (contains(itemsetJC, item)) {
              // record the last position of each item in JC
              addToLinked(mapAlreadySeenFromJC, item, k);
            }
          }
          // remove items that fall outside the time window
          Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
          while (iter.hasNext()) {
            Entry<String, Integer> entryMap = iter.next();
            if (entryMap.getValue() < k - windowSize + 1) {
              iter.remove();
            } else {
              break;
            }
          }
          // if all the items of JC are inside the current window, record the tid
          if (mapAlreadySeenFromJC.keySet().size() == itemsetJC.length) {
            tidsJC.add(tid);
            continue loop1;
          }
        }
      }

      // Create the rule and calculate its confidence: conf(r) = sup(I --> JC) / sup(I)
      double confI_JC = ((double) tidsI_JC.size()) / tidsI.size();

      // if the confidence is high enough, save the rule
      if (confI_JC >= minconf) {
        saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);
      }
      // recursive calls to expand the right and left sides of the rule
      expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC);
      expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC);
    }
  }
  // check the memory usage
  MemoryLogger.getInstance().checkMemory();
}
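// Window arithmetic used in expandRight (a sketch; recall that runAlgorithm
// internally sets windowSize to the user-supplied value + 1): at itemset index
// k, the window covers the itemset indices [k - windowSize + 1, k]. For
// example, with an adjusted windowSize of 3, at k = 5 the window covers
// itemsets 3, 4 and 5, and an occurrence recorded at itemset 2 is evicted by
// removeElementOutsideWindowER.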
/**
 * Run the algorithm.
 *
 * @param input the input file path
 * @param output the output file path
 * @param relativeMinSupport minsup as a relative value (an integer)
 * @param minConfidence minimum confidence (a value between 0 and 1).
 * @param windowSize a window size
 * @throws IOException exception if there is an error reading/writing files
 */
public void runAlgorithm(
    String input, String output, int relativeMinSupport, double minConfidence, int windowSize)
    throws IOException {
  this.minconf = minConfidence;

  // read the database into memory
  if (database == null) {
    try {
      this.database = new SequenceDatabase();
      database.loadFile(input);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  // IMPORTANT: THIS IS A FIX SO THAT THE DEFINITION IS THE SAME AS IN THE ARTICLE!
  this.windowSize = windowSize + 1;

  // if minsup is 0, set it to 1 as a protection
  this.minsuppRelative = relativeMinSupport;
  if (this.minsuppRelative == 0) {
    this.minsuppRelative = 1;
  }

  // reset the stats for memory usage
  MemoryLogger.getInstance().reset();

  // prepare the object for writing the output file
  writer = new BufferedWriter(new FileWriter(output));

  // save the start time
  timeStart = System.currentTimeMillis(); // for stats

  // remove infrequent items from the database
  removeItemsThatAreNotFrequent(database);

  // put the frequent items in a list "listFrequents"
  List<String> listFrequents = new ArrayList<String>();
  // for each item
  for (Entry<String, Map<Integer, Occurence>> entry : mapItemCount.entrySet()) {
    // if it is frequent
    if (entry.getValue().size() >= minsuppRelative) {
      // add the item to the list
      listFrequents.add(entry.getKey());
    }
  }

  // WE COMPARE EACH FREQUENT ITEM WITH EACH OTHER FREQUENT ITEM TO
  // TRY TO GENERATE A RULE 1-1.
  for (int i = 0; i < listFrequents.size(); i++) {
    String intI = listFrequents.get(i);
    Map<Integer, Occurence> occurencesI = mapItemCount.get(intI);

    for (int j = i + 1; j < listFrequents.size(); j++) {
      String intJ = listFrequents.get(j);
      Map<Integer, Occurence> occurencesJ = mapItemCount.get(intJ);

      // (1) Calculate tidsI, tidsJ, tidsI-->J and tidsJ-->I
      Set<Integer> tidsI = new HashSet<Integer>();
      Set<Integer> tidsJ = null;
      Set<Integer> tidsIJ = new HashSet<Integer>();
      Set<Integer> tidsJI = new HashSet<Integer>();

      // for each occurrence of I
      looptid:
      for (Occurence occI : occurencesI.values()) {
        // add the sequence ID to tidsI
        tidsI.add(occI.sequenceID);

        // if J does not appear in that sequence, continue the loop
        Occurence occJ = occurencesJ.get(occI.sequenceID);
        if (occJ == null) {
          continue looptid;
        }

        // otherwise, check whether I appears before J in that sequence,
        // and whether J appears before I (within the window)
        boolean addedIJ = false;
        boolean addedJI = false;
        // for each occurrence of I in that sequence
        loopIJ:
        for (Short posI : occI.occurences) {
          // for each occurrence of J in that sequence
          for (Short posJ : occJ.occurences) {
            if (!posI.equals(posJ) && Math.abs(posI - posJ) <= windowSize) {
              if (posI <= posJ) {
                // I appears before J
                tidsIJ.add(occI.sequenceID);
                addedIJ = true;
              } else {
                // J appears before I
                tidsJI.add(occI.sequenceID);
                addedJI = true;
              }
              // if we have found that I is before J and that J is before I,
              // we don't need to continue.
              if (addedIJ && addedJI) {
                break loopIJ;
              }
            }
          }
        }
      }

      // (2) check if the two itemsets have enough common tids;
      // if not, we don't need to generate a rule for them.

      // create the rule I ==> J
      if (tidsIJ.size() >= minsuppRelative) {
        // calculate the confidence of I ==> J
        double confIJ = ((double) tidsIJ.size()) / occurencesI.size();

        // create the itemsets of the rule I ==> J
        String[] itemset1 = new String[] {intI};
        String[] itemset2 = new String[] {intJ};

        // if the confidence is high enough, save the rule
        if (confIJ >= minConfidence) {
          saveRule(tidsIJ, confIJ, itemset1, itemset2);
        }

        // Calculate tidsJ.
        tidsJ = new HashSet<Integer>();
        for (Occurence occJ : occurencesJ.values()) {
          tidsJ.add(occJ.sequenceID);
        }

        // recursive calls to try to expand the rule
        expandLeft(itemset1, itemset2, tidsI, tidsIJ);
        expandRight(itemset1, itemset2, tidsI, tidsJ, tidsIJ);
      }

      // create the rule J ==> I
      if (tidsJI.size() >= minsuppRelative) {
        double confJI = ((double) tidsJI.size()) / occurencesJ.size();

        // create the itemsets for that rule
        String[] itemset1 = new String[] {intI};
        String[] itemset2 = new String[] {intJ};

        // if the rule has enough confidence, save it!
        if (confJI >= minConfidence) {
          saveRule(tidsJI, confJI, itemset2, itemset1);
        }

        // Calculate tidsJ if it was not calculated yet.
        if (tidsJ == null) {
          tidsJ = new HashSet<Integer>();
          for (Occurence occJ : occurencesJ.values()) {
            tidsJ.add(occJ.sequenceID);
          }
        }

        // recursive calls to try to expand the rule
        expandRight(itemset2, itemset1, tidsJ, tidsI, tidsJI);
        expandLeft(itemset2, itemset1, tidsJ, tidsJI);
      }
    }
  }

  // save the end time of the execution of the algorithm
  timeEnd = System.currentTimeMillis(); // for stats

  // close the output file
  writer.close();
  database = null;
}
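// Worked example for the 1-1 rule generation above (a sketch, assuming an
// adjusted windowSize of 2 and minsuppRelative = 1): for the database
//   tid 0 : {a}{b}     tid 1 : {b}{a}     tid 2 : {a}{c}
// and the pair (I = a, J = b), we get tidsI = {0, 1, 2}, tidsIJ = {0} (a before
// b in tid 0, within the window) and tidsJI = {1} (b before a in tid 1), so
// conf(a ==> b) = |tidsIJ| / |occurencesI| = 1 / 3.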