/** * This method calculate the frequency of each item in one database pass. Then it remove all items * that are not frequent. * * @param database : a sequence database * @return A map such that key = item value = a map where a key = tid and a value = Occurence This * map allows knowing the frequency of each item and their first and last occurence in each * sequence. */ private Map<String, Map<Integer, Occurence>> removeItemsThatAreNotFrequent( SequenceDatabase database) { // (1) Count the support of each item in the database in one database pass mapItemCount = new HashMap<String, Map<Integer, Occurence>>(); // <item, Map<tid, occurence>> // for each sequence for (Sequence sequence : database.getSequences()) { // for each itemset for (short j = 0; j < sequence.getItemsets().size(); j++) { List<String> itemset = sequence.get(j); // for each item for (int i = 0; i < itemset.size(); i++) { String itemI = itemset.get(i); Map<Integer, Occurence> occurences = mapItemCount.get(itemI); if (occurences == null) { occurences = new HashMap<Integer, Occurence>(); mapItemCount.put(itemI, occurences); } Occurence occurence = occurences.get(sequence.getId()); if (occurence == null) { occurence = new Occurence(sequence.getId()); occurences.put(sequence.getId(), occurence); } occurence.add(j); } } } // System.out.println("NUMBER OF DIFFERENT ITEMS : " + mapItemCount.size()); // (2) remove all items that are not frequent from the database for (Sequence sequence : database.getSequences()) { int i = 0; while (i < sequence.getItemsets().size()) { List<String> itemset = sequence.getItemsets().get(i); int j = 0; while (j < itemset.size()) { double count = mapItemCount.get(itemset.get(j)).size(); if (count < minsuppRelative) { itemset.remove(j); } else { j++; } } i++; } } return mapItemCount; }
/** * For each item, calculate the sequence id of sequences containing that item * * @param database the current sequence database * @return Map of items to sequence IDs that contains each item */ private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) { // We use a map to store the sequence IDs where an item appear // Key : item Value : a set of sequence IDs Map<String, Set<Integer>> mapSequenceID = new HashMap< String, Set<Integer>>(); // pour conserver les ID des séquences: <Id Item, Set d'id de // séquences> // for each sequence in the current database for (Sequence sequence : contexte.getSequences()) { // for each itemset in this sequence for (List<String> itemset : sequence.getItemsets()) { // for each item for (String item : itemset) { // get the set of sequence IDs for this item until now Set<Integer> sequenceIDs = mapSequenceID.get(item); if (sequenceIDs == null) { // if the set does not exist, create one sequenceIDs = new HashSet<Integer>(); mapSequenceID.put(item, sequenceIDs); } // add the sequence ID of the current sequence to the // set of sequences IDs of this item sequenceIDs.add(sequence.getId()); // } } } } return mapSequenceID; }
/** * This is the main method for the PrefixSpan algorithm that is called to start the algorithm * * @param outputFilePath an output file path if the result should be saved to a file or null if * the result should be saved to memory. * @param database a sequence database * @throws IOException exception if an error while writing the output file */ private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException { // if the user want to keep the result into memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // We have to scan the database to find all frequent patterns of size 1. // We note the sequences in which these patterns appear. Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE ITON A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. 
(OPTIMIZATION : OCTOBER-08 ) // Create a list of pseudosequence List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>(); // for each sequence in the database for (Sequence sequence : database.getSequences()) { // remove infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if the size is > 0, create a pseudo sequence with this sequence initialContext.add(new PseudoSequence(optimizedSequence, 0, 0)); } } // For each item for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent (has a support >= minsup) if (entry.getValue().size() >= minsuppAbsolute) { // if the item is frequent // build the projected context String item = entry.getKey(); List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false); // Create the prefix for the projected context. SequentialPattern prefix = new SequentialPattern(0); prefix.addItemset(new Itemset(item)); prefix.setSequencesID(entry.getValue()); // The prefix is a frequent sequential pattern. // We save it in the result. savePattern(prefix); // we found a sequence. // Recursive call ! recursion(prefix, projectedContext); } } }
/**
 * Searches for items c that can extend the right side (consequent) of a rule I --> J, producing
 * rules of the form I --> J U {c}, and recursively expands every extension whose support is
 * high enough.
 *
 * <p>According to the original description, the method makes sure that:
 *
 * <ul>
 *   <li>c is not already included in I or J;
 *   <li>c appears in at least minsup sequences of {@code tidsIJ}, after the first occurrence of
 *       I;
 *   <li>c is lexically bigger than all items in J.
 * </ul>
 *
 * The inclusion / lexical-order filtering is delegated to {@code containsLEX} and {@code
 * containsLEXPlus}, which are defined elsewhere.
 *
 * <p>For every retained rule I --> J U {c}: the rule is saved through {@code saveRule} when its
 * confidence reaches {@code minconf}, and both {@code expandRight} and {@code expandLeft} are
 * called on it regardless of its confidence.
 *
 * @param itemsetI the items of the antecedent I
 * @param itemsetJ the items of the consequent J
 * @param tidsI ids of the sequences counted as the support of I; its size is the denominator of
 *     the confidence
 * @param tidsJ ids of the sequences scanned to find those containing J U {c} within a window
 * @param tidsIJ ids of the sequences containing the rule I --> J
 * @throws IOException propagated from {@code saveRule} or from the recursive expansions
 */
private void expandRight(
    String[] itemsetI,
    String[] itemsetJ,
    Set<Integer> tidsI,
    Collection<Integer> tidsJ,
    Collection<Integer> tidsIJ)
    throws IOException {
  // map-key: candidate item c   map-value: ids of the sequences in which c was collected
  Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

  // for each sequence containing I-->J
  for (Integer tid : tidsIJ) {
    // NOTE(review): tid is used as an index into the sequence list, so sequence ids are
    // presumably equal to their position in the database — confirm.
    Sequence sequence = database.getSequences().get(tid);

    // item of I -> itemset index k passed to addToLinked when the item was recorded
    LinkedHashMap<String, Integer> mapMostRightFromI = new LinkedHashMap<String, Integer>();
    // item of J -> itemset index k; only filled once every item of I is in mapMostRightFromI
    LinkedHashMap<String, Integer> mapMostRightFromJ = new LinkedHashMap<String, Integer>();
    // item of I -> list of every itemset index at which the item was recorded (ascending)
    LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI =
        new LinkedHashMap<String, LinkedList<Integer>>();

    // Index of the last itemset already scanned for candidate items c (none yet).
    int lastItemsetScannedForC = Integer.MIN_VALUE;

    // For each itemset starting from the first...
    int k = 0;
    do {
      // The window covers the windowSize itemsets ending at k.
      final int firstElementOfWindow = k - windowSize + 1;
      int lastElementOfWindow = k;

      // remove items from I that fall outside the time window
      int previousISize = mapMostRightFromI.size();
      removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
      // important: if I was all there, but become smaller we need to clear the
      // hashmap for items of J.
      // NOTE(review): the test below compares previousISize with itemsetJ.length although the
      // comment above talks about I being complete (itemsetI.length) — confirm which one is
      // intended.
      int currentISize = mapMostRightFromI.size();
      if (previousISize == itemsetJ.length && previousISize != currentISize) {
        mapMostRightFromJ.clear();
      }

      // remove items from J that fall outside the time window
      removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);

      // For each item of the current itemset
      for (String item : sequence.get(k)) {
        // record the current position k of each item of I or J; an item of J is only recorded
        // when all the items of I are currently recorded
        if (mapMostRightFromI.size() == itemsetI.length && contains(itemsetJ, item)) {
          addToLinked(mapMostRightFromJ, item, k);
        } else if (contains(itemsetI, item)) {
          addToLinked(mapMostRightFromI, item, k);
          // also append k to the list of positions of this item of I
          LinkedList<Integer> list = mapMostLeftFromI.get(item);
          if (list == null) {
            list = new LinkedList<Integer>();
            addToLinked(mapMostLeftFromI, item, list);
          }
          list.add(k);
        }
      }

      // if all the items of IJ are in the current window
      if (mapMostRightFromI.size() == itemsetI.length
          && mapMostRightFromJ.size() == itemsetJ.length) {

        // Drop from the end of each position list of I the entries that fall outside the time
        // window and, at the same time, compute "minimum", the index of the first itemset to
        // scan for items c.
        // NOTE(review): minimum starts at 1 and is set to last + 1 whenever a list's last
        // entry exceeds it, so it ends just after the largest last position seen (or stays 1);
        // the original comment called this "the minimum index for items of I" — confirm the
        // intent. The loop also assumes no list becomes empty (getLast() would throw).
        int minimum = 1;
        for (LinkedList<Integer> list : mapMostLeftFromI.values()) {
          while (true) {
            Integer last = list.getLast();
            if (last < firstElementOfWindow) {
              list.removeLast();
            } else {
              if (last > minimum) {
                minimum = last + 1;
              }
              break;
            }
          }
        }

        // we need to scan for items C to extend the rule...
        // The scan goes from itemset "minimum" up to the end of the window,
        // and if an itemset was scanned before, it should not be scanned again.
        int itemsetC = minimum;
        if (itemsetC < lastItemsetScannedForC) {
          itemsetC = lastItemsetScannedForC + 1;
        }

        for (; itemsetC <= lastElementOfWindow; itemsetC++) {
          for (String itemC : sequence.get(itemsetC)) {
            // if lexical order is not respected or c is included in the rule
            // already.
            if (containsLEX(itemsetI, itemC) || containsLEXPlus(itemsetJ, itemC)) {
              continue;
            }
            // remember that c is a valid extension in this sequence
            Set<Integer> tidsItemC = frequentItemsC.get(itemC);
            if (tidsItemC == null) {
              tidsItemC = new HashSet<Integer>();
              frequentItemsC.put(itemC, tidsItemC);
            }
            tidsItemC.add(tid);
          }
        }
        lastItemsetScannedForC = lastElementOfWindow;
      }
      k++;
      // stop at the end of the sequence, or as soon as its last itemset has been scanned for c
    } while (k < sequence.size() && lastItemsetScannedForC < sequence.size() - 1);
  }

  ////////////////////////////////////////////////////////////////////////
  // for each item c found, we create a rule
  for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
    Set<Integer> tidsI_JC = entry.getValue();

    // if the support is enough: Sup(R) = sup(I --> JC)
    if (tidsI_JC.size() >= minsuppRelative) {
      String itemC = entry.getKey();
      // JC = J with c appended as last element
      String[] itemsetJC = new String[itemsetJ.length + 1];
      System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
      itemsetJC[itemsetJ.length] = itemC;

      // ---- CALCULATE ALL THE TIDS CONTAINING JC WITHIN A TIME WINDOW ---
      Set<Integer> tidsJC = new HashSet<Integer>();
      loop1:
      for (Integer tid : tidsJ) {
        Sequence sequence = database.getSequences().get(tid);
        // MAP: item : itemset index
        LinkedHashMap<String, Integer> mapAlreadySeenFromJC =
            new LinkedHashMap<String, Integer>();

        // For each itemset
        for (int k = 0; k < sequence.size(); k++) {
          // For each item
          for (String item : sequence.get(k)) {
            if (contains(itemsetJC, item)) {
              // record the last position of each item in JC
              addToLinked(mapAlreadySeenFromJC, item, k);
            }
          }
          // remove items that fall outside the time window; the scan stops at the first entry
          // still inside the window, which presumably relies on addToLinked keeping entries
          // ordered by position — confirm against addToLinked
          Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
          while (iter.hasNext()) {
            Entry<String, Integer> entryMap = iter.next();
            if (entryMap.getValue() < k - windowSize + 1) {
              iter.remove();
            } else {
              break;
            }
          }
          // if all the items of JC are inside the current window, then record the tid
          // and move on to the next sequence
          if (mapAlreadySeenFromJC.keySet().size() == itemsetJC.length) {
            tidsJC.add(tid);
            continue loop1;
          }
        }
      }
      // ----  ----

      // Create rule and calculate its confidence: Conf(r) = sup(I-->JC) / sup(I)
      double confI_JC = ((double) tidsI_JC.size()) / tidsI.size();

      // if the confidence is enough, the rule is saved
      if (confI_JC >= minconf) {
        saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);
      }

      // recursive calls to expand the right and the left side of the rule; they are made even
      // when the confidence is below minconf
      expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC);
      expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC);
    }
  }
  MemoryLogger.getInstance().checkMemory();
}