/** * Method to recursively grow a given sequential pattern. * * @param prefix the current sequential pattern that we want to try to grow * @param database the current projected sequence database * @throws IOException exception if there is an error writing to the output file */ private void recursion(SequentialPattern prefix, List<PseudoSequence> database) throws IOException { // find frequent items of size 1 in the current projected database. Set<Pair> pairs = findAllFrequentPairs(prefix, database); // For each pair found (a pair is an item with a boolean indicating if it // appears in an itemset that is cut (a postfix) or not, and the sequence IDs // where it appears in the projected database). for (Pair pair : pairs) { // if the item is frequent in the current projected database if (pair.getCount() >= minsuppAbsolute) { // create the new postfix by appending this item to the prefix SequentialPattern newPrefix; // if the item is part of a postfix if (pair.isPostfix()) { // we append it to the last itemset of the prefix newPrefix = appendItemToPrefixOfSequence(prefix, pair.getItem()); } else { // else, we append it as a new itemset to the sequence newPrefix = appendItemToSequence(prefix, pair.getItem()); } // build the projected database with this item List<PseudoSequence> projectedDB = buildProjectedContext(pair.getItem(), database, pair.isPostfix()); newPrefix.setSequencesID(pair.getSequencesID()); // save the pattern savePattern(newPrefix); // make a recursive call recursion(newPrefix, projectedDB); } } MemoryLogger.getInstance().checkMemory(); }
/** * This method creates a copy of the sequence and add a given item to the last itemset of the * sequence. It sets the support of the sequence as the support of the item. * * @param prefix the sequence * @param item the item * @return the new sequence */ private SequentialPattern appendItemToPrefixOfSequence(SequentialPattern prefix, Integer item) { SequentialPattern newPrefix = prefix.cloneSequence(); // add to the last itemset Itemset itemset = newPrefix.get(newPrefix.size() - 1); itemset.addItem(item); return newPrefix; }
/** * This method saves a sequential pattern to the output file or in memory, depending on if the * user provided an output file path or not when he launched the algorithm * * @param prefix the pattern to be saved. * @throws IOException exception if error while writing the output file. */ private void savePattern(SequentialPattern prefix) throws IOException { // increase the number of pattern found for statistics purposes patternCount++; // if the result should be saved to a file if (writer != null) { StringBuffer r = new StringBuffer(""); for (Itemset itemset : prefix.getItemsets()) { // r.append('('); for (String item : itemset.getItems()) { String string = item.toString(); r.append(string); r.append(' '); } r.append("-1 "); } // // // print the list of Pattern IDs that contains this pattern. // if(prefix.getSequencesID() != null){ // r.append("SID: "); // for(Integer id : prefix.getSequencesID()){ // r.append(id); // r.append(' '); // } // } r.append(" #SUP: "); r.append(prefix.getSequencesID().size()); writer.write(r.toString()); writer.newLine(); } // otherwise the result is kept into memory else { patterns.addSequence(prefix, prefix.size()); } }
/** * This is the main method for the PrefixSpan algorithm that is called to start the algorithm * * @param outputFilePath an output file path if the result should be saved to a file or null if * the result should be saved to memory. * @param database a sequence database * @throws IOException exception if an error while writing the output file */ private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException { // if the user want to keep the result into memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // We have to scan the database to find all frequent patterns of size 1. // We note the sequences in which these patterns appear. Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE ITON A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. 
(OPTIMIZATION : OCTOBER-08 ) // Create a list of pseudosequence List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>(); // for each sequence in the database for (Sequence sequence : database.getSequences()) { // remove infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if the size is > 0, create a pseudo sequence with this sequence initialContext.add(new PseudoSequence(optimizedSequence, 0, 0)); } } // For each item for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent (has a support >= minsup) if (entry.getValue().size() >= minsuppAbsolute) { // if the item is frequent // build the projected context String item = entry.getKey(); List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false); // Create the prefix for the projected context. SequentialPattern prefix = new SequentialPattern(0); prefix.addItemset(new Itemset(item)); prefix.setSequencesID(entry.getValue()); // The prefix is a frequent sequential pattern. // We save it in the result. savePattern(prefix); // we found a sequence. // Recursive call ! recursion(prefix, projectedContext); } } }
/** * This is the "backscan-pruning" strategy described in the BIDE+ paper to avoid extending some * prefixs that are guaranteed to not generate a closed pattern (see the BIDE+ paper for details). * * @param prefix the current prefix * @return boolean true if we should not extend the prefix */ private boolean checkBackScanPruning(SequentialPattern prefix, Set<Integer> sidset) { // // See the BIDE+ paper for details about this method. // For the number of item occurences that can be generated with this prefix: for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW int seqCount = 0; // int highestSupportUntilNow = -1; // (1) For each i, we build the list of maximum periods // for each sequence in the original database for (int sequenceID : sidset) { alreadyVisitedSID.add(sequenceID); PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthSemiMaximumPeriodOfAPrefix(prefix.getItemsets(), i); if (period != null) { // // we add it to the list of maximum periods boolean hasExtension = findAllFrequentPairsForBackwardExtensionCheck( alreadyVisitedSID.size(), prefix, period, i, mapPaires, itemI, itemIm1); if (hasExtension) { return true; } } } } return false; }
/** * This is the "backscan-pruning" strategy described in the BIDE+ paper to avoid extending some * prefixs that are guaranteed to not generate a closed pattern (see the BIDE+ paper for details). * * @param prefix the current prefix * @param projectedContext the projected database * @return boolean true if we should not extend the prefix */ private boolean checkBackScanPruning( SequentialPattern prefix, List<PseudoSequenceBIDE> projectedContext) { // DEBUGGIN if (prefix.size() == 1 && prefix.get(0).get(0) == 5) { System.out.println("PREFIX 5 "); } // // See the BIDE+ paper for details about this method. // For the number of item occurences that can be generated with this prefix: for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW // (1) For each i, we build the list of maximum periods // for each sequence in the original database for (PseudoSequenceBIDE pseudoSequence : projectedContext) { int sequenceID = pseudoSequence.sequence.getId(); alreadyVisitedSID.add(pseudoSequence.sequence.getId()); Position currentCutPosition = new Position(pseudoSequence.firstItemset, pseudoSequence.firstItem); PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthSemiMaximumPeriodOfAPrefix(prefix.getItemsets(), i, currentCutPosition); if (period != null) { // // we add it to the list of maximum periods boolean hasExtension = findAllFrequentPairsForBackwardExtensionCheck( prefix.getAbsoluteSupport(), prefix, period, i, mapPaires, itemI, itemIm1, currentCutPosition); 
if (hasExtension) { return true; } } } } return false; }
/** * This method creates a copy of the sequence and add a given item as a new itemset to the * sequence. It sets the support of the sequence as the support of the item. * * @param prefix the sequence * @param item the item * @return the new sequence */ private SequentialPattern appendItemToSequence(SequentialPattern prefix, Integer item) { SequentialPattern newPrefix = prefix.cloneSequence(); // isSuffix newPrefix.addItemset(new Itemset(item)); return newPrefix; }
/** * Method to recursively grow a given sequential pattern. * * @param prefix the current sequential pattern that we want to try to grow * @param database the current projected sequence database * @throws IOException exception if there is an error writing to the output file */ private int recursion(SequentialPattern prefix, List<PseudoSequenceBIDE> contexte) throws IOException { // find frequent items of size 1 in the current projected database. Set<PairBIDE> pairs = findAllFrequentPairs(prefix, contexte); // we will keep tract of the maximum support of patterns // that can be found with this prefix, to check // for forward extension when this method returns. int maxSupport = 0; // For each pair found (a pair is an item with a boolean indicating if it // appears in an itemset that is cut (a postfix) or not, and the sequence IDs // where it appears in the projected database). for (PairBIDE pair : pairs) { // if the item is frequent. if (pair.getCount() >= minsuppAbsolute) { // create the new postfix by appending this item to the prefix SequentialPattern newPrefix; // if the item is part of a postfix if (pair.isPostfix()) { // we append it to the last itemset of the prefix newPrefix = appendItemToPrefixOfSequence(prefix, pair.getItem()); // is =<is, (deltaT,i)> } else { // else, we append it as a new itemset to the sequence newPrefix = appendItemToSequence(prefix, pair.getItem()); } // build the projected database with this item // long start = System.currentTimeMillis(); List<PseudoSequenceBIDE> projectedContext = buildProjectedDatabase( pair.getItem(), contexte, pair.isPostfix(), pair.getSequenceIDs()); // debugProjectDBTime += System.currentTimeMillis() - start; // create new prefix newPrefix.setSequenceIDs(pair.getSequenceIDs()); // variable to keep track of the maximum support of extension // with this item and this prefix if (projectedContext.size() >= minsuppAbsolute) { int maxSupportOfSuccessors = 0; if (!checkBackScanPruning(newPrefix, 
pair.getSequenceIDs())) { maxSupportOfSuccessors = recursion(newPrefix, projectedContext); // récursion } // check the forward extension for the prefix // if no forward extension if (newPrefix.getSequenceIDs().size() != maxSupportOfSuccessors) { // if there is no backward extension if (!checkBackwardExtension(newPrefix, pair.getSequenceIDs())) { // save the pattern savePattern(newPrefix); } } } else { if (!checkBackwardExtension(newPrefix, pair.getSequenceIDs())) { // save the pattern savePattern(newPrefix); } } // record the largest support of patterns found starting // with this prefix until now if (newPrefix.getAbsoluteSupport() > maxSupport) { maxSupport = newPrefix.getAbsoluteSupport(); } } } return maxSupport; // return the maximum support generated by extension of the prefix }
/** * Method to update the support count of item in a maximum period * * @param prefix the current prefix * @param mapPaires * @param maximum periods a maximum period * @return a set of pairs indicating the support of items (note that a pair distinguish between * items in a postfix, prefix...). */ protected boolean findAllFrequentPairsForBackwardExtensionCheck( int seqProcessedCount, SequentialPattern prefix, PseudoSequenceBIDE maximumPeriod, int iPeriod, Map<PairBIDE, PairBIDE> mapPaires, Integer itemI, Integer itemIm1) { int supportToMatch = prefix.getSequenceIDs().size(); int maxPeriodSize = maximumPeriod.size(); // for each itemset in that period for (int i = 0; i < maxPeriodSize; i++) { int sizeOfItemsetAtI = maximumPeriod.getSizeOfItemsetAt(i); // NEW boolean sawI = false; // sawI after current position boolean sawIm1 = false; // sawI-1 before current position // END NEW // NEW march 20 2010 : check if I is after current position in current itemset for (int j = 0; j < sizeOfItemsetAtI; j++) { Integer item = maximumPeriod.getItemAtInItemsetAt(j, i); if (item.equals(itemI)) { sawI = true; } else if (item > itemI) { break; } } // END NEW for (int j = 0; j < sizeOfItemsetAtI; j++) { Integer item = maximumPeriod.getItemAtInItemsetAt(j, i); if (itemIm1 != null && item == itemIm1) { sawIm1 = true; } boolean isPrefix = maximumPeriod.isCutAtRight(i); boolean isPostfix = maximumPeriod.isPostfix(i); // END NEW PairBIDE paire = new PairBIDE(isPrefix, isPostfix, item); if (seqProcessedCount >= minsuppAbsolute) { // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ // normal case if (addPair(mapPaires, maximumPeriod.getId(), paire, supportToMatch)) { return true; } // NEW: special cases if (sawIm1) { PairBIDE paire2 = new PairBIDE(isPrefix, !isPostfix, item); if (addPair(mapPaires, maximumPeriod.getId(), paire2, supportToMatch)) { return true; } } if (sawI) { PairBIDE paire2 = new PairBIDE(!isPrefix, isPostfix, item); if (addPair(mapPaires, 
maximumPeriod.getId(), paire2, supportToMatch)) { return true; } } // END NEW } else { // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ // normal case addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire); // // NEW: special cases if (sawIm1) { PairBIDE paire2 = new PairBIDE(isPrefix, !isPostfix, item); addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire2); } if (sawI) { PairBIDE paire2 = new PairBIDE(!isPrefix, isPostfix, item); addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire2); } // END NEW } } } return false; }
/** * Method to check if a prefix has a backward-extension (see Bide+ article for full details). This * method do it a little bit differently than the BIDE+ article since we iterate with i on * elements of the prefix instead of iterating with a i on the itemsets of the prefix. But the * idea is the same! * * @param prefix the current prefix * @return boolean true, if there is a backward extension */ private boolean checkBackwardExtension(SequentialPattern prefix, Set<Integer> sidset) { // System.out.println("======" + prefix); int totalOccurenceCount = prefix.getItemOccurencesTotalCount(); // For the ith item of the prefix for (int i = 0; i < totalOccurenceCount; i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // (1) For each i, we build the list of maximum periods // for each sequence in the original database // int seqCount =0; int highestSupportUntilNow = -1; // 1703 pat - 9391 ms for (int sequenceID : sidset) { // OPTIMIZATION PART 1== DON'T CHECK THE BACK EXTENSION IF THERE IS NOT ENOUGH SEQUENCE // LEFT TO FIND AN EXTENSION // THIS CAN IMPROVE THE PERFORMANCE BY UP TO 30% on FIFA int remainingSeqID = (sidset.size() - alreadyVisitedSID.size()); if (highestSupportUntilNow != -1 && highestSupportUntilNow + remainingSeqID < sidset.size()) { break; } alreadyVisitedSID.add(sequenceID); // if(!alreadyVisitedSID.contains(sequenceID)) { // seqCount++; // alreadyVisitedSID.add(sequenceID); // } // END OF OPTIMIZATION PART 1 (IT CONTINUES A FEW LINES BELOW...) 
PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthMaximumPeriodOfAPrefix(prefix.getItemsets(), i); // if the period is not null if (period != null) { boolean hasBackwardExtension = findAllFrequentPairsForBackwardExtensionCheck( alreadyVisitedSID.size(), prefix, period, i, mapPaires, itemI, itemIm1); if (hasBackwardExtension) { // System.out.println(prefix + " has a backward extension from " + i + "th // maxperiod in sequence from seq. " + sequenceID ); return true; } // ===== OPTIMIZATION PART 2 if ((sidset.size() - alreadyVisitedSID.size()) < minsuppAbsolute) { for (PairBIDE pair : mapPaires.values()) { int supportOfPair = pair.getSequenceIDs().size(); if (supportOfPair > highestSupportUntilNow) { highestSupportUntilNow = supportOfPair; // +1 because it may be raised for this sequence... } } } // ===== END OF OPTIMIZATION PART 2 } } } // totaltimeForBackwardExtension += System.currentTimeMillis() - start; return false; // no backward extension, we return false }
/** * This is the main method for the BIDE+ algorithm. * * @param database a sequence database * @throws IOException exception if some error occurs while writing the output file. */ private void bide(SequenceDatabase database, String outputFilePath) throws IOException { // if the user want to keep the result into memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // The algorithm first scan the database to find all frequent items // The algorithm note the sequences in which these items appear. // This is stored in a map: Key: item Value : IDs of sequences containing the item Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE TO A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. 
// OPTIMIZATION Create COOC MAP // coocMapBefore = new HashMap<Integer, Map<Integer, // Integer>>(mapSequenceID.entrySet().size()); // we create a database initialDatabase = new ArrayList<PseudoSequenceBIDE>(); // for each sequence of the original database for (Sequence sequence : database.getSequences()) { // we make a copy of the sequence while removing infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if this sequence has size >0, we add it to the new database initialDatabase.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0)); } // // update COOC map // HashSet<Integer> alreadySeen = new HashSet<Integer>(); // for(List<Integer> itemset : optimizedSequence.getItemsets()) { // for(Integer item : itemset) { // Map<Integer, Integer> mapCoocItem = coocMapBefore.get(item); // if(mapCoocItem == null) { // mapCoocItem = new HashMap<Integer, Integer>(); // coocMapBefore.put(item, mapCoocItem); // } // for(Integer itemSeen : alreadySeen) { // if(itemSeen != item) { // Integer frequency = mapCoocItem.get(itemSeen); // if(frequency == null) { // mapCoocItem.put(itemSeen, 1); // }else { // mapCoocItem.put(itemSeen, frequency+1); // } // } // } // alreadySeen.add(item); // } // } } // For each frequent item loop1: for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent if (entry.getValue().size() >= minsuppAbsolute) { // Map<Integer, Integer> mapCoocItem = coocMapBefore.get(entry.getKey()); // if(mapCoocItem != null) { // for(Integer supportCoocBefore : mapCoocItem.values()) { // if(supportCoocBefore >= entry.getValue().size()) { // continue loop1; // } // } // } // build the projected database with this item Integer item = entry.getKey(); List<PseudoSequenceBIDE> projectedContext = buildProjectedContextSingleItem(item, initialDatabase, false, entry.getValue()); // Create the prefix with this item SequentialPattern prefix = new 
SequentialPattern(); prefix.addItemset(new Itemset(item)); // set the sequence IDS of this prefix prefix.setSequenceIDs(entry.getValue()); // variable to store the largest support of patterns // that will be found starting with this prefix if (projectedContext.size() >= minsuppAbsolute) { int successorSupport = 0; if (!checkBackScanPruning(prefix, entry.getValue())) { successorSupport = recursion(prefix, projectedContext); // récursion; } // Finally, because this prefix has support > minsup // and passed the backscan pruning, // we check if it has no sucessor with support >= minsup // (a forward extension) // IF no forward extension if (successorSupport != entry.getValue().size()) { // ######### MODIFICATION #### // IF there is also no backward extension if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } else { if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } } // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); }
/**
 * Returns a copy of the given sequence in which {@code item} has been added to the last
 * itemset. The sequence passed as argument is cloned first; the item is added to the clone.
 *
 * @param prefix the sequence to copy
 * @param item the item to add to the last itemset of the copy
 * @return the extended copy
 */
private SequentialPattern appendItemToPrefixOfSequence(SequentialPattern prefix, String item) {
    SequentialPattern extended = prefix.cloneSequence();
    // The last itemset of the copy receives the new item.
    int lastPosition = extended.size() - 1;
    Itemset lastItemset = extended.get(lastPosition);
    lastItemset.addItem(item);
    return extended;
}
/**
 * Returns a copy of the given sequence to which a new itemset holding only {@code item} has
 * been added. The sequence passed as argument is cloned first; the itemset is added to the
 * clone.
 *
 * @param prefix the sequence to copy
 * @param item the single item of the new itemset
 * @return the extended copy
 */
private SequentialPattern appendItemToSequence(SequentialPattern prefix, String item) {
    SequentialPattern extended = prefix.cloneSequence();
    Itemset singleItemItemset = new Itemset(item);
    extended.addItemset(singleItemItemset);
    return extended;
}