/** * This is the "backscan-pruning" strategy described in the BIDE+ paper to avoid extending some * prefixs that are guaranteed to not generate a closed pattern (see the BIDE+ paper for details). * * @param prefix the current prefix * @param projectedContext the projected database * @return boolean true if we should not extend the prefix */ private boolean checkBackScanPruning( SequentialPattern prefix, List<PseudoSequenceBIDE> projectedContext) { // DEBUGGIN if (prefix.size() == 1 && prefix.get(0).get(0) == 5) { System.out.println("PREFIX 5 "); } // // See the BIDE+ paper for details about this method. // For the number of item occurences that can be generated with this prefix: for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW // (1) For each i, we build the list of maximum periods // for each sequence in the original database for (PseudoSequenceBIDE pseudoSequence : projectedContext) { int sequenceID = pseudoSequence.sequence.getId(); alreadyVisitedSID.add(pseudoSequence.sequence.getId()); Position currentCutPosition = new Position(pseudoSequence.firstItemset, pseudoSequence.firstItem); PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthSemiMaximumPeriodOfAPrefix(prefix.getItemsets(), i, currentCutPosition); if (period != null) { // // we add it to the list of maximum periods boolean hasExtension = findAllFrequentPairsForBackwardExtensionCheck( prefix.getAbsoluteSupport(), prefix, period, i, mapPaires, itemI, itemIm1, currentCutPosition); if (hasExtension) { return true; } } } } return false; }
/** * This is the "backscan-pruning" strategy described in the BIDE+ paper to avoid extending some * prefixs that are guaranteed to not generate a closed pattern (see the BIDE+ paper for details). * * @param prefix the current prefix * @return boolean true if we should not extend the prefix */ private boolean checkBackScanPruning(SequentialPattern prefix, Set<Integer> sidset) { // // See the BIDE+ paper for details about this method. // For the number of item occurences that can be generated with this prefix: for (int i = 0; i < prefix.getItemOccurencesTotalCount(); i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW int seqCount = 0; // int highestSupportUntilNow = -1; // (1) For each i, we build the list of maximum periods // for each sequence in the original database for (int sequenceID : sidset) { alreadyVisitedSID.add(sequenceID); PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthSemiMaximumPeriodOfAPrefix(prefix.getItemsets(), i); if (period != null) { // // we add it to the list of maximum periods boolean hasExtension = findAllFrequentPairsForBackwardExtensionCheck( alreadyVisitedSID.size(), prefix, period, i, mapPaires, itemI, itemIm1); if (hasExtension) { return true; } } } } return false; }
/** * Create a projected database by pseudo-projection * * @param item The item to use to make the pseudo-projection * @param context The current database. * @param inSuffix This boolean indicates if the item "item" is part of a suffix or not. * @return the projected database. */ private List<PseudoSequenceBIDE> buildProjectedDatabase( Integer item, List<PseudoSequenceBIDE> database, boolean inSuffix, Set<Integer> sidset) { // The projected pseudo-database List<PseudoSequenceBIDE> sequenceDatabase = new ArrayList<PseudoSequenceBIDE>(); // for each sequence loop1: for (PseudoSequenceBIDE sequence : database) { if (sidset.contains(sequence.getId()) == false) { continue; } // for each item of the sequence for (int i = 0; i < sequence.size(); i++) { int sizeOfItemsetAti = sequence.getSizeOfItemsetAt(i); // check if the itemset contains the item that we use for the projection int index = sequence.indexOf(sizeOfItemsetAti, i, item); // if it does not, and the current item is part of a suffix if inSuffix is true // and vice-versa if (index != -1 && sequence.isPostfix(i) == inSuffix) { if (index != sizeOfItemsetAti - 1) { // if this is not the last item of the itemset // create a new pseudo sequence // add it to the projected database. sequenceDatabase.add(new PseudoSequenceBIDE(sequence, i, index + 1)); // continue loop1; // } } else if ((i != sequence.size() - 1)) { // if this is not the last itemset of the sequence // create a new pseudo sequence // add it to the projected database. sequenceDatabase.add(new PseudoSequenceBIDE(sequence, i + 1, 0)); // continue loop1; } } } } return sequenceDatabase; // return the projected database }
/** * Method to find all frequent items in a projected sequence database * * @param sequences the set of sequences * @return A list of pairs, where a pair is an item with (1) booleans indicating if it is in an * itemset that is "cut" at left or right (prefix or postfix) and (2) the sequence IDs where * it occurs. */ protected Set<PairBIDE> findAllFrequentPairs( SequentialPattern prefix, List<PseudoSequenceBIDE> sequences) { // We use a Map the store the pairs. Map<PairBIDE, PairBIDE> mapPairs = new HashMap<PairBIDE, PairBIDE>(); // for each sequence for (PseudoSequenceBIDE sequence : sequences) { // for each itemset for (int i = 0; i < sequence.size(); i++) { // for each item for (int j = 0; j < sequence.getSizeOfItemsetAt(i); j++) { Integer item = sequence.getItemAtInItemsetAt(j, i); // create the pair corresponding to this item PairBIDE pair = new PairBIDE(sequence.isCutAtRight(i), sequence.isPostfix(i), item); // register this sequenceID for that pair. addPairWithoutCheck(mapPairs, sequence.getId(), pair); } } } // check the memory usage MemoryLogger.getInstance().checkMemory(); return mapPairs.keySet(); // return the pairs. }
/** * Create a projected database by pseudo-projection * * @param item The item to use to make the pseudo-projection * @param context The current database. * @param inSuffix This boolean indicates if the item "item" is part of a suffix or not. * @return the projected database. */ private List<PseudoSequenceBIDE> buildProjectedContextSingleItem( Integer item, Map<Integer, PseudoSequenceBIDE> initialDatabase2, boolean inSuffix, Set<Integer> sidset) { // The projected pseudo-database List<PseudoSequenceBIDE> sequenceDatabase = new ArrayList<PseudoSequenceBIDE>(); // for each sequence loop1: for (int sid : sidset) { PseudoSequenceBIDE sequence = initialDatabase2.get(sid); // for each itemset of the sequence for (int i = 0; i < sequence.size(); i++) { int sizeOfItemsetAti = sequence.getSizeOfItemsetAt(i); // find the position of the item used for the projection in this itemset if it appears int index = sequence.indexOf(sizeOfItemsetAti, i, item); // if it does appear and it is in a postfix/suffix if the item is in a postfix/suffix if (index != -1 && sequence.isPostfix(i) == inSuffix) { // if this is not the last item of the itemset if (index != sizeOfItemsetAti - 1) { // create a new pseudo sequence sequenceDatabase.add(new PseudoSequenceBIDE(sequence, i, index + 1)); // continue loop1; } else if (i != sequence.size() - 1) { // if this is not the last itemset of the sequence // create a new pseudo sequence // if the size of this pseudo sequence is greater than 0 // add it to the projected database. sequenceDatabase.add(new PseudoSequenceBIDE(sequence, i + 1, 0)); // continue loop1; } } } } return sequenceDatabase; // return the projected database }
/** * Method to update the support count of item in a maximum period * * @param prefix the current prefix * @param mapPaires * @param maximum periods a maximum period * @return a set of pairs indicating the support of items (note that a pair distinguish between * items in a postfix, prefix...). */ protected boolean findAllFrequentPairsForBackwardExtensionCheck( int seqProcessedCount, SequentialPattern prefix, PseudoSequenceBIDE maximumPeriod, int iPeriod, Map<PairBIDE, PairBIDE> mapPaires, Integer itemI, Integer itemIm1) { int supportToMatch = prefix.getSequenceIDs().size(); int maxPeriodSize = maximumPeriod.size(); // for each itemset in that period for (int i = 0; i < maxPeriodSize; i++) { int sizeOfItemsetAtI = maximumPeriod.getSizeOfItemsetAt(i); // NEW boolean sawI = false; // sawI after current position boolean sawIm1 = false; // sawI-1 before current position // END NEW // NEW march 20 2010 : check if I is after current position in current itemset for (int j = 0; j < sizeOfItemsetAtI; j++) { Integer item = maximumPeriod.getItemAtInItemsetAt(j, i); if (item.equals(itemI)) { sawI = true; } else if (item > itemI) { break; } } // END NEW for (int j = 0; j < sizeOfItemsetAtI; j++) { Integer item = maximumPeriod.getItemAtInItemsetAt(j, i); if (itemIm1 != null && item == itemIm1) { sawIm1 = true; } boolean isPrefix = maximumPeriod.isCutAtRight(i); boolean isPostfix = maximumPeriod.isPostfix(i); // END NEW PairBIDE paire = new PairBIDE(isPrefix, isPostfix, item); if (seqProcessedCount >= minsuppAbsolute) { // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ // normal case if (addPair(mapPaires, maximumPeriod.getId(), paire, supportToMatch)) { return true; } // NEW: special cases if (sawIm1) { PairBIDE paire2 = new PairBIDE(isPrefix, !isPostfix, item); if (addPair(mapPaires, maximumPeriod.getId(), paire2, supportToMatch)) { return true; } } if (sawI) { PairBIDE paire2 = new PairBIDE(!isPrefix, isPostfix, item); if (addPair(mapPaires, maximumPeriod.getId(), paire2, supportToMatch)) { return true; } } // END NEW } else { // $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ // normal case addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire); // // NEW: special cases if (sawIm1) { PairBIDE paire2 = new PairBIDE(isPrefix, !isPostfix, item); addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire2); } if (sawI) { PairBIDE paire2 = new PairBIDE(!isPrefix, isPostfix, item); addPairWithoutCheck(mapPaires, maximumPeriod.getId(), paire2); } // END NEW } } } return false; }
/** * Method to check if a prefix has a backward-extension (see Bide+ article for full details). This * method do it a little bit differently than the BIDE+ article since we iterate with i on * elements of the prefix instead of iterating with a i on the itemsets of the prefix. But the * idea is the same! * * @param prefix the current prefix * @return boolean true, if there is a backward extension */ private boolean checkBackwardExtension(SequentialPattern prefix, Set<Integer> sidset) { // System.out.println("======" + prefix); int totalOccurenceCount = prefix.getItemOccurencesTotalCount(); // For the ith item of the prefix for (int i = 0; i < totalOccurenceCount; i++) { Set<Integer> alreadyVisitedSID = new HashSet<Integer>(); // // SOME CODE USED BY "findAllFrequentPairsForBackwardExtensionCheck" Integer itemI = prefix.getIthItem(i); // iPeriod Integer itemIm1 = null; // iPeriod -1 if (i > 0) { itemIm1 = prefix.getIthItem(i - 1); } // // END NEW // Create a Map of pairs to count the support of items (represented by a pair) // in the ith semi-maximum periods Map<PairBIDE, PairBIDE> mapPaires = new HashMap<PairBIDE, PairBIDE>(); // (1) For each i, we build the list of maximum periods // for each sequence in the original database // int seqCount =0; int highestSupportUntilNow = -1; // 1703 pat - 9391 ms for (int sequenceID : sidset) { // OPTIMIZATION PART 1== DON'T CHECK THE BACK EXTENSION IF THERE IS NOT ENOUGH SEQUENCE // LEFT TO FIND AN EXTENSION // THIS CAN IMPROVE THE PERFORMANCE BY UP TO 30% on FIFA int remainingSeqID = (sidset.size() - alreadyVisitedSID.size()); if (highestSupportUntilNow != -1 && highestSupportUntilNow + remainingSeqID < sidset.size()) { break; } alreadyVisitedSID.add(sequenceID); // if(!alreadyVisitedSID.contains(sequenceID)) { // seqCount++; // alreadyVisitedSID.add(sequenceID); // } // END OF OPTIMIZATION PART 1 (IT CONTINUES A FEW LINES BELOW...) PseudoSequenceBIDE sequence = initialDatabase.get(sequenceID); PseudoSequenceBIDE period = sequence.getIthMaximumPeriodOfAPrefix(prefix.getItemsets(), i); // if the period is not null if (period != null) { boolean hasBackwardExtension = findAllFrequentPairsForBackwardExtensionCheck( alreadyVisitedSID.size(), prefix, period, i, mapPaires, itemI, itemIm1); if (hasBackwardExtension) { // System.out.println(prefix + " has a backward extension from " + i + "th // maxperiod in sequence from seq. " + sequenceID ); return true; } // ===== OPTIMIZATION PART 2 if ((sidset.size() - alreadyVisitedSID.size()) < minsuppAbsolute) { for (PairBIDE pair : mapPaires.values()) { int supportOfPair = pair.getSequenceIDs().size(); if (supportOfPair > highestSupportUntilNow) { highestSupportUntilNow = supportOfPair; // +1 because it may be raised for this sequence... } } } // ===== END OF OPTIMIZATION PART 2 } } } // totaltimeForBackwardExtension += System.currentTimeMillis() - start; return false; // no backward extension, we return false }