/** * This is the main method for the PrefixSpan algorithm that is called to start the algorithm * * @param outputFilePath an output file path if the result should be saved to a file or null if * the result should be saved to memory. * @param database a sequence database * @throws IOException exception if an error while writing the output file */ private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException { // if the user want to keep the result into memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // We have to scan the database to find all frequent patterns of size 1. // We note the sequences in which these patterns appear. Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE ITON A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. 
(OPTIMIZATION : OCTOBER-08 ) // Create a list of pseudosequence List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>(); // for each sequence in the database for (Sequence sequence : database.getSequences()) { // remove infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if the size is > 0, create a pseudo sequence with this sequence initialContext.add(new PseudoSequence(optimizedSequence, 0, 0)); } } // For each item for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent (has a support >= minsup) if (entry.getValue().size() >= minsuppAbsolute) { // if the item is frequent // build the projected context String item = entry.getKey(); List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false); // Create the prefix for the projected context. SequentialPattern prefix = new SequentialPattern(0); prefix.addItemset(new Itemset(item)); prefix.setSequencesID(entry.getValue()); // The prefix is a frequent sequential pattern. // We save it in the result. savePattern(prefix); // we found a sequence. // Recursive call ! recursion(prefix, projectedContext); } } }
/** * This method creates a copy of the sequence and add a given item as a new itemset to the * sequence. It sets the support of the sequence as the support of the item. * * @param prefix the sequence * @param item the item * @return the new sequence */ private SequentialPattern appendItemToSequence(SequentialPattern prefix, Integer item) { SequentialPattern newPrefix = prefix.cloneSequence(); // isSuffix newPrefix.addItemset(new Itemset(item)); return newPrefix; }
/** * This is the main method for the BIDE+ algorithm. * * @param database a sequence database * @throws IOException exception if some error occurs while writing the output file. */ private void bide(SequenceDatabase database, String outputFilePath) throws IOException { // if the user want to keep the result into memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user want to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // The algorithm first scan the database to find all frequent items // The algorithm note the sequences in which these items appear. // This is stored in a map: Key: item Value : IDs of sequences containing the item Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE TO A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. 
// OPTIMIZATION Create COOC MAP // coocMapBefore = new HashMap<Integer, Map<Integer, // Integer>>(mapSequenceID.entrySet().size()); // we create a database initialDatabase = new ArrayList<PseudoSequenceBIDE>(); // for each sequence of the original database for (Sequence sequence : database.getSequences()) { // we make a copy of the sequence while removing infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if this sequence has size >0, we add it to the new database initialDatabase.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0)); } // // update COOC map // HashSet<Integer> alreadySeen = new HashSet<Integer>(); // for(List<Integer> itemset : optimizedSequence.getItemsets()) { // for(Integer item : itemset) { // Map<Integer, Integer> mapCoocItem = coocMapBefore.get(item); // if(mapCoocItem == null) { // mapCoocItem = new HashMap<Integer, Integer>(); // coocMapBefore.put(item, mapCoocItem); // } // for(Integer itemSeen : alreadySeen) { // if(itemSeen != item) { // Integer frequency = mapCoocItem.get(itemSeen); // if(frequency == null) { // mapCoocItem.put(itemSeen, 1); // }else { // mapCoocItem.put(itemSeen, frequency+1); // } // } // } // alreadySeen.add(item); // } // } } // For each frequent item loop1: for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent if (entry.getValue().size() >= minsuppAbsolute) { // Map<Integer, Integer> mapCoocItem = coocMapBefore.get(entry.getKey()); // if(mapCoocItem != null) { // for(Integer supportCoocBefore : mapCoocItem.values()) { // if(supportCoocBefore >= entry.getValue().size()) { // continue loop1; // } // } // } // build the projected database with this item Integer item = entry.getKey(); List<PseudoSequenceBIDE> projectedContext = buildProjectedContextSingleItem(item, initialDatabase, false, entry.getValue()); // Create the prefix with this item SequentialPattern prefix = new 
SequentialPattern(); prefix.addItemset(new Itemset(item)); // set the sequence IDS of this prefix prefix.setSequenceIDs(entry.getValue()); // variable to store the largest support of patterns // that will be found starting with this prefix if (projectedContext.size() >= minsuppAbsolute) { int successorSupport = 0; if (!checkBackScanPruning(prefix, entry.getValue())) { successorSupport = recursion(prefix, projectedContext); // récursion; } // Finally, because this prefix has support > minsup // and passed the backscan pruning, // we check if it has no sucessor with support >= minsup // (a forward extension) // IF no forward extension if (successorSupport != entry.getValue().size()) { // ######### MODIFICATION #### // IF there is also no backward extension if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } else { if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } } // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); }
/**
 * Builds a new pattern from a copy of {@code prefix} (obtained through
 * {@code cloneSequence()}) to which a fresh itemset holding only {@code item} is added.
 *
 * @param prefix the pattern to extend
 * @param item the item placed in the newly added itemset
 * @return the extended copy
 */
private SequentialPattern appendItemToSequence(SequentialPattern prefix, String item) {
    final SequentialPattern extended = prefix.cloneSequence();
    final Itemset singleton = new Itemset(item);
    extended.addItemset(singleton);
    return extended;
}