/**
 * For each item, calculate the IDs of the sequences containing that item.
 *
 * @param database the current sequence database
 * @return a map of items to the IDs of the sequences that contain each item
 */
private Map<Integer, Set<Integer>> findSequencesContainingItems(SequenceDatabase database) {
    // We use a map to store the sequence IDs where an item appears.
    // Key: item ID   Value: the set of IDs of sequences containing that item
    Map<Integer, Set<Integer>> mapSequenceID = new HashMap<Integer, Set<Integer>>();
    // for each sequence
    for (Sequence sequence : database.getSequences()) {
        // for each itemset in that sequence
        for (List<Integer> itemset : sequence.getItemsets()) {
            // for each item
            for (Integer item : itemset) {
                // get the set of sequence IDs for that item
                Set<Integer> sequenceIDs = mapSequenceID.get(item);
                if (sequenceIDs == null) {
                    // if null, create a new set
                    sequenceIDs = new HashSet<Integer>();
                    mapSequenceID.put(item, sequenceIDs);
                }
                // add the current sequence ID to this set
                sequenceIDs.add(sequence.getId());
            }
        }
    }
    return mapSequenceID;
}
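// A minimal, self-contained sketch (not from the original source) of the item-to-sequence-IDs
// indexing technique used by findSequencesContainingItems() above. It works on plain Java
// collections instead of the SequenceDatabase/Sequence classes so it can be compiled and run on
// its own; the class name and the toy data are illustrative assumptions only.
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class ItemIndexSketch {
    public static void main(String[] args) {
        // toy database: two sequences, each represented as a list of itemsets
        List<List<List<Integer>>> database = Arrays.asList(
                Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)),  // sequence 0: (1 2)(3)
                Arrays.asList(Arrays.asList(1), Arrays.asList(3)));    // sequence 1: (1)(3)

        // key: item   value: IDs of the sequences containing that item
        Map<Integer, Set<Integer>> mapSequenceID = new HashMap<Integer, Set<Integer>>();
        for (int seqId = 0; seqId < database.size(); seqId++) {
            for (List<Integer> itemset : database.get(seqId)) {
                for (Integer item : itemset) {
                    Set<Integer> sequenceIDs = mapSequenceID.get(item);
                    if (sequenceIDs == null) {
                        sequenceIDs = new HashSet<Integer>();
                        mapSequenceID.put(item, sequenceIDs);
                    }
                    sequenceIDs.add(seqId);
                }
            }
        }
        // prints {1=[0, 1], 2=[0], 3=[0, 1]} (map iteration order may vary)
        System.out.println(mapSequenceID);
    }
}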
/**
 * Load a sequence database from an array of sequences, where each non-null array entry is a
 * list of itemsets and the array index is used as the sequence ID. Null entries are skipped.
 *
 * @param aryListDB the array of sequences
 * @return the sequence database
 */
public static SequenceDatabase load(List<List<Integer>>[] aryListDB) {
    SequenceDatabase db = new SequenceDatabase();
    Sequence seq;
    for (int i = 0; i < aryListDB.length; i++) {
        if (aryListDB[i] != null) {
            // the array index becomes the sequence ID
            seq = new Sequence(i);
            for (List<Integer> itemset : aryListDB[i]) {
                seq.addItemset(itemset);
            }
            db.addSequence(seq);
        }
    }
    return db;
}
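// A usage sketch for the array-based load(...) above, assuming (as the signature suggests) that
// it is a static member of SequenceDatabase. The sketch class name and the sample data are
// illustrative assumptions, and imports for SequenceDatabase/Sequence are omitted since their
// package is not shown. The array index becomes the sequence ID and null entries are skipped,
// so the database below ends up containing two sequences with IDs 0 and 2.
import java.util.Arrays;
import java.util.List;

public class LoadFromArraySketch {
    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        List<List<Integer>>[] raw = new List[3];
        raw[0] = Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)); // sequence 0: (1 2)(3)
        raw[1] = null;                                                 // skipped by load(...)
        raw[2] = Arrays.asList(Arrays.asList(2), Arrays.asList(3));    // sequence 2: (2)(3)

        SequenceDatabase db = SequenceDatabase.load(raw);
        for (Sequence s : db.getSequences()) {
            System.out.println("loaded sequence with id " + s.getId()); // prints ids 0 and 2
        }
    }
}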
/**
 * Load a sequence database from a string representation. Each line encodes one sequence:
 * the field before the first "|" is the sequence ID, and each following "|"-separated field
 * is an itemset of whitespace-separated items.
 *
 * @param strDB the string representation of the database
 * @return the sequence database
 */
public static SequenceDatabase load(String strDB) {
    SequenceDatabase db = new SequenceDatabase();
    Sequence seq;
    List<Integer> iset;
    // split the string into one line per sequence
    String[] sequences = strDB.split("\\n");
    String[] itemsets;
    String[] items;
    for (String seqStr : sequences) {
        // split the line into the sequence ID followed by its itemsets
        itemsets = seqStr.trim().split("\\s*\\|\\s*");
        seq = new Sequence(Integer.valueOf(itemsets[0]));
        for (int i = 1; i < itemsets.length; i++) {
            // items within an itemset are separated by whitespace
            items = itemsets[i].split("\\s+");
            iset = new ArrayList<Integer>();
            for (String itemStr : items) {
                iset.add(Integer.valueOf(itemStr));
            }
            seq.addItemset(iset);
        }
        db.addSequence(seq);
    }
    return db;
}
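// A usage sketch for load(String) above. The input format is inferred from the parsing code:
// one sequence per line, the field before the first "|" is the sequence ID, and each following
// "|"-separated field is an itemset of whitespace-separated items. The sketch class name and
// the sample data are illustrative assumptions.
public class LoadFromStringSketch {
    public static void main(String[] args) {
        String strDB = "0 | 1 2 | 3\n"
                     + "1 | 1 | 3";
        // sequence 0 = (1 2)(3), sequence 1 = (1)(3)
        SequenceDatabase db = SequenceDatabase.load(strDB);
    }
}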
/**
 * This is the main method for the BIDE+ algorithm.
 *
 * @param database a sequence database
 * @param outputFilePath the path of the output file, or null to keep the result in memory
 * @throws IOException exception if some error occurs while writing the output file.
 */
private void bide(SequenceDatabase database, String outputFilePath) throws IOException {
    // if the user wants to keep the result in memory
    if (outputFilePath == null) {
        writer = null;
        patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
    } else { // if the user wants to save the result to a file
        patterns = null;
        writer = new BufferedWriter(new FileWriter(outputFilePath));
    }

    // The algorithm first scans the database to find all frequent items
    // and notes the sequences in which these items appear.
    // This is stored in a map:  key: item   value: IDs of the sequences containing the item
    Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);

    // WE CONVERT THE DATABASE TO A PSEUDO-DATABASE, AND REMOVE
    // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM
    // WILL NOT CONSIDER THEM ANYMORE.

    // OPTIMIZATION: create COOC map
    // coocMapBefore = new HashMap<Integer, Map<Integer, Integer>>(mapSequenceID.entrySet().size());

    // we create a database
    initialDatabase = new ArrayList<PseudoSequenceBIDE>();
    // for each sequence of the original database
    for (Sequence sequence : database.getSequences()) {
        // we make a copy of the sequence while removing infrequent items
        Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
        if (optimizedSequence.size() != 0) {
            // if this sequence has size > 0, we add it to the new database
            initialDatabase.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0));
        }

        // update COOC map (disabled optimization)
        // HashSet<Integer> alreadySeen = new HashSet<Integer>();
        // for (List<Integer> itemset : optimizedSequence.getItemsets()) {
        //     for (Integer item : itemset) {
        //         Map<Integer, Integer> mapCoocItem = coocMapBefore.get(item);
        //         if (mapCoocItem == null) {
        //             mapCoocItem = new HashMap<Integer, Integer>();
        //             coocMapBefore.put(item, mapCoocItem);
        //         }
        //         for (Integer itemSeen : alreadySeen) {
        //             if (itemSeen != item) {
        //                 Integer frequency = mapCoocItem.get(itemSeen);
        //                 if (frequency == null) {
        //                     mapCoocItem.put(itemSeen, 1);
        //                 } else {
        //                     mapCoocItem.put(itemSeen, frequency + 1);
        //                 }
        //             }
        //         }
        //         alreadySeen.add(item);
        //     }
        // }
    }

    // For each frequent item
    loop1: for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) {
        // if the item is frequent
        if (entry.getValue().size() >= minsuppAbsolute) {
            // Map<Integer, Integer> mapCoocItem = coocMapBefore.get(entry.getKey());
            // if (mapCoocItem != null) {
            //     for (Integer supportCoocBefore : mapCoocItem.values()) {
            //         if (supportCoocBefore >= entry.getValue().size()) {
            //             continue loop1;
            //         }
            //     }
            // }

            // build the projected database with this item
            Integer item = entry.getKey();
            List<PseudoSequenceBIDE> projectedContext =
                    buildProjectedContextSingleItem(item, initialDatabase, false, entry.getValue());

            // create the prefix with this item
            SequentialPattern prefix = new SequentialPattern();
            prefix.addItemset(new Itemset(item));
            // set the sequence IDs of this prefix
            prefix.setSequenceIDs(entry.getValue());

            // variable to store the largest support of patterns
            // that will be found starting with this prefix
            if (projectedContext.size() >= minsuppAbsolute) {
                int successorSupport = 0;

                if (!checkBackScanPruning(prefix, entry.getValue())) {
                    successorSupport = recursion(prefix, projectedContext); // recursion
                }

                // Finally, because this prefix has support >= minsup
                // and passed the backscan pruning,
                // we check that it has no successor with support >= minsup
                // (a forward extension).
                // IF there is no forward extension
                if (successorSupport != entry.getValue().size()) {
                    // ######### MODIFICATION ####
                    // IF there is also no backward extension
                    if (!checkBackwardExtension(prefix, entry.getValue())) {
                        // the pattern is closed and we save it
                        savePattern(prefix);
                    }
                }
            } else {
                if (!checkBackwardExtension(prefix, entry.getValue())) {
                    // the pattern is closed and we save it
                    savePattern(prefix);
                }
            }
        }
    }
    // check the memory usage for statistics
    MemoryLogger.getInstance().checkMemory();
}
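// A hypothetical public entry point (not shown in the excerpt above): bide(...) is private and
// relies on the fields minsuppAbsolute, writer and patterns, so the class presumably exposes a
// wrapper along these lines. The method name, the relative-support parameter and the
// database.size() call (assumed to return the number of sequences) are assumptions, not the
// actual API of the original class.
public SequentialPatterns runAlgorithm(SequenceDatabase database, String outputFilePath, double minsupRelative)
        throws IOException {
    // convert the relative minimum support into an absolute number of sequences
    // (assumption: minsuppAbsolute is an int field)
    this.minsuppAbsolute = (int) Math.ceil(minsupRelative * database.size());
    if (this.minsuppAbsolute == 0) {
        // protect against a minimum support of zero
        this.minsuppAbsolute = 1;
    }
    // run the BIDE+ search; results go to 'patterns' (in memory) or to the output file
    bide(database, outputFilePath);
    // close the output file if one was used
    if (writer != null) {
        writer.close();
    }
    return patterns;
}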
/**
 * This method generates statistics for a sequence database (a file).
 *
 * @param path the path to the file
 * @throws IOException exception if there is a problem while reading the file.
 */
public void getStats(String path) throws IOException {
    /////////////////////////////////////
    // (1) First, we read the sequence database into memory.
    // (Actually, we don't really need to read it into memory because it
    // requires only a single pass, but the code is simpler this way
    // - it could be optimized, if necessary.)
    ///////////////////////////////////
    List<Sequence> sequences = new ArrayList<Sequence>(); // a sequence database is stored as a list of sequences
    int maxItem = 0; // the largest item ID in the database
    String thisLine; // a temporary variable to read each line from the file
    BufferedReader myInput = null;
    try {
        // we read the file line by line
        FileInputStream fin = new FileInputStream(new File(path));
        myInput = new BufferedReader(new InputStreamReader(fin));
        int i = 0; // used to count the lines
        // for each line until the end of the file
        while ((thisLine = myInput.readLine()) != null) {
            // we split the line into tokens according to spaces
            String[] tokens = thisLine.split(" ");
            // we create a new sequence object to store the sequence that corresponds to this line
            Sequence sequence = new Sequence(i++);
            // we create a list of integers to store the current itemset from the sequence
            // that corresponds to this line
            List<Integer> itemset = new ArrayList<Integer>();
            // for each token
            for (String token : tokens) {
                // if the token starts with "<", it is a timestamp
                if (token.codePointAt(0) == '<') {
                    // we just ignore it for the statistics
                }
                // if the token is "-1", it marks the end of an itemset
                else if (token.equals("-1")) {
                    // we add the itemset to the sequence
                    sequence.addItemset(itemset);
                    // we reset the variable itemset to read the next itemset
                    itemset = new ArrayList<Integer>();
                }
                // if the token is "-2", it indicates the end of this sequence and the
                // end of the line
                else if (token.equals("-2")) {
                    // we add the sequence to the list of sequences
                    sequences.add(sequence);
                }
                // otherwise, the token is an item
                else {
                    // we convert it to an integer
                    Integer item = Integer.parseInt(token);
                    // we keep track of the largest item ID seen so far
                    if (item >= maxItem) {
                        maxItem = item;
                    }
                    // we add the item to the current itemset
                    itemset.add(item);
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (myInput != null) {
            myInput.close();
        }
    }
    /////////////////////////////////////
    // We finished reading the database into memory.
    // We will now calculate statistics on this sequence database.
    ///////////////////////////////////
    System.out.println("============ SEQUENCE DATABASE STATS ==========");
    System.out.println("Number of sequences : " + sequences.size());
    // we initialize some variables that we will use to generate the statistics
    java.util.Set<Integer> items = new java.util.HashSet<Integer>(); // the set of all items
    List<Integer> sizes = new ArrayList<Integer>(); // the length of each sequence
    List<Integer> itemsetsizes = new ArrayList<Integer>(); // the size of each itemset
    List<Integer> differentitems = new ArrayList<Integer>(); // the number of distinct items in each sequence
    List<Integer> appearXtimesbySequence = new ArrayList<Integer>(); // for each item appearing in a sequence,
    // the number of times that it appears in that sequence
    // Loop over the sequences of the database
    for (Sequence sequence : sequences) {
        // we add the size of this sequence to the list of sizes
        sizes.add(sequence.size());

        // this map is used to count the number of times that each item appears in this sequence
        // key: an item   value: the number of occurrences of the item so far in this sequence
        HashMap<Integer, Integer> mapIntegers = new HashMap<Integer, Integer>();

        // Loop over the itemsets of this sequence
        for (List<Integer> itemset : sequence.getItemsets()) {
            // we add the size of this itemset to the list of itemset sizes
            itemsetsizes.add(itemset.size());
            // Loop over the items of this itemset
            for (Integer item : itemset) {
                // if the item is not in the map already, we initialize its count to 0
                Integer count = mapIntegers.get(item);
                if (count == null) {
                    count = 0;
                }
                // then we increment the count by one
                count = count + 1;
                mapIntegers.put(item, count);
                // finally, we add the item to the set of items
                items.add(item);
            }
        }
        // we add the number of distinct items found in this sequence to the global list
        // "differentitems" for the database
        differentitems.add(mapIntegers.entrySet().size());

        // for each item appearing in this sequence,
        // we add its number of occurrences to the global list "appearXtimesbySequence"
        // described previously
        for (Entry<Integer, Integer> entry : mapIntegers.entrySet()) {
            appearXtimesbySequence.add(entry.getValue());
        }
    }

    // we print the statistics
    System.out.println("File " + path);
    System.out.println("Number of distinct items: " + items.size());
    System.out.println("Largest item id: " + maxItem);
    System.out.println("Average number of itemsets per sequence : " + calculateMean(sizes)
            + " standard deviation: " + calculateStdDeviation(sizes)
            + " variance: " + calculateVariance(sizes));
    System.out.println("Average number of distinct items per sequence : " + calculateMean(differentitems)
            + " standard deviation: " + calculateStdDeviation(differentitems)
            + " variance: " + calculateVariance(differentitems));
    System.out.println("Average number of occurrences in a sequence for each item appearing in a sequence : "
            + calculateMean(appearXtimesbySequence)
            + " standard deviation: " + calculateStdDeviation(appearXtimesbySequence)
            + " variance: " + calculateVariance(appearXtimesbySequence));
    System.out.println("Average number of items per itemset : " + calculateMean(itemsetsizes)
            + " standard deviation: " + calculateStdDeviation(itemsetsizes)
            + " variance: " + calculateVariance(itemsetsizes));
}
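// A usage sketch for getStats(...). The enclosing class is not shown above, so the name
// "SequenceStatsGenerator" is an assumption, as is the file name; the file is expected to be in
// the format parsed by getStats (items separated by spaces, "-1" ending an itemset, "-2" ending
// a sequence, and optional "<timestamp>" tokens, which are ignored).
public class StatsSketch {
    public static void main(String[] args) throws java.io.IOException {
        String path = "sequences.txt"; // hypothetical input file
        new SequenceStatsGenerator().getStats(path);
    }
}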