/** * Actual call to the SPAM algorithm. The output can be either kept or ignored. When we choose to * keep the patterns found, we can keep them in a file or in main memory. * * @param database Original database in which we want to search for the frequent patterns. * @param keepPatterns Flag indicating if we want to keep the output or not * @param verbose Flag for debugging purposes * @param outputFilePath Path of the file in which we want to store the frequent patterns. If this * value is null, we keep the patterns in main memory. This argument is taken into account * only when keepPatterns is activated. * @throws IOException if an error occurs while writing the output file */ public void runAlgorithm( SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath) throws IOException { // If we do not have any file path if (outputFilePath == null) { // the user wants to save the results in memory saver = new SaverIntoMemory(); } else { // otherwise, the user wants to save them in the given file saver = new SaverIntoFile(outputFilePath); } this.minSupAbsolute = (int) Math.ceil(minSupRelative * database.size()); if (this.minSupAbsolute == 0) { // protection this.minSupAbsolute = 1; } // reset the stats about memory usage MemoryLogger.getInstance().reset(); // keep the starting time start = System.currentTimeMillis(); // run the SPAM algorithm runSPAM(database, (long) minSupAbsolute, keepPatterns, verbose); // keep the ending time end = System.currentTimeMillis(); // search for frequent patterns: finished saver.finish(); }
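/**
 * Usage sketch (illustrative only, not part of the original code): how this method might be
 * invoked. The class name AlgoSPAM_AGP and a constructor taking the relative minimum support
 * are assumptions; only the runAlgorithm signature is taken from the code above.
 */
public static void exampleRunSPAM(SequenceDatabase database) throws IOException {
  AlgoSPAM_AGP algo = new AlgoSPAM_AGP(0.5); // hypothetical constructor: 50% relative minsup
  // keep the patterns found and write them to a file
  algo.runAlgorithm(database, true, false, "patterns.txt");
  // passing null as the file path instead would keep the patterns in main memory
}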
/** * Method to find all frequent items in a projected sequence database * * @param prefix the current prefix * @param sequences the set of sequences * @return A set of pairs, where a pair is an item with (1) a boolean indicating if it is in an * itemset that is "cut" and (2) the sequence IDs where it occurs. */ protected Set<Pair> findAllFrequentPairs( SequentialPattern prefix, List<PseudoSequence> sequences) { // We use a Map to store the pairs. Map<Pair, Pair> mapPairs = new HashMap<Pair, Pair>(); // for each sequence for (PseudoSequence sequence : sequences) { // for each itemset for (int i = 0; i < sequence.size(); i++) { // for each item for (int j = 0; j < sequence.getSizeOfItemsetAt(i); j++) { String item = sequence.getItemAtInItemsetAt(j, i); // create the pair corresponding to this item Pair paire = new Pair(sequence.isPostfix(i), item); // get the pair object stored in the map if there is one already Pair oldPaire = mapPairs.get(paire); // if there is no pair object yet if (oldPaire == null) { // store the pair object that we created mapPairs.put(paire, paire); } else { // otherwise reuse the existing one paire = oldPaire; } // record the current sequence id for that pair paire.getSequencesID().add(sequence.getId()); } } } // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); // return the set of pairs return mapPairs.keySet(); }
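/*
 * Aside (illustrative sketch, not from the original source): the Map<Pair, Pair> above
 * implements a canonicalizing set -- each pair is both key and value, so equal pairs built
 * from different sequences collapse into one instance that accumulates sequence IDs. The
 * idiom in its minimal generic form:
 */
static <T> T canonical(java.util.Map<T, T> pool, T candidate) {
  T existing = pool.get(candidate); // was an equal instance seen before?
  if (existing != null) {
    return existing; // reuse it so that state accumulates in a single object
  }
  pool.put(candidate, candidate); // the first occurrence becomes the canonical instance
  return candidate;
}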
/** Print the statistics of the algorithm execution to System.out. */ public void printStatistics() { StringBuilder r = new StringBuilder(200); r.append("============= Algorithm VMSP - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Frequent sequences count : " + patternCount); r.append('\n'); r.append(" Max memory (mb) : "); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append('\n'); r.append("minsup = " + minsup); r.append('\n'); r.append("Intersection count : " + Bitmap.INTERSECTION_COUNT + " \n"); r.append("===================================================\n"); System.out.println(r.toString()); }
/** * Run the algorithm. * * @param k the value of k. * @param minConfidence the minimum confidence threshold. * @param database the database. */ public void runAlgorithm(int k, double minConfidence, Database database) { // reset statistics MemoryLogger.getInstance().reset(); // reset utility to check memory usage maxCandidateCount = 0; // save parameters this.minConfidence = minConfidence; this.database = database; this.k = k; // prepare internal variables and structures this.minsuppRelative = 1; // for each item: the set of transaction ids (tids) containing it tableItemTids = new BitSet[database.maxItem + 1]; // for each item: its support count tableItemCount = new int[database.maxItem + 1]; kRules = new PriorityQueue<RuleG>(); candidates = new RedBlackTree<RuleG>(); // record the start time timeStart = System.currentTimeMillis(); // perform the first database scan to generate the vertical database representation scanDatabase(database); // start the generation of rules start(); // record the end time timeEnd = System.currentTimeMillis(); }
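/**
 * Usage sketch (illustrative only): mining the top-k rules. The class name AlgoTopKRules and
 * the way the Database is loaded are assumptions; the runAlgorithm signature is taken from the
 * code above.
 */
public static void exampleTopKRules(Database database) {
  AlgoTopKRules algo = new AlgoTopKRules(); // hypothetical class name
  algo.runAlgorithm(10, 0.8, database); // top-10 rules with confidence >= 0.8
  algo.printStats();
}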
/** * Method to recursively grow a given sequential pattern. * * @param prefix the current sequential pattern that we want to try to grow * @param database the current projected sequence database * @throws IOException exception if there is an error writing to the output file */ private void recursion(SequentialPattern prefix, List<PseudoSequence> database) throws IOException { // find frequent items of size 1 in the current projected database. Set<Pair> pairs = findAllFrequentPairs(prefix, database); // For each pair found (a pair is an item with a boolean indicating if it // appears in an itemset that is cut (a postfix) or not, and the sequence IDs // where it appears in the projected database). for (Pair pair : pairs) { // if the item is frequent in the current projected database if (pair.getCount() >= minsuppAbsolute) { // create the new prefix by appending this item to the current prefix SequentialPattern newPrefix; // if the item is part of a postfix if (pair.isPostfix()) { // we append it to the last itemset of the prefix newPrefix = appendItemToPrefixOfSequence(prefix, pair.getItem()); } else { // else, we append it as a new itemset to the sequence newPrefix = appendItemToSequence(prefix, pair.getItem()); } // build the projected database with this item List<PseudoSequence> projectedDB = buildProjectedContext(pair.getItem(), database, pair.isPostfix()); newPrefix.setSequencesID(pair.getSequencesID()); // save the pattern savePattern(newPrefix); // make a recursive call recursion(newPrefix, projectedDB); } } // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); }
/** * Method to run the algorithm * * @param input path to an input file * @param outputFilePath path for writing the output file * @param minsupRel the minimum support as a relative value * @throws IOException exception if an error occurs while reading the input or writing the output file */ public List<TreeSet<PatternVMSP>> runAlgorithm( String input, String outputFilePath, double minsupRel) throws IOException { Bitmap.INTERSECTION_COUNT = 0; // create an object to write the file writer = new BufferedWriter(new FileWriter(outputFilePath)); // initialize the number of patterns found patternCount = 0; // to log the memory used MemoryLogger.getInstance().reset(); // record start time startTime = System.currentTimeMillis(); // RUN THE ALGORITHM vmsp(input, minsupRel); // record end time endTime = System.currentTimeMillis(); // save the result to the file writeResultTofile(outputFilePath); // close the file writer.close(); return maxPatterns; }
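/**
 * Usage sketch (illustrative only): running VMSP on a file in SPMF format. The class name
 * AlgoVMSP and the file names are assumptions; the method signature and return type are taken
 * from the code above.
 */
public static void exampleVMSP() throws IOException {
  AlgoVMSP algo = new AlgoVMSP(); // hypothetical class name
  // 50% relative minimum support
  List<TreeSet<PatternVMSP>> maximalPatterns = algo.runAlgorithm("input.txt", "output.txt", 0.5);
  algo.printStatistics();
}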
/** Print statistics about the last algorithm execution to System.out. */ public void printStats() { System.out.println("============= TRULEGROWTH - STATS ============="); System.out.println("Sequential rules count: " + ruleCount); System.out.println("Total time : " + (timeEnd - timeStart) + " ms"); System.out.println("Max memory (mb) : " + MemoryLogger.getInstance().getMaxMemory()); System.out.println("====================================="); }
/** Print statistics about the last algorithm execution. */ public void printStats() { System.out.println("============= TOP-K RULES - STATS ============="); System.out.println("Minsup : " + minsuppRelative); System.out.println("Rules count: " + kRules.size()); System.out.println("Memory : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println("Total time : " + (timeEnd - timeStart) + " ms"); System.out.println("==================================================="); }
/** Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= FP-GROWTH 0.96r14 - STATS ============="); long temps = endTime - startTimestamp; System.out.println(" Transactions count from database : " + transactionCount); System.out.println(" Max memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println(" Total time ~ " + temps + " ms"); System.out.println("==================================================="); }
/** Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= dEclat (alternate Bitset version) v0.96r4 - STATS ============="); long temps = endTime - startTimestamp; System.out.println(" Transactions count from database : " + database.size()); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println(" Total time ~ " + temps + " ms"); System.out.println( " Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println("==================================================="); }
/** Print statistics about the algorithm execution to System.out. */ public void printStats() { System.out.println("============= APRIORI - STATS ============="); System.out.println(" Candidates count : " + totalCandidateCount); System.out.println( " The algorithm stopped at size " + (k - 1) + " because no further candidates could be generated"); System.out.println(" Frequent itemsets count : " + itemsetCount); System.out.println( " Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb"); System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms"); System.out.println("==================================================="); }
/** * Register a given rule in the set of candidates for future expansions. * * @param expandLR if true, the rule will be considered for both left and right expansions; * otherwise, only for right expansions. * @param rule the given rule */ private void registerAsCandidate(boolean expandLR, RuleG rule) { // add the rule to the candidates rule.expandLR = expandLR; candidates.add(rule); // record the maximum number of candidates reached, for statistics if (candidates.size() >= maxCandidateCount) { maxCandidateCount = candidates.size(); } // check the memory usage MemoryLogger.getInstance().checkMemory(); }
/** * This is the recursive method to find all high utility itemsets. It writes the itemsets to the * output file. * * @param prefix This is the current prefix. Initially, it is empty. * @param prefixLength The current prefix length * @param pUL This is the Utility List of the prefix. Initially, it is null. * @param ULs The utility lists corresponding to each extension of the prefix. * @param minUtility The minUtility threshold. * @throws IOException if an error occurs while writing to the output file */ private void fhm( int[] prefix, int prefixLength, UtilityList pUL, List<UtilityList> ULs, int minUtility) throws IOException { // For each extension X of prefix P for (int i = 0; i < ULs.size(); i++) { UtilityList X = ULs.get(i); // If pX is a high utility itemset, we save it if (X.sumIutils >= minUtility) { // save to file writeOut(prefix, prefixLength, X.item, X.sumIutils); } // If the sum of the remaining utilities for pX // is no less than minUtility, we explore extensions of pX // (this is the pruning condition) if (X.sumIutils + X.sumRutils >= minUtility) { // This list will contain the utility lists of the extensions of pX. List<UtilityList> exULs = new ArrayList<UtilityList>(); // For each extension Y of p appearing // after X according to the ascending order for (int j = i + 1; j < ULs.size(); j++) { UtilityList Y = ULs.get(j); // ==== EUCP: co-occurrence pruning introduced in FHM ==== Map<Integer, Long> mapTWUF = mapFMAP.get(X.item); if (mapTWUF != null) { Long twuF = mapTWUF.get(Y.item); // if the TWU of the pair {X, Y} is below minUtility, pXY cannot be // high utility, so we skip this extension if (twuF != null && twuF < minUtility) { continue; } } candidateCount++; // ==== end of EUCP pruning ==== // we construct the extension pXY // and add it to the list of extensions of pX UtilityList temp = construct(pUL, X, Y, minUtility); if (temp != null) { exULs.add(temp); } } // We create the new prefix pX itemsetBuffer[prefixLength] = X.item; // We make a recursive call to discover all itemsets extending the prefix pX fhm(itemsetBuffer, prefixLength + 1, X, exULs, minUtility); } } MemoryLogger.getInstance().checkMemory(); }
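/*
 * Minimal sketch of the EUCP check used in the loop above (illustrative helper, with the same
 * null-handling as the original code): a pair {x, y} is skipped only when the co-occurrence
 * map holds a TWU for it that is below minUtility.
 */
private static boolean eucpPrunes(Map<Integer, Map<Integer, Long>> mapFMAP, int x, int y, int minUtility) {
  Map<Integer, Long> mapTWUF = mapFMAP.get(x);
  if (mapTWUF == null) {
    return false; // no entry for x: the code above does not prune in this case
  }
  Long twuF = mapTWUF.get(y);
  return twuF != null && twuF < minUtility; // prune: pXY cannot be high utility
}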
/** * Print statistics about the algorithm execution to System.out. * * @param size the size of the database */ public void printStatistics(int size) { StringBuilder r = new StringBuilder(200); r.append("============= Algorithm BIDE2 - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Closed sequential pattern count : "); r.append(patternCount); r.append('\n'); r.append(" Max memory (mb) : "); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append('\n'); r.append("===================================================\n"); System.out.println(r.toString()); }
/** Print statistics about the algorithm execution to System.out. */ public void printStatistics() { StringBuilder r = new StringBuilder(200); r.append("============= LAPIN - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Frequent sequences count : " + patternCount); r.append('\n'); r.append(" Max memory (mb) : "); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append('\n'); r.append("==================================================="); System.out.println(r.toString()); }
/** Return a string with the statistics of the algorithm execution. */ public String printStatistics() { StringBuilder sb = new StringBuilder(200); sb.append("============= Algorithm - STATISTICS =============\n Total time ~ "); sb.append(getRunningTime()); sb.append(" ms\n"); sb.append(" Frequent sequences count : "); sb.append(numberOfFrequentPatterns); sb.append('\n'); sb.append(" Max memory (mb) : "); sb.append(MemoryLogger.getInstance().getMaxMemory()); sb.append('\n'); sb.append(saver.print()); sb.append("\n===================================================\n"); return sb.toString(); }
/** * Main method to run the algorithm * * @param input an input file path * @param outputFilePath an output file path * @param minsupRel the minimum support threshold as a percentage * @throws IOException exception when writing the result to a file */ public void runAlgorithm(String input, String outputFilePath, double minsupRel) throws IOException { this.input = input; // prepare the file writer for saving the result to a file writer = new BufferedWriter(new FileWriter(outputFilePath)); patternCount = 0; // reset the tool to calculate max. memory usage MemoryLogger.getInstance().reset(); startTime = System.currentTimeMillis(); // launch the algorithm! lapin(input, minsupRel); endTime = System.currentTimeMillis(); writer.close(); }
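/**
 * Usage sketch (illustrative only): running LAPIN. The class name AlgoLAPIN and the file names
 * are assumptions; the signature is taken from the code above.
 */
public static void exampleLAPIN() throws IOException {
  AlgoLAPIN algo = new AlgoLAPIN(); // hypothetical class name
  algo.runAlgorithm("input.txt", "output.txt", 0.4); // 40% relative minsup
  algo.printStatistics();
}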
/** Print the statistics of the algorithm execution to System.out. */ public void printStatistics() { StringBuilder r = new StringBuilder(200); r.append("============= Algorithm - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Frequent sequences count : " + patternCount); r.append('\n'); r.append(" Max memory (mb) : "); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append('\n'); r.append("minsup = " + minsup); r.append('\n'); r.append("Intersection count : " + Bitmap.INTERSECTION_COUNT + " \n"); r.append("===================================================\n"); System.out.println(r.toString()); }
/** * Print statistics about the algorithm execution to System.out. * * @param size the size of the database */ public void printStatistics(int size) { StringBuilder r = new StringBuilder(200); r.append("============= Algorithm MaxSP - STATISTICS =============\n Total time ~ "); r.append(endTime - startTime); r.append(" ms\n"); r.append(" Maximal sequential pattern count : "); r.append(patternCount); r.append('\n'); r.append(" Max memory (mb) : "); r.append(MemoryLogger.getInstance().getMaxMemory()); r.append('\n'); r.append("===================================================\n"); System.out.println(r.toString()); }
/** * Run the algorithm * * @param database : a sequence database * @param outputFilePath : the path of the output file to save the result, or null if you want the * result to be saved into memory * @param minsup : the minimum support as an integer * @return the patterns found if saved into memory, otherwise null * @throws IOException exception if error while writing the file */ public SequentialPatterns runAlgorithm( SequenceDatabase database, String outputFilePath, int minsup) throws IOException { // initialize variables for statistics patternCount = 0; MemoryLogger.getInstance().reset(); // to check the memory usage // keep the minimum support because we will need it this.minsuppAbsolute = minsup; // save the start time startTime = System.currentTimeMillis(); // run the algorithm prefixSpan(database, outputFilePath); // save the end time endTime = System.currentTimeMillis(); // close the output file if the result was saved to a file if (writer != null) { writer.close(); } return patterns; }
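/**
 * Usage sketch (illustrative only): the class name AlgoPrefixSpan is an assumption; the
 * signature and the null-path convention ("null keeps the result in memory") are taken from
 * the code above.
 */
public static void examplePrefixSpan(SequenceDatabase database) throws IOException {
  AlgoPrefixSpan algo = new AlgoPrefixSpan(); // hypothetical class name
  // absolute minsup of 2 sequences; a null path keeps the result in memory
  SequentialPatterns patterns = algo.runAlgorithm(database, null, 2);
}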
/** * Method to run the algorithm * * @param input path to an input file * @param outputFilePath path for writing the output file * @param minsupRel the minimum support as a relative value * @param outputSequenceIdentifiers if true, sequence ids will be shown with each output pattern * @throws IOException exception if error while writing the file or reading */ public void runAlgorithm( String input, String outputFilePath, double minsupRel, boolean outputSequenceIdentifiers) throws IOException { this.outputSequenceIdentifiers = outputSequenceIdentifiers; Bitmap.INTERSECTION_COUNT = 0; // create an object to write the file writer = new BufferedWriter(new FileWriter(outputFilePath)); // initialize the number of patterns found patternCount = 0; // to log the memory used MemoryLogger.getInstance().reset(); // record start time startTime = System.currentTimeMillis(); // RUN THE ALGORITHM spam(input, minsupRel); // record end time endTime = System.currentTimeMillis(); // close the file writer.close(); }
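/**
 * Usage sketch (illustrative only): running SPAM from a file. The class name AlgoSPAM and the
 * file names are assumptions; the signature is taken from the code above.
 */
public static void exampleSPAM() throws IOException {
  AlgoSPAM algo = new AlgoSPAM(); // hypothetical class name
  // 50% relative minsup; also print the matching sequence ids with each pattern
  algo.runAlgorithm("input.txt", "output.txt", 0.5, true);
}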
/** * Print statistics about the latest execution to System.out. * * @throws IOException if an error occurs while computing object sizes in debug mode */ public void printStats() throws IOException { System.out.println("============= FHM ALGORITHM v0.96r18 - STATS ============="); System.out.println(" Total time ~ " + (endTimestamp - startTimestamp) + " ms"); System.out.println(" Memory ~ " + MemoryLogger.getInstance().getMaxMemory() + " MB"); System.out.println(" High-utility itemsets count : " + huiCount); System.out.println(" Candidate count : " + candidateCount); if (DEBUG) { int pairCount = 0; // estimate the in-memory size of the CMAP (co-occurrence map) double cmapSize = getObjectSize(mapFMAP); for (Entry<Integer, Map<Integer, Long>> entry : mapFMAP.entrySet()) { cmapSize += getObjectSize(entry.getKey()); for (Entry<Integer, Long> entry2 : entry.getValue().entrySet()) { pairCount++; cmapSize += getObjectSize(entry2.getKey()) + getObjectSize(entry2.getValue()); } } System.out.println(" CMAP size ~ " + cmapSize + " MB"); System.out.println(" Pair count : " + pairCount); } System.out.println("==================================================="); }
/** * Run the algorithm * * @param database a sequence database * @param outputPath an output file path, or null to keep the result in memory * @param minsup a minimum support as an integer representing a number of sequences * @return the patterns found if saved into memory, otherwise null * @throws IOException exception if error while writing the file */ public SequentialPatterns runAlgorithm(SequenceDatabase database, String outputPath, int minsup) throws IOException { // save minsup this.minsuppAbsolute = minsup; // reset the counter for the number of patterns found patternCount = 0; // reset the stats about memory usage MemoryLogger.getInstance().reset(); // save the start time startTime = System.currentTimeMillis(); // start the algorithm bide(database, outputPath); // save the end time endTime = System.currentTimeMillis(); // close the output file if the result was saved to a file if (writer != null) { writer.close(); } return patterns; }
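/**
 * Usage sketch (illustrative only): running BIDE+. The class name AlgoBIDEPlus is an
 * assumption; the signature and the null-path convention are taken from the code above.
 */
public static void exampleBIDE(SequenceDatabase database) throws IOException {
  AlgoBIDEPlus algo = new AlgoBIDEPlus(); // hypothetical class name
  // absolute minsup of 2 sequences; a null path keeps the closed patterns in memory
  SequentialPatterns closedPatterns = algo.runAlgorithm(database, null, 2);
}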
/** * Method to find all frequent items in a projected sequence database * * @param prefix the current prefix * @param sequences the set of sequences * @return A set of pairs, where a pair is an item with (1) booleans indicating if it is in an * itemset that is "cut" at left or right (prefix or postfix) and (2) the sequence IDs where * it occurs. */ protected Set<PairBIDE> findAllFrequentPairs( SequentialPattern prefix, List<PseudoSequenceBIDE> sequences) { // We use a Map to store the pairs. Map<PairBIDE, PairBIDE> mapPairs = new HashMap<PairBIDE, PairBIDE>(); // for each sequence for (PseudoSequenceBIDE sequence : sequences) { // for each itemset for (int i = 0; i < sequence.size(); i++) { // for each item for (int j = 0; j < sequence.getSizeOfItemsetAt(i); j++) { Integer item = sequence.getItemAtInItemsetAt(j, i); // create the pair corresponding to this item PairBIDE pair = new PairBIDE(sequence.isCutAtRight(i), sequence.isPostfix(i), item); // register this sequence ID for that pair addPairWithoutCheck(mapPairs, sequence.getId(), pair); } } } // check the memory usage MemoryLogger.getInstance().checkMemory(); return mapPairs.keySet(); // return the pairs }
/** * The actual method for extracting frequent sequences. * * @param database The original database * @param minSupportAbsolute the absolute minimum support * @param keepPatterns flag indicating if we are interested in keeping the output of the algorithm * @param verbose Flag for debugging purposes */ protected void runSPAM( SequenceDatabase database, long minSupportAbsolute, boolean keepPatterns, boolean verbose) { // We get the equivalence classes formed by the frequent 1-patterns frequentItems = database.frequentItems(); // We extract their patterns Collection<Pattern> size1sequences = getPatterns(frequentItems); // If we want to keep the output if (keepPatterns) { for (Pattern atom : size1sequences) { // We keep all the frequent 1-patterns saver.savePattern(atom); } } database = null; // We define the root class EquivalenceClass rootClass = new EquivalenceClass(null); /* And we insert the equivalence classes corresponding to the frequent 1-patterns as its members */ for (EquivalenceClass atom : frequentItems) { rootClass.addClassMember(atom); } // Initialization of the class that is in charge of finding the frequent patterns FrequentPatternEnumeration_SPAM frequentPatternEnumeration = new FrequentPatternEnumeration_SPAM(minSupAbsolute, saver); // We execute the search frequentPatternEnumeration.execute(rootClass, keepPatterns, verbose); // Once we have finished, we keep the number of frequent patterns that were found numberOfFrequentPatterns = frequentPatternEnumeration.getFrequentPatterns(); // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); }
/** * Run the LAPIN algorithm * * @param input the input file path * @param minsupRel the minsup threshold as a percentage * @throws IOException if an error occurs while reading the input file or writing the output */ private void lapin(String input, double minsupRel) throws IOException { if (DEBUG) { System.out.println( "=== First database scan to count number of sequences and support of single items ==="); } // FIRST DATABASE SCAN: SCAN THE DATABASE TO COUNT // - THE NUMBER OF SEQUENCES // - THE SUPPORT OF EACH SINGLE ITEM // - THE LARGEST ITEM ID int sequenceCount = 0; int largestItemID = 0; // This map will store for each item (key) the first position where the item appears in each // sequence where it appears (value) Map<Integer, List<Position>> mapItemFirstOccurrences = new HashMap<Integer, List<Position>>(); try { // Read the input file BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // for each sequence of the input file while ((thisLine = reader.readLine()) != null) { // we use a set to remember which items have been seen already Set<Integer> itemsAlreadySeen = new HashSet<Integer>(); // to know the itemset number short itemsetID = 0; // for each token in this line for (String integer : thisLine.split(" ")) { // if it is the end of an itemset if ("-1".equals(integer)) { itemsetID++; } else if ("-2".equals(integer)) { // if it is the end of the line, there is nothing to do here } else { // otherwise, it is an item Integer item = Integer.valueOf(integer); // if this item was not seen already in that sequence if (itemsAlreadySeen.contains(item) == false) { // Get the list of positions of that item List<Position> list = mapItemFirstOccurrences.get(item); // if that list is null, create a new list if (list == null) { list = new ArrayList<Position>(); mapItemFirstOccurrences.put(item, list); } // Add the position of the item in that sequence to the list of first positions // of that item Position position = new Position(sequenceCount, itemsetID); list.add(position); // Remember that we have seen this item itemsAlreadySeen.add(item); // Check if the item is the largest item seen until now if (item > largestItemID) { largestItemID = item; } } } } // Increase the count of sequences from the input file sequenceCount++; } reader.close(); } catch (Exception e) { e.printStackTrace(); } // Initialize the list of tables tables = new Table[sequenceCount]; // Calculate the absolute minimum support as a number of sequences minsup = (int) Math.ceil(minsupRel * sequenceCount); if (minsup == 0) { minsup = 1; } if (DEBUG) { System.out.println("Number of items: " + mapItemFirstOccurrences.size()); System.out.println("Sequence count: " + sequenceCount); System.out.println("Abs. minsup: " + minsup + " sequences"); System.out.println("Rel. minsup: " + minsupRel + " %"); System.out.println("=== Determining the frequent items ==="); } // For each frequent item, save it and add it to the list of frequent items List<Integer> frequentItems = new ArrayList<Integer>(); for (Entry<Integer, List<Position>> entry : mapItemFirstOccurrences.entrySet()) { // Get the border created by this item List<Position> itemBorder = entry.getValue(); // if the item is frequent if (itemBorder.size() >= minsup) { // Output the item and add it to the list of frequent items Integer item = entry.getKey(); savePattern(item, itemBorder.size()); frequentItems.add(item); if (DEBUG) { System.out.println(" Item " + item + " is frequent with support = " + itemBorder.size()); } } } if (DEBUG) { System.out.println("=== Second database scan to construct item-is-exist tables ==="); } // sort the frequent items (useful when generating 2-IE-sequences, later on). Collections.sort(frequentItems); // SECOND DATABASE SCAN: // Now we will read the database again to create the item-is-exist tables // and the SE-position lists, and count the support of 2-IE-sequences matrixPairCount = new SparseTriangularMatrix(largestItemID + 1); // Initialize the IE position lists and SE position lists sePositionList = new SEPositionList[sequenceCount]; iePositionList = new IEPositionList[sequenceCount]; try { // Prepare to read the file BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // For each sequence in the file int currentSequenceID = 0; while ((thisLine = reader.readLine()) != null) { // (1) ------- PARSE THE SEQUENCE BACKWARD TO CREATE THE ITEM-IS-EXIST TABLE FOR THAT // SEQUENCE AND COUNT THE SUPPORT OF 2-IE-SEQUENCES // We will also use a structure to remember in which sequence we have seen each pair of // items. // Note that in this structure, we will add +1 to the sid because by default the matrix is // filled with 0 // and we don't want to think that the first sequence was already seen for all pairs. AbstractTriangularMatrix matrixPairLastSeenInSID = new SparseTriangularMatrix(largestItemID + 1); // We count the number of positions (number of itemsets). // To do that we count the number of "-" symbols in the line. // We need to subtract 1 because the end of line "-2" also contains "-". int positionCount = -1; for (char caracter : thisLine.toCharArray()) { if (caracter == '-') { positionCount++; } } // Now we will scan the sequence again. // This time we will remember which items were seen already Set<Integer> itemsAlreadySeen = new HashSet<Integer>(); // During this scan, we will create the table for this sequence Table table = new Table(); // To do that, we first create an initial position vector for that table // (the initial size is only a hint; a BitSet grows automatically as needed) BitSet currentBitset = new BitSet(mapItemFirstOccurrences.size()); // This variable will be used to remember if a new item appeared in the current itemset boolean seenNewItem = false; // We will scan the sequence backward, starting from the end, because // we should not create a bit vector for all positions but only for // the positions that are different from the previous one.
String[] tokens = thisLine.split(" "); // This is the number of itemsets int currentPosition = positionCount; // to keep the current itemset in memory List<Integer> currentItemset = new ArrayList<Integer>(); // For each token in that sequence for (int i = tokens.length - 1; i >= 0; i--) { // get the token String token = tokens[i]; // if we reached the end of an itemset if ("-1".equals(token)) { // update the triangular matrix for counting 2-IE-sequences // by comparing each pair of items in the current itemset for (int k = 0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for (int m = k + 1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // get the last sequence where this pair was seen int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2); // if we have not counted this pair for the current sequence yet if (sid != currentSequenceID + 1) { // increment the support count of this pair matrixPairCount.incrementCount(item1, item2); // remember that we have seen this pair so that we don't count it again matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID + 1); } } } currentItemset.clear(); // Decrease the current index of the position (itemset) in the sequence currentPosition--; // if the bit vector has changed since the previous position, then // we need to add a new bit vector to the table if (seenNewItem) { // create the position vector and add it to the item-is-exist table PositionVector vector = new PositionVector(currentPosition, (BitSet) currentBitset.clone()); table.add(vector); } } else if ("-2".equals(token)) { // if end of sequence, nothing to do } else { // otherwise, it is an item Integer item = Integer.valueOf(token); if (mapItemFirstOccurrences.get(item).size() >= minsup) { // only for frequent items // if it is the first time that we see this item if (itemsAlreadySeen.contains(item) == false) { // remember that we have seen a new item seenNewItem = true; // remember that we have seen this item itemsAlreadySeen.add(item); // add this item to the current bit vector currentBitset.set(item); } // add this item to the current itemset currentItemset.add(item); } } } // Lastly, update the triangular matrix for counting 2-IE-sequences one more time, // for the itemset at the first position of the sequence, // by considering each pair of items in the last itemset that was parsed. // This is done as above, so the code is not commented again. for (int k = 0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for (int m = k + 1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // if we have not counted this pair for the current sequence yet int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2); if (sid != currentSequenceID + 1) { matrixPairCount.incrementCount(item1, item2); matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID + 1); } } } // If a new item was seen, add an extra row to the item-is-exist table // (a row called -1) containing all the items of this sequence if (seenNewItem) { PositionVector vector = new PositionVector(-1, (BitSet) currentBitset.clone()); table.add(vector); } // Initialize the IE position list and SE position list for that sequence, // which will be filled during the next database scan.
sePositionList[currentSequenceID] = new SEPositionList(itemsAlreadySeen); iePositionList[currentSequenceID] = new IEPositionList(); if (DEBUG) { System.out.println("Table for sequence " + currentSequenceID + " : " + thisLine); System.out.println(table.toString()); } // put the current table in the array of item-is-exist-tables tables[currentSequenceID] = table; // we will process the next sequence id currentSequenceID++; } reader.close(); } catch (Exception e) { e.printStackTrace(); } // THIRD DATABASE SCAN: // PARSE EACH SEQUENCE FORWARD TO CREATE THE SE-POSITION LIST OF THAT SEQUENCE // AND THE IE-POSITION LISTS FOR FREQUENT 2-IE-SEQUENCES try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); String thisLine; // For each sequence int currentSequenceID = 0; while ((thisLine = reader.readLine()) != null) { // This time, we scan the sequence forward, starting from the beginning. String[] tokens = thisLine.split(" "); // to keep the current itemset in memory List<Integer> currentItemset = new ArrayList<Integer>(); // this variable will be used to remember which itemset we are visiting short itemsetID = 0; // for each token of the current sequence for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; // if we reached the end of an itemset if ("-1".equals(token)) { // if the current itemset contains more than one item if (currentItemset.size() > 1) { // update the position list for 2-IE-sequences for (int k = 0; k < currentItemset.size(); k++) { Integer item1 = currentItemset.get(k); for (int m = k + 1; m < currentItemset.size(); m++) { Integer item2 = currentItemset.get(m); // if the pair is frequent int support = matrixPairCount.getSupportForItems(item1, item2); if (support >= minsup) { iePositionList[currentSequenceID].register(item1, item2, itemsetID); } } } } // increase the itemsetID itemsetID++; // clear the itemset currentItemset.clear(); } else if ("-2".equals(token)) { // if the end of the sequence, nothing special to do } else { // otherwise, the current token is an item Integer item = Integer.valueOf(token); // if the item is frequent if (mapItemFirstOccurrences.get(item).size() >= minsup) { // we add the current position to the item's SE-position list sePositionList[currentSequenceID].register(item, itemsetID); // we add the item to the current itemset currentItemset.add(item); } } } if (DEBUG) { System.out.println("SE Position list for sequence " + currentSequenceID); System.out.println(sePositionList[currentSequenceID]); System.out.println("IE Position list for sequence " + currentSequenceID); System.out.println(iePositionList[currentSequenceID]); } iePositionList[currentSequenceID].sort(); // sort the IE-position list // update the sequence id for the next sequence currentSequenceID++; } reader.close(); } catch (Exception e) { e.printStackTrace(); } if (DEBUG) { System.out.println("=== Starting sequential pattern generation ==="); } // For each frequent item, call the recursive method to explore larger patterns for (int i = 0; i < frequentItems.size(); i++) { // Get the item int item1 = frequentItems.get(i); // Get the border for that item List<Position> item1Border = mapItemFirstOccurrences.get(item1); if (DEBUG) { System.out.println("=== Considering item " + item1); System.out.println(" Border of " + item1); for (Position pos : item1Border) { System.out.println(" seq: " + pos.sid + " itemset: " + pos.position); } }
// if the item's border contains at least minsup sequences (that is, if the item is frequent) if (item1Border.size() >= minsup) { // Create a prefix object to represent the sequential pattern containing the item Prefix prefix = new Prefix(); List<Integer> itemset = new ArrayList<Integer>(1); itemset.add(item1); prefix.itemsets.add(itemset); // make a recursive call to find s-extensions of this prefix genPatterns( prefix, item1Border, frequentItems, frequentItems, item1, true); // true, to disallow i-extensions, because we explore 2-IE-sequences separately } // For each frequent 2-IE-sequence starting with item1, we will explore 2-IE-sequences // by considering each frequent item larger than item1 for (int k = i + 1; k < frequentItems.size(); k++) { // We consider item2 int item2 = frequentItems.get(k); // Get the support of {item1, item2} int support = matrixPairCount.getSupportForItems(item1, item2); // if the pair {item1, item2} is frequent if (support >= minsup) { // get the list of positions of item2 List<Position> item2Border = mapItemFirstOccurrences.get(item2); // Create the border by using the 2-IE position list List<Position> ie12Border = new ArrayList<Position>(); // We will loop over the border of item1 or item2 (whichever is smaller) List<Position> borderToUse; if (item2Border.size() < item1Border.size()) { borderToUse = item2Border; } else { borderToUse = item1Border; } // For each sequence of the border that we consider for (Position sequenceToUse : borderToUse) { // Get the sequence id int sid = sequenceToUse.sid; // For this sequence, we will get the position list of each item List<Short> listPosition1 = sePositionList[sid].getListForItem(item1); List<Short> listPosition2 = sePositionList[sid].getListForItem(item2); // if one of them is null, it means that item1 or item2 does not appear in that // sequence, so we continue to the next sequence if (listPosition1 == null || listPosition2 == null) { continue; } // otherwise, // find the first common position of item1 and item2 in the sequence int index1 = 0; int index2 = 0; // we do that with the following while loop while (index1 < listPosition1.size() && index2 < listPosition2.size()) { short position1 = listPosition1.get(index1); short position2 = listPosition2.get(index2); if (position1 < position2) { index1++; } else if (position1 > position2) { index2++; } else { // we have found the position, so we add it to the new border and // then stop, because we do not want to add more than one position for // the same sequence in the new border ie12Border.add(new Position(sid, position1)); break; } } } if (DEBUG) { System.out.println( "=== Considering the 2-IE sequence {" + item1 + "," + item2 + "} with support " + support); System.out.println(" Border of {" + item1 + "," + item2 + "}"); for (Position pos : ie12Border) { System.out.println(" seq: " + pos.sid + " itemset: " + pos.position); } } // finally, we create the prefix for the pattern {item1, item2} Prefix prefix = new Prefix(); List<Integer> itemset = new ArrayList<Integer>(2); itemset.add(item1); itemset.add(item2); prefix.itemsets.add(itemset); // save the pattern savePattern(prefix, support); // perform a recursive call to extend that pattern genPatterns( prefix, ie12Border, frequentItems, frequentItems, item2, false); // false, to allow i-extensions } } } // Record the maximum memory usage MemoryLogger.getInstance().checkMemory(); writer.close(); }
/** * This is the dfsPruning method as described in the SPAM paper. * * @param prefix the current prefix * @param prefixBitmap the bitmap corresponding to the current prefix * @param sn a list of items to be considered for s-steps * @param in a list of items to be considered for i-steps * @param hasToBeGreaterThanForIStep the item that any i-step item must be greater than * @param m size of the current prefix in terms of items * @param lastAppendedItem the last item appended to the prefix * @throws IOException if there is an error writing a pattern to the output file * @return true if a frequent pattern was created using the prefix */ boolean dfsPruning( PrefixVMSP prefix, Bitmap prefixBitmap, List<Integer> sn, List<Integer> in, int hasToBeGreaterThanForIStep, int m, Integer lastAppendedItem) throws IOException { boolean atLeastOneFrequentExtension = false; // ====== S-STEPS ====== // Temporary variables (as described in the paper) List<Integer> sTemp = new ArrayList<Integer>(); List<Bitmap> sTempBitmaps = new ArrayList<Bitmap>(); // for CMAP pruning, we will only check against the last appended item Map<Integer, Integer> mapSupportItemsAfter = coocMapAfter.get(lastAppendedItem); // for each item in sn loopi: for (Integer i : sn) { // LAST POSITION PRUNING (disabled) /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) { continue loopi; }*/ // CMAP PRUNING // we only check with the last appended item if (useCMAPPruning) { if (mapSupportItemsAfter == null) { continue loopi; } Integer support = mapSupportItemsAfter.get(i); if (support == null || support < minsup) { continue loopi; } } // perform the S-STEP with that item to get a new bitmap Bitmap.INTERSECTION_COUNT++; Bitmap newBitmap = prefixBitmap.createNewBitmapSStep(verticalDB.get(i), sequencesSize, lastBitIndex, maxGap); // if the support is no less than minsup if (newBitmap.getSupportWithoutGapTotal() >= minsup) { // record that item and pattern in temporary variables sTemp.add(i); sTempBitmaps.add(newBitmap); } } // for each pattern recorded for the s-step for (int k = 0; k < sTemp.size(); k++) { // at least one frequent extension has been found atLeastOneFrequentExtension = true; int item = sTemp.get(k); // create the new prefix PrefixVMSP prefixSStep = prefix.cloneSequence(); prefixSStep.addItemset(new Itemset(item)); if (item % 2 == 0) { prefixSStep.sumOfEvenItems = item + prefix.sumOfEvenItems; prefixSStep.sumOfOddItems = prefix.sumOfOddItems; } else { prefixSStep.sumOfEvenItems = prefix.sumOfEvenItems; prefixSStep.sumOfOddItems = item + prefix.sumOfOddItems; } // get the new bitmap Bitmap newBitmap = sTempBitmaps.get(k); // if the pattern is frequent if (newBitmap.getSupport() >= minsup) { boolean hasFrequentExtension = false; // recursively try to extend that pattern if (maximumPatternLength > m) { hasFrequentExtension = dfsPruning(prefixSStep, newBitmap, sTemp, sTemp, item, m + 1, item); } // if no frequent extension was found, the pattern may be maximal, so we save it if (hasFrequentExtension == false) { savePatternMultipleItems(prefixSStep, newBitmap, m); } } } Map<Integer, Integer> mapSupportItemsEquals = coocMapEquals.get(lastAppendedItem); // ======== I-STEPS ======== // Temporary variables List<Integer> iTemp = new ArrayList<Integer>(); List<Bitmap> iTempBitmaps = new ArrayList<Bitmap>(); // for each item in in loop2: for (Integer i : in) { // the item has to be greater than the largest item // already in the last itemset of the prefix
if (i > hasToBeGreaterThanForIStep) { // LAST POSITION PRUNING (disabled) /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) { continue loop2; }*/ // CMAP PRUNING if (useCMAPPruning) { if (mapSupportItemsEquals == null) { continue loop2; } Integer support = mapSupportItemsEquals.get(i); if (support == null || support < minsup) { continue loop2; } } // Perform an i-step with this item and the current prefix. // This creates a new bitmap Bitmap.INTERSECTION_COUNT++; Bitmap newBitmap = prefixBitmap.createNewBitmapIStep(verticalDB.get(i), sequencesSize, lastBitIndex); // If the support is no less than minsup if (newBitmap.getSupport() >= minsup) { // record that item and pattern in temporary variables iTemp.add(i); iTempBitmaps.add(newBitmap); } } } // for each pattern recorded for the i-step for (int k = 0; k < iTemp.size(); k++) { // at least one frequent extension has been found atLeastOneFrequentExtension = true; int item = iTemp.get(k); // create the new prefix PrefixVMSP prefixIStep = prefix.cloneSequence(); prefixIStep.getItemsets().get(prefixIStep.size() - 1).addItem(item); if (item % 2 == 0) { prefixIStep.sumOfEvenItems = item + prefix.sumOfEvenItems; prefixIStep.sumOfOddItems = prefix.sumOfOddItems; } else { prefixIStep.sumOfEvenItems = prefix.sumOfEvenItems; prefixIStep.sumOfOddItems = item + prefix.sumOfOddItems; } // get the new bitmap Bitmap newBitmap = iTempBitmaps.get(k); // recursively try to extend that pattern boolean hasFrequentExtension = false; if (maximumPatternLength > m) { hasFrequentExtension = dfsPruning(prefixIStep, newBitmap, sTemp, iTemp, item, m + 1, item); } // if no frequent extension was found, the pattern may be maximal, so we save it if (hasFrequentExtension == false) { savePatternMultipleItems(prefixIStep, newBitmap, m); } } // check the memory usage MemoryLogger.getInstance().checkMemory(); return atLeastOneFrequentExtension || useStrategyForwardExtensionChecking == false; }
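/*
 * Conceptual sketch of an s-step on a single sequence (illustrative helper only; the real
 * Bitmap class above works on a bitmap spanning all sequences and also supports a maximum
 * gap). After the first itemset position matched by the prefix, the appended item may occur
 * in any strictly later itemset.
 */
static java.util.BitSet sStepOneSequence(java.util.BitSet prefixPositions, java.util.BitSet itemPositions, int itemsetCount) {
  java.util.BitSet result = new java.util.BitSet(itemsetCount);
  int first = prefixPositions.nextSetBit(0); // first itemset where the prefix is matched
  if (first >= 0 && first + 1 < itemsetCount) {
    result.set(first + 1, itemsetCount); // positions strictly after the prefix match
    result.and(itemPositions); // keep only the positions that contain the item
  }
  return result;
}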
/** * Run the algorithm * * @param input the input file path * @param output the output file path * @param minUtility the minimum utility threshold * @throws IOException exception if error while writing the file */ public void runAlgorithm(String input, String output, int minUtility) throws IOException { // reset the tool that records the maximum memory usage MemoryLogger.getInstance().reset(); // initialize the buffer for storing the current itemset itemsetBuffer = new int[BUFFERS_SIZE]; mapFMAP = new HashMap<Integer, Map<Integer, Long>>(); startTimestamp = System.currentTimeMillis(); writer = new BufferedWriter(new FileWriter(output)); // We create a map to store the TWU of each item mapItemToTWU = new HashMap<Integer, Long>(); // We scan the database a first time to calculate the TWU of each item. BufferedReader myInput = null; String thisLine; try { // prepare the object for reading the file myInput = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); // for each line (transaction) until the end of file while ((thisLine = myInput.readLine()) != null) { // if the line is a comment, is empty or is some kind of metadata, skip it if (thisLine.isEmpty() == true || thisLine.charAt(0) == '#' || thisLine.charAt(0) == '%' || thisLine.charAt(0) == '@') { continue; } // split the transaction according to the ":" separator String split[] = thisLine.split(":"); // the first part is the list of items String items[] = split[0].split(" "); // the second part is the transaction utility int transactionUtility = Integer.parseInt(split[1]); // for each item, we add the transaction utility to its TWU for (int i = 0; i < items.length; i++) { // convert the item to an integer Integer item = Integer.parseInt(items[i]); // get the current TWU of that item Long twu = mapItemToTWU.get(item); // add the utility of the current transaction to the item's TWU twu = (twu == null) ? transactionUtility : twu + transactionUtility; mapItemToTWU.put(item, twu); } } } catch (Exception e) { // catches exception if error while reading the input file e.printStackTrace(); } finally { if (myInput != null) { myInput.close(); } } // CREATE A LIST TO STORE THE UTILITY LISTS OF ITEMS WITH TWU >= MIN_UTILITY. List<UtilityList> listOfUtilityLists = new ArrayList<UtilityList>(); // CREATE A MAP TO STORE THE UTILITY LIST FOR EACH ITEM. // Key : item Value : utility list associated to that item Map<Integer, UtilityList> mapItemToUtilityList = new HashMap<Integer, UtilityList>(); // For each item for (Integer item : mapItemToTWU.keySet()) { // if the item is promising (TWU >= minUtility) if (mapItemToTWU.get(item) >= minUtility) { // create an empty Utility List that we will fill later
UtilityList uList = new UtilityList(item); mapItemToUtilityList.put(item, uList); // add the item to the list of promising (high-TWU) items listOfUtilityLists.add(uList); } } // SORT THE LIST OF PROMISING ITEMS BY ASCENDING ORDER OF TWU Collections.sort( listOfUtilityLists, new Comparator<UtilityList>() { public int compare(UtilityList o1, UtilityList o2) { // compare the TWU of the items return compareItems(o1.item, o2.item); } }); // SECOND DATABASE PASS TO CONSTRUCT THE UTILITY LISTS // OF 1-ITEMSETS HAVING TWU >= minUtility (promising items) try { // prepare the object for reading the file myInput = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input)))); // variable to count the number of transactions int tid = 0; // for each line (transaction) until the end of file while ((thisLine = myInput.readLine()) != null) { // if the line is a comment, is empty or is some kind of metadata, skip it if (thisLine.isEmpty() == true || thisLine.charAt(0) == '#' || thisLine.charAt(0) == '%' || thisLine.charAt(0) == '@') { continue; } // split the line according to the ":" separator String split[] = thisLine.split(":"); // get the list of items String items[] = split[0].split(" "); // get the list of utility values corresponding to each item // for that transaction String utilityValues[] = split[2].split(" "); // Copy the transaction into lists but // without items with TWU < minUtility int remainingUtility = 0; long newTWU = 0; // NEW OPTIMIZATION // Create a list to store the items List<Pair> revisedTransaction = new ArrayList<Pair>(); // for each item for (int i = 0; i < items.length; i++) { // convert the values to integers Pair pair = new Pair(); pair.item = Integer.parseInt(items[i]); pair.utility = Integer.parseInt(utilityValues[i]); // if the item is promising (TWU >= minUtility) if (mapItemToTWU.get(pair.item) >= minUtility) { // add it revisedTransaction.add(pair); remainingUtility += pair.utility; newTWU += pair.utility; // NEW OPTIMIZATION } } // sort the transaction Collections.sort( revisedTransaction, new Comparator<Pair>() { public int compare(Pair o1, Pair o2) { return compareItems(o1.item, o2.item); } }); // for each item left in the transaction for (int i = 0; i < revisedTransaction.size(); i++) { Pair pair = revisedTransaction.get(i); // subtract the utility of this item from the remaining utility remainingUtility = remainingUtility - pair.utility; // get the utility list of this item UtilityList utilityListOfItem = mapItemToUtilityList.get(pair.item); // Add a new Element to the utility list of this item corresponding to this transaction Element element = new Element(tid, pair.utility, remainingUtility); utilityListOfItem.addElement(element); // BEGIN NEW OPTIMIZATION for FHM: update the co-occurrence map (CMAP/EUCS) Map<Integer, Long> mapFMAPItem = mapFMAP.get(pair.item); if (mapFMAPItem == null) { mapFMAPItem = new HashMap<Integer, Long>(); mapFMAP.put(pair.item, mapFMAPItem); } for (int j = i + 1; j < revisedTransaction.size(); j++) { Pair pairAfter = revisedTransaction.get(j); Long twuSum = mapFMAPItem.get(pairAfter.item); if (twuSum == null) { mapFMAPItem.put(pairAfter.item, newTWU); } else { mapFMAPItem.put(pairAfter.item, twuSum + newTWU); } } // END OPTIMIZATION of FHM } tid++; // increase the tid number for the next transaction } } catch (Exception e) { // to catch errors while reading the input file e.printStackTrace(); } finally { if (myInput != null) { myInput.close(); } } // check the memory usage MemoryLogger.getInstance().checkMemory(); // Mine the database recursively fhm(itemsetBuffer, 0, null,
listOfUtilityLists, minUtility); // check the memory usage one last time MemoryLogger.getInstance().checkMemory(); // close the output file writer.close(); // record the end time endTimestamp = System.currentTimeMillis(); }
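/**
 * Usage sketch (illustrative only): running FHM on a transaction database in SPMF utility
 * format ("items : transaction-utility : item-utilities"). The class name AlgoFHM and the
 * file names are assumptions; the signature is taken from the code above.
 */
public static void exampleFHM() throws IOException {
  AlgoFHM algo = new AlgoFHM(); // hypothetical class name
  algo.runAlgorithm("DB_Utility.txt", "output.txt", 30); // minimum utility = 30
  algo.printStats();
}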
/** * This is the main method for the BIDE+ algorithm. * * @param database a sequence database * @throws IOException exception if some error occurs while writing the output file. */ private void bide(SequenceDatabase database, String outputFilePath) throws IOException { // if the user wants to keep the result in memory if (outputFilePath == null) { writer = null; patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS"); } else { // if the user wants to save the result to a file patterns = null; writer = new BufferedWriter(new FileWriter(outputFilePath)); } // The algorithm first scans the database to find all frequent items // and notes the sequences in which these items appear. // This is stored in a map: Key: item Value: IDs of the sequences containing the item Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database); // WE CONVERT THE DATABASE TO A PSEUDO-DATABASE, AND REMOVE // THE ITEMS OF SIZE 1 THAT ARE NOT FREQUENT, SO THAT THE ALGORITHM // WILL NOT CONSIDER THEM ANYMORE. // we create the initial database initialDatabase = new ArrayList<PseudoSequenceBIDE>(); // for each sequence of the original database for (Sequence sequence : database.getSequences()) { // we make a copy of the sequence while removing infrequent items Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute); if (optimizedSequence.size() != 0) { // if this sequence has size > 0, we add it to the new database initialDatabase.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0)); } } // For each frequent item for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) { // if the item is frequent if (entry.getValue().size() >= minsuppAbsolute) { // build the projected database with this item Integer item = entry.getKey(); List<PseudoSequenceBIDE> projectedContext = buildProjectedContextSingleItem(item, initialDatabase, false, entry.getValue()); // Create the prefix with this item SequentialPattern prefix = new SequentialPattern(); prefix.addItemset(new Itemset(item)); // set the sequence IDs of this prefix prefix.setSequenceIDs(entry.getValue()); // variable to store the largest support of patterns // that will be found starting with this prefix if (projectedContext.size() >= minsuppAbsolute) { int successorSupport = 0; if (!checkBackScanPruning(prefix, entry.getValue())) { // recursively explore extensions of this prefix successorSupport = recursion(prefix, projectedContext); } // Finally, because this prefix has support >= minsup // and passed the backscan
// pruning, we check whether it has a successor with support >= minsup // (a forward extension) // IF there is no forward extension if (successorSupport != entry.getValue().size()) { // ######### MODIFICATION #### // IF there is also no backward extension if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } else { if (!checkBackwardExtension(prefix, entry.getValue())) { // the pattern is closed and we save it savePattern(prefix); } } } } // check the memory usage for statistics MemoryLogger.getInstance().checkMemory(); }
/** * The main recursive method of LAPIN. * * @param prefix the current prefix * @param prefixBorder a list of positions forming the prefix border * @param sn items that could be appended by s-extension * @param in items that could be appended by i-extension * @param hasToBeGreaterThanForIStep an item: i-extension items have to be greater than it * @param doNotPerformIExtensions if true, i-extensions are not tried (2-IE-sequences are explored separately) * @throws IOException if error while writing to file */ private void genPatterns( Prefix prefix, List<Position> prefixBorder, List<Integer> sn, List<Integer> in, int hasToBeGreaterThanForIStep, boolean doNotPerformIExtensions) throws IOException { // ====== S-STEPS ====== // Temporary variables (as described in the paper) List<Integer> sTemp = new ArrayList<Integer>(); List<Integer> sTempSupport = new ArrayList<Integer>(); // for each item in sn for (Integer item : sn) { // perform the S-STEP int support = calculateSupportSStep(item, prefixBorder); // if the support is no less than minsup if (support >= minsup) { // record that item and its support in temporary variables sTemp.add(item); sTempSupport.add(support); } } // for each pattern recorded for the s-step for (int k = 0; k < sTemp.size(); k++) { int item = sTemp.get(k); // create the new prefix Prefix prefixSStep = prefix.cloneSequence(); List<Integer> itemset = new ArrayList<Integer>(1); itemset.add(item); prefixSStep.itemsets.add(itemset); // save the pattern to the file savePattern(prefixSStep, sTempSupport.get(k)); // recalculate the border for the s-extension List<Position> newBorder = recalculateBorderForSExtension(prefixBorder, item); // recursively try to extend that pattern genPatterns(prefixSStep, newBorder, sTemp, sTemp, item, false); } if (doNotPerformIExtensions) { return; } // ======== I-STEPS ======== // Temporary variables List<Integer> iTemp = new ArrayList<Integer>(); List<List<Position>> iTempBorder = new ArrayList<List<Position>>(); // for each item in in, // the item has to be greater than the largest item // already in the last itemset of the prefix
int index = Collections.binarySearch(in, hasToBeGreaterThanForIStep); // binarySearch returns (-(insertion point) - 1) when the item is not in the list; // in that case, convert it to the insertion point so that the loop starts at the // first item that is larger if (index < 0) { index = -index - 1; } for (int i = index; i < in.size(); i++) { Integer item = in.get(i); List<Integer> lastItemset = prefix.itemsets.get(prefix.itemsets.size() - 1); boolean willAddSecondItem = lastItemset.size() == 1; // AN OPTIMIZATION // perform the I-STEP int support = estimateSupportIStep(item, prefixBorder); // if the estimated support is no less than minsup if (support >= minsup) { // recalculate the border // in this case, the method takes the prefix border as input List<Position> newBorder = recalculateBorderForIExtension( lastItemset, prefixBorder, hasToBeGreaterThanForIStep, item, willAddSecondItem); // record that item and its border in temporary variables if (newBorder.size() >= minsup) { iTemp.add(item); iTempBorder.add(newBorder); } } } // for each pattern recorded for the i-step for (int k = 0; k < iTemp.size(); k++) { int item = iTemp.get(k); // create the new prefix Prefix prefixIStep = prefix.cloneSequence(); prefixIStep.itemsets.get(prefixIStep.size() - 1).add(item); // save the pattern List<Position> newBorder = iTempBorder.get(k); savePattern(prefixIStep, newBorder.size()); // recursively try to extend that pattern genPatterns(prefixIStep, newBorder, sTemp, iTemp, item, false); } // check the memory usage MemoryLogger.getInstance().checkMemory(); }