/** * This methods checks if a seq. pattern "pattern2" is strictly contained in a seq. pattern * "pattern1". * * @param pattern1 a sequential pattern * @param pattern2 another sequential pattern * @return true if the pattern1 contains pattern2. */ boolean strictlyContains(PrefixVMSP pattern1, PrefixVMSP pattern2) { // // if pattern2 is larger or equal in size, then it cannot be contained in pattern1 // if(pattern1.size() <= pattern2.size()){ // return false; // } // To see if pattern2 is strictly contained in pattern1, // we will search for each itemset i of pattern2 in pattern1 by advancing // in pattern 1 one itemset at a time. int i = 0; // position in pattern2 int j = 0; // position in pattern1 while (true) { // if the itemset at current position in pattern1 contains the itemset // at current position in pattern2 if (pattern1.get(j).containsAll(pattern2.get(i))) { // go to next itemset in pattern2 i++; // if we reached the end of pattern2, then return true if (i == pattern2.size()) { return true; } } // go to next itemset in pattern1 j++; // if we reached the end of pattern1, then pattern2 is not strictly included // in it, and return false if (j >= pattern1.size()) { return false; } // // lastly, for optimization, we check how many itemsets are left to be matched. // // if there is less itemsets left in pattern1 than in pattern2, then it will // // be impossible to get a total match, and so we return false. if ((pattern1.size() - j) < pattern2.size() - i) { return false; } } }
/** * This is the dfsPruning method as described in the SPAM paper. * * @param prefix the current prefix * @param prefixBitmap the bitmap corresponding to the current prefix * @param sn a list of items to be considered for i-steps * @param in a list of items to be considered for s-steps * @param hasToBeGreaterThanForIStep * @param m size of the current prefix in terms of items * @param lastAppendedItem the last appended item to the prefix * @throws IOException if there is an error writing a pattern to the output file * @return TRUE IF A FREQUENT PATTERN WAS CREATED USING THE PREFIX. */ boolean dfsPruning( PrefixVMSP prefix, Bitmap prefixBitmap, List<Integer> sn, List<Integer> in, int hasToBeGreaterThanForIStep, int m, Integer lastAppendedItem) throws IOException { boolean atLeastOneFrequentExtension = false; // System.out.println(prefix.toString()); // ====== S-STEPS ====== // Temporary variables (as described in the paper) List<Integer> sTemp = new ArrayList<Integer>(); List<Bitmap> sTempBitmaps = new ArrayList<Bitmap>(); // for CMAP pruning, we will only check against the last appended item Map<Integer, Integer> mapSupportItemsAfter = coocMapAfter.get(lastAppendedItem); // for each item in sn loopi: for (Integer i : sn) { // LAST POSITION PRUNING /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) { // System.out.println("TEST"); continue loopi; }*/ // CMAP PRUNING // we only check with the last appended item if (useCMAPPruning) { if (mapSupportItemsAfter == null) { continue loopi; } Integer support = mapSupportItemsAfter.get(i); if (support == null || support < minsup) { // System.out.println("PRUNE"); continue loopi; } } // perform the S-STEP with that item to get a new bitmap Bitmap.INTERSECTION_COUNT++; Bitmap newBitmap = prefixBitmap.createNewBitmapSStep(verticalDB.get(i), sequencesSize, lastBitIndex, maxGap); // if the support is higher than minsup if (newBitmap.getSupportWithoutGapTotal() >= minsup) { // record that item and pattern in temporary variables sTemp.add(i); sTempBitmaps.add(newBitmap); } } // for each pattern recorded for the s-step for (int k = 0; k < sTemp.size(); k++) { // STRATEGY: NEWWW atLeastOneFrequentExtension = true; int item = sTemp.get(k); // create the new prefix PrefixVMSP prefixSStep = prefix.cloneSequence(); prefixSStep.addItemset(new Itemset(item)); if (item % 2 == 0) { prefixSStep.sumOfEvenItems = item + prefix.sumOfEvenItems; prefixSStep.sumOfOddItems = prefix.sumOfOddItems; } else { prefixSStep.sumOfEvenItems = prefix.sumOfEvenItems; prefixSStep.sumOfOddItems = item + prefix.sumOfOddItems; } // prefixSStep.sumOfItems = item + prefix.sumOfItems; // create the new bitmap Bitmap newBitmap = sTempBitmaps.get(k); // save the pattern to the file if (newBitmap.getSupport() >= minsup) { boolean hasFrequentExtension = false; // recursively try to extend that pattern if (maximumPatternLength > m) { hasFrequentExtension = dfsPruning(prefixSStep, newBitmap, sTemp, sTemp, item, m + 1, item); } if (hasFrequentExtension == false) { savePatternMultipleItems(prefixSStep, newBitmap, m); } } } Map<Integer, Integer> mapSupportItemsEquals = coocMapEquals.get(lastAppendedItem); // ======== I STEPS ======= // Temporary variables List<Integer> iTemp = new ArrayList<Integer>(); List<Bitmap> iTempBitmaps = new ArrayList<Bitmap>(); // for each item in in loop2: for (Integer i : in) { // the item has to be greater than the largest item // already in the last itemset of prefix. if (i > hasToBeGreaterThanForIStep) { // LAST POSITION PRUNING /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) { continue loop2; }*/ // CMAP PRUNING if (useCMAPPruning) { if (mapSupportItemsEquals == null) { continue loop2; } Integer support = mapSupportItemsEquals.get(i); if (support == null || support < minsup) { continue loop2; } } // Perform an i-step with this item and the current prefix. // This creates a new bitmap Bitmap.INTERSECTION_COUNT++; Bitmap newBitmap = prefixBitmap.createNewBitmapIStep(verticalDB.get(i), sequencesSize, lastBitIndex); // If the support is no less than minsup if (newBitmap.getSupport() >= minsup) { // record that item and pattern in temporary variables iTemp.add(i); iTempBitmaps.add(newBitmap); } } } // for each pattern recorded for the i-step for (int k = 0; k < iTemp.size(); k++) { // STRATEGY: NEWWW atLeastOneFrequentExtension = true; int item = iTemp.get(k); // create the new prefix PrefixVMSP prefixIStep = prefix.cloneSequence(); prefixIStep.getItemsets().get(prefixIStep.size() - 1).addItem(item); if (item % 2 == 0) { prefixIStep.sumOfEvenItems = item + prefix.sumOfEvenItems; prefixIStep.sumOfOddItems = prefix.sumOfOddItems; } else { prefixIStep.sumOfEvenItems = prefix.sumOfEvenItems; prefixIStep.sumOfOddItems = item + prefix.sumOfOddItems; } // create the new bitmap Bitmap newBitmap = iTempBitmaps.get(k); // recursively try to extend that pattern boolean hasFrequentExtension = false; if (maximumPatternLength > m) { hasFrequentExtension = dfsPruning(prefixIStep, newBitmap, sTemp, iTemp, item, m + 1, item); } if (hasFrequentExtension == false) { // save the pattern savePatternMultipleItems(prefixIStep, newBitmap, m); } } // check the memory usage MemoryLogger.getInstance().checkMemory(); return atLeastOneFrequentExtension || useStrategyForwardExtensionChecking == false; }
/** * Save a pattern of size 1 to the output file * * @param item the item * @param bitmap its bitmap * @param itemIsEven * @throws IOException exception if error while writing to the file * @return true if is subsumed */ private boolean savePatternSingleItem(Integer item, Bitmap bitmap, boolean itemIsEven) throws IOException { // System.out.println("prefix :" + prefix); // FOR THE CASE OF SINGLE ITEM, WE DON'T NEED TO DO SUB-PATTERN CHECKING: // WE JUST NEED TO DO SUPER-PATTERN CHECKING // ################# // IMPORTANT STRATEGY : FROM LARGER TO SMALLER...... AND IN ASCENDING SUPPORT ORDER // ################## if (itemIsEven) { for (int i = maxPatterns.size() - 1; i > 1; i--) { for (PatternVMSP pPrime : maxPatterns.get(i)) { if (pPrime.prefix.sumOfOddItems + pPrime.prefix.sumOfEvenItems < item) { break; } // if the pattern already found contains the single item if (pPrime.prefix.sumOfEvenItems > item && bitmap.getSupport() >= pPrime.support) { if (pPrime.prefix.containsItem(item)) { return true; } } } } } else { for (int i = maxPatterns.size() - 1; i > 1; i--) { for (PatternVMSP pPrime : maxPatterns.get(i)) { if (pPrime.prefix.sumOfOddItems + pPrime.prefix.sumOfEvenItems < item) { break; } // if the pattern already found contains the single item if (pPrime.prefix.sumOfOddItems > item && bitmap.getSupport() >= pPrime.support) { if (pPrime.prefix.containsItem(item)) { return true; } } } } } // OTHERWISE THE NEW PATTERN IS NOT SUBSUMMED patternCount++; // INCREASE COUNT PrefixVMSP prefix = new PrefixVMSP(); prefix.addItemset(new Itemset(item)); if (itemIsEven) { prefix.sumOfEvenItems = item; prefix.sumOfOddItems = 0; } else { prefix.sumOfEvenItems = 0; prefix.sumOfOddItems = item; } PatternVMSP pattern = new PatternVMSP(prefix, bitmap.getSupport()); // If the user wants to see the sequence identifiers, we need to keep the bitmap which stores // them if (outputSequenceIdentifiers) { pattern.bitmap = bitmap; } // save the pattern // System.out.println(" ADD: " + item); maxPatterns.get(1).add(pattern); return false; // END CHANGED ------ }
/** * This is the main method for the VMSP algorithm * * @param an input file * @param minsupRel the minimum support as a relative value * @throws IOException */ private void vmsp(String input, double minsupRel) throws IOException { // create maxPattern array maxPatterns = new ArrayList<TreeSet<PatternVMSP>>(20); maxPatterns.add(null); maxPatterns.add(new TreeSet<PatternVMSP>()); // the structure to store the vertical database // key: an item value : bitmap verticalDB = new HashMap<Integer, Bitmap>(); // structure to store the horizontal database List<int[]> inMemoryDB = new ArrayList<int[]>(); // STEP 0: SCAN THE DATABASE TO STORE THE FIRST BIT POSITION OF EACH SEQUENCE // AND CALCULATE THE TOTAL NUMBER OF BIT FOR EACH BITMAP sequencesSize = new ArrayList<Integer>(); lastBitIndex = 0; // variable to record the last bit position that we will use in bitmaps try { // read the file FileInputStream fin = new FileInputStream(new File(input)); BufferedReader reader = new BufferedReader(new InputStreamReader(fin)); String thisLine; int bitIndex = 0; // for each line (sequence) in the file until the end while ((thisLine = reader.readLine()) != null) { // if the line is a comment, is empty or is a // kind of metadata if (thisLine.isEmpty() == true || thisLine.charAt(0) == '#' || thisLine.charAt(0) == '%' || thisLine.charAt(0) == '@') { continue; } // record the length of the current sequence (for optimizations) sequencesSize.add(bitIndex); // split the sequence according to spaces into tokens String tokens[] = thisLine.split(" "); int[] transactionArray = new int[tokens.length]; inMemoryDB.add(transactionArray); for (int i = 0; i < tokens.length; i++) { int item = Integer.parseInt(tokens[i]); transactionArray[i] = item; // if it is not an itemset separator if (item == -1) { // indicate the end of an itemset // increase the number of bits that we will need for each bitmap bitIndex++; } } } // record the last bit position for the bitmaps lastBitIndex = bitIndex - 1; reader.close(); // close the input file } catch (Exception e) { e.printStackTrace(); } // Calculate the absolute minimum support // by multipling the percentage with the number of // sequences in this database // minsup = 163; minsup = (int) Math.ceil((minsupRel * sequencesSize.size())); if (minsup == 0) { minsup = 1; } // System.out.println("minsup : " + minsup); // STEP1: SCAN THE DATABASE TO CREATE THE BITMAP VERTICAL DATABASE REPRESENTATION try { FileInputStream fin = new FileInputStream(new File(input)); BufferedReader reader = new BufferedReader(new InputStreamReader(fin)); String thisLine; int sid = 0; // to know which sequence we are scanning int tid = 0; // to know which itemset we are scanning // for each line (sequence) from the input file while ((thisLine = reader.readLine()) != null) { // split the sequence according to spaces into tokens for (String token : thisLine.split(" ")) { if (token.equals("-1")) { // indicate the end of an itemset tid++; } else if (token.equals("-2")) { // indicate the end of a sequence // determineSection(bitindex - previousBitIndex); // register the sequence length // for the bitmap sid++; tid = 0; } else { // indicate an item // Get the bitmap for this item. If none, create one. Integer item = Integer.parseInt(token); Bitmap bitmapItem = verticalDB.get(item); if (bitmapItem == null) { bitmapItem = new Bitmap(lastBitIndex); verticalDB.put(item, bitmapItem); } // Register the bit in the bitmap for this item bitmapItem.registerBit(sid, tid, sequencesSize); } } } reader.close(); } catch (Exception e) { e.printStackTrace(); } // STEP2: REMOVE INFREQUENT ITEMS FROM THE DATABASE BECAUSE THEY WILL NOT APPEAR IN ANY FREQUENT // SEQUENTIAL PATTERNS List<Integer> frequentItems = new ArrayList<Integer>(); Iterator<Entry<Integer, Bitmap>> iter = verticalDB.entrySet().iterator(); // we iterate over items from the vertical database that we have in memory while (iter.hasNext()) { // we get the bitmap for this item Entry<Integer, Bitmap> entry = (Entry<Integer, Bitmap>) iter.next(); // if the cardinality of this bitmap is lower than minsup if (entry.getValue().getSupport() < minsup) { // we remove this item from the database. iter.remove(); } else { // otherwise, we save this item as a frequent // sequential pattern of size 1 // CHANGED // and we add this item to a list of frequent items // that we will use later. frequentItems.add(entry.getKey()); // END CHANGED } } // SET 2.1 SORT ITEMS BY DESCENDING SUPPORT Collections.sort( frequentItems, new Comparator<Integer>() { @Override public int compare(Integer arg0, Integer arg1) { return verticalDB.get(arg0).getSupport() - verticalDB.get(arg1).getSupport(); } }); // STEP 3.1 CREATE CMAP coocMapEquals = new HashMap<Integer, Map<Integer, Integer>>(frequentItems.size()); coocMapAfter = new HashMap<Integer, Map<Integer, Integer>>(frequentItems.size()); if (useLastPositionPruning) { lastItemPositionMap = new HashMap<Integer, Short>(frequentItems.size()); } for (int[] transaction : inMemoryDB) { short itemsetCount = 0; Set<Integer> alreadyProcessed = new HashSet<Integer>(); Map<Integer, Set<Integer>> equalProcessed = new HashMap<>(); loopI: for (int i = 0; i < transaction.length; i++) { Integer itemI = transaction[i]; Set<Integer> equalSet = equalProcessed.get(itemI); if (equalSet == null) { equalSet = new HashSet<Integer>(); equalProcessed.put(itemI, equalSet); } if (itemI < 0) { itemsetCount++; continue; } // System.out.println(itemsetCount); // update lastItemMap if (useLastPositionPruning) { Short last = lastItemPositionMap.get(itemI); if (last == null || last < itemsetCount) { lastItemPositionMap.put(itemI, itemsetCount); } } Bitmap bitmapOfItem = verticalDB.get(itemI); if (bitmapOfItem == null || bitmapOfItem.getSupport() < minsup) { continue; } Set<Integer> alreadyProcessedB = new HashSet<Integer>(); // NEW boolean sameItemset = true; for (int j = i + 1; j < transaction.length; j++) { Integer itemJ = transaction[j]; if (itemJ < 0) { sameItemset = false; continue; } Bitmap bitmapOfitemJ = verticalDB.get(itemJ); if (bitmapOfitemJ == null || bitmapOfitemJ.getSupport() < minsup) { continue; } // if (itemI != itemJ){ Map<Integer, Integer> map = null; if (sameItemset) { if (!equalSet.contains(itemJ)) { map = coocMapEquals.get(itemI); if (map == null) { map = new HashMap<Integer, Integer>(); coocMapEquals.put(itemI, map); } Integer support = map.get(itemJ); if (support == null) { map.put(itemJ, 1); } else { map.put(itemJ, ++support); } equalSet.add(itemJ); } } else if (!alreadyProcessedB.contains(itemJ)) { if (alreadyProcessed.contains(itemI)) { continue loopI; } map = coocMapAfter.get(itemI); if (map == null) { map = new HashMap<Integer, Integer>(); coocMapAfter.put(itemI, map); } Integer support = map.get(itemJ); if (support == null) { map.put(itemJ, 1); } else { map.put(itemJ, ++support); } alreadyProcessedB.add(itemJ); // NEW } } alreadyProcessed.add(itemI); } } // STEP3: WE PERFORM THE RECURSIVE DEPTH FIRST SEARCH // to find longer sequential patterns recursively if (maximumPatternLength == 1) { return; } // for each frequent item for (Entry<Integer, Bitmap> entry : verticalDB.entrySet()) { // We create a prefix with that item PrefixVMSP prefix = new PrefixVMSP(); prefix.addItemset(new Itemset(entry.getKey())); boolean itemIsEven = entry.getKey() % 2 == 0; if (itemIsEven) { prefix.sumOfEvenItems = (Integer) entry.getKey(); prefix.sumOfOddItems = 0; } else { prefix.sumOfEvenItems = 0; prefix.sumOfOddItems = (Integer) entry.getKey(); } boolean hasExtension = dfsPruning( prefix, entry.getValue(), frequentItems, frequentItems, entry.getKey(), 2, entry.getKey()); if (hasExtension == false) { savePatternSingleItem(entry.getKey(), entry.getValue(), itemIsEven); } // We call the depth first search method with that prefix // and the list of frequent items to try to find // larger sequential patterns by appending some of these // items. // if(!isSubsumedAndNonClosed) { // } } }