Esempio n. 1
0
  /**
   * Recursive depth-first search step (the dfsPruning procedure of the SPAM paper): tries to
   * extend {@code prefix} by one item, first with s-steps (the item starts a new itemset) and then
   * with i-steps (the item joins the last itemset), and recurses on every extension that is kept.
   *
   * <p>An extension whose own recursive call reports no further extension is handed to {@code
   * savePatternMultipleItems}.
   *
   * @param prefix the current prefix
   * @param prefixBitmap the bitmap corresponding to the current prefix
   * @param sn the candidate items for s-steps
   * @param in the candidate items for i-steps
   * @param hasToBeGreaterThanForIStep only items strictly greater than this value are tried for
   *     i-steps
   * @param m the item count associated with the extensions built at this level; it is the value
   *     passed to {@code savePatternMultipleItems} and compared against {@code
   *     maximumPatternLength} (the top-level caller passes 2 for a one-item prefix)
   * @param lastAppendedItem the last item appended to the prefix; used as the key for the
   *     co-occurrence (CMAP) lookups
   * @throws IOException if there is an error writing a pattern to the output file
   * @return {@code true} if at least one s-step or i-step extension was retained, or if {@code
   *     useStrategyForwardExtensionChecking} is disabled
   */
  boolean dfsPruning(
      PrefixVMSP prefix,
      Bitmap prefixBitmap,
      List<Integer> sn,
      List<Integer> in,
      int hasToBeGreaterThanForIStep,
      int m,
      Integer lastAppendedItem)
      throws IOException {

    // ====== S-STEPS ======
    // Items (and their bitmaps) whose s-step extension passed the support test.
    List<Integer> sTemp = new ArrayList<Integer>();
    List<Bitmap> sTempBitmaps = new ArrayList<Bitmap>();

    // CMAP pruning only looks at items co-occurring after the last appended item.
    Map<Integer, Integer> followers = coocMapAfter.get(lastAppendedItem);

    // (Last-position pruning is not applied here; only CMAP pruning is.)
    for (Integer candidate : sn) {
      if (useCMAPPruning) {
        // A missing map or a missing entry both mean "no recorded co-occurrence": prune.
        Integer cooc = (followers == null) ? null : followers.get(candidate);
        if (cooc == null || cooc < minsup) {
          continue;
        }
      }

      // Build the bitmap of prefix followed by {candidate}.
      Bitmap.INTERSECTION_COUNT++;
      Bitmap extended =
          prefixBitmap.createNewBitmapSStep(
              verticalDB.get(candidate), sequencesSize, lastBitIndex, maxGap);
      if (extended.getSupportWithoutGapTotal() < minsup) {
        continue;
      }
      sTemp.add(candidate);
      sTempBitmaps.add(extended);
    }

    // Any retained s-step candidate counts as a frequent extension of the prefix.
    boolean atLeastOneFrequentExtension = !sTemp.isEmpty();

    for (int k = 0; k < sTemp.size(); k++) {
      int item = sTemp.get(k);
      Bitmap newBitmap = sTempBitmaps.get(k);

      // Copy the prefix and append the item as a new itemset.
      PrefixVMSP prefixSStep = prefix.cloneSequence();
      prefixSStep.addItemset(new Itemset(item));
      // Maintain the separate sums of odd and even items of the extended prefix.
      if (item % 2 != 0) {
        prefixSStep.sumOfOddItems = item + prefix.sumOfOddItems;
        prefixSStep.sumOfEvenItems = prefix.sumOfEvenItems;
      } else {
        prefixSStep.sumOfOddItems = prefix.sumOfOddItems;
        prefixSStep.sumOfEvenItems = item + prefix.sumOfEvenItems;
      }

      // The candidate was retained on getSupportWithoutGapTotal(); recursing and saving
      // additionally require getSupport() to reach minsup.
      if (newBitmap.getSupport() < minsup) {
        continue;
      }

      // Recurse only while the length limit allows; both candidate lists become sTemp.
      boolean extendedFurther =
          maximumPatternLength > m
              && dfsPruning(prefixSStep, newBitmap, sTemp, sTemp, item, m + 1, item);
      if (!extendedFurther) {
        savePatternMultipleItems(prefixSStep, newBitmap, m);
      }
    }

    // ====== I-STEPS ======
    // CMAP pruning for i-steps uses the "same itemset" co-occurrence map.
    Map<Integer, Integer> companions = coocMapEquals.get(lastAppendedItem);
    List<Integer> iTemp = new ArrayList<Integer>();
    List<Bitmap> iTempBitmaps = new ArrayList<Bitmap>();

    for (Integer candidate : in) {
      // Only items strictly greater than the given bound may join the last itemset.
      if (candidate <= hasToBeGreaterThanForIStep) {
        continue;
      }

      if (useCMAPPruning) {
        Integer cooc = (companions == null) ? null : companions.get(candidate);
        if (cooc == null || cooc < minsup) {
          continue;
        }
      }

      // Build the bitmap of the prefix with candidate added to its last itemset.
      Bitmap.INTERSECTION_COUNT++;
      Bitmap extended =
          prefixBitmap.createNewBitmapIStep(verticalDB.get(candidate), sequencesSize, lastBitIndex);
      if (extended.getSupport() < minsup) {
        continue;
      }
      iTemp.add(candidate);
      iTempBitmaps.add(extended);
    }

    if (!iTemp.isEmpty()) {
      atLeastOneFrequentExtension = true;
    }

    for (int k = 0; k < iTemp.size(); k++) {
      int item = iTemp.get(k);
      Bitmap newBitmap = iTempBitmaps.get(k);

      // Copy the prefix and add the item to its last itemset.
      PrefixVMSP prefixIStep = prefix.cloneSequence();
      prefixIStep.getItemsets().get(prefixIStep.size() - 1).addItem(item);
      if (item % 2 != 0) {
        prefixIStep.sumOfOddItems = item + prefix.sumOfOddItems;
        prefixIStep.sumOfEvenItems = prefix.sumOfEvenItems;
      } else {
        prefixIStep.sumOfOddItems = prefix.sumOfOddItems;
        prefixIStep.sumOfEvenItems = item + prefix.sumOfEvenItems;
      }

      // Recurse with the s-step survivors for s-steps and the i-step survivors for i-steps.
      boolean extendedFurther =
          maximumPatternLength > m
              && dfsPruning(prefixIStep, newBitmap, sTemp, iTemp, item, m + 1, item);
      if (!extendedFurther) {
        savePatternMultipleItems(prefixIStep, newBitmap, m);
      }
    }

    // check the memory usage
    MemoryLogger.getInstance().checkMemory();

    return atLeastOneFrequentExtension || !useStrategyForwardExtensionChecking;
  }
Esempio n. 2
0
  /**
   * Stores a single-item pattern in {@code maxPatterns.get(1)} unless a larger pattern that is
   * already stored contains the item.
   *
   * <p>Only super-pattern checking is needed for a single item. The stored patterns are scanned
   * from the highest index of {@code maxPatterns} down to index 2.
   *
   * @param item the item
   * @param bitmap its bitmap
   * @param itemIsEven {@code true} if {@code item} is even; selects which parity sum of a stored
   *     pattern is used as a cheap pre-filter before the containment test
   * @throws IOException exception if error while writing to the file
   * @return {@code true} if the item is subsumed by a stored pattern (nothing is saved), {@code
   *     false} if the pattern was added
   */
  private boolean savePatternSingleItem(Integer item, Bitmap bitmap, boolean itemIsEven)
      throws IOException {
    for (int size = maxPatterns.size() - 1; size > 1; size--) {
      for (PatternVMSP pPrime : maxPatterns.get(size)) {
        // A pattern whose total item sum is below the item cannot contain it. The break (rather
        // than continue) presumably relies on the set being ordered by that sum - confirm
        // against PatternVMSP's ordering.
        if (pPrime.prefix.sumOfOddItems + pPrime.prefix.sumOfEvenItems < item) {
          break;
        }
        // Pre-filter on the sum of items with the same parity as the item. The comparison must
        // be inclusive: when the item is the only item of its parity in pPrime, that sum equals
        // the item exactly, and a strict test would wrongly skip a genuine super-pattern.
        boolean sumAllowsItem =
            (itemIsEven ? pPrime.prefix.sumOfEvenItems : pPrime.prefix.sumOfOddItems) >= item;
        if (sumAllowsItem
            && bitmap.getSupport() >= pPrime.support
            && pPrime.prefix.containsItem(item)) {
          return true;
        }
      }
    }

    // Not subsumed: build the one-item prefix and record the pattern.
    patternCount++;
    PrefixVMSP prefix = new PrefixVMSP();
    prefix.addItemset(new Itemset(item));
    if (itemIsEven) {
      prefix.sumOfEvenItems = item;
      prefix.sumOfOddItems = 0;
    } else {
      prefix.sumOfEvenItems = 0;
      prefix.sumOfOddItems = item;
    }

    PatternVMSP pattern = new PatternVMSP(prefix, bitmap.getSupport());

    // If the user wants to see the sequence identifiers, keep the bitmap which stores them.
    if (outputSequenceIdentifiers) {
      pattern.bitmap = bitmap;
    }
    maxPatterns.get(1).add(pattern);

    return false;
  }
Esempio n. 3
0
  /**
   * Main method of the VMSP algorithm: loads the sequence database, builds the vertical (bitmap)
   * representation and the co-occurrence maps, then launches the depth-first search from every
   * frequent item.
   *
   * <p>The input file is read twice. Empty lines and lines starting with {@code #}, {@code %} or
   * {@code @} are ignored in both scans. In the remaining lines, tokens are separated by single
   * spaces; {@code -1} ends an itemset and {@code -2} ends a sequence.
   *
   * <p>Errors raised while reading or parsing the file are caught and printed to the standard
   * error stream; the method then continues with whatever was loaded before the error.
   *
   * @param input path of the input file
   * @param minsupRel the minimum support as a relative value; it is multiplied by the number of
   *     sequences and rounded up (with a floor of 1) to obtain the absolute threshold
   * @throws IOException if thrown while searching for or saving patterns
   */
  private void vmsp(String input, double minsupRel) throws IOException {
    // Index 0 is a null placeholder; index 1 receives the single-item patterns.
    maxPatterns = new ArrayList<TreeSet<PatternVMSP>>(20);
    maxPatterns.add(null);
    maxPatterns.add(new TreeSet<PatternVMSP>());

    // the structure to store the vertical database
    // key: an item    value : bitmap
    verticalDB = new HashMap<Integer, Bitmap>();

    // structure to store the horizontal database (one int[] of raw tokens per sequence)
    List<int[]> inMemoryDB = new ArrayList<int[]>();

    // STEP 0: SCAN THE DATABASE TO STORE THE FIRST BIT POSITION OF EACH SEQUENCE
    // AND CALCULATE THE TOTAL NUMBER OF BIT FOR EACH BITMAP
    sequencesSize = new ArrayList<Integer>();
    lastBitIndex = 0; // variable to record the last bit position that we will use in bitmaps
    // try-with-resources guarantees the file is closed even when parsing fails mid-way
    try (BufferedReader reader =
        new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))))) {
      String thisLine;
      int bitIndex = 0;
      // for each line (sequence) in the file until the end
      while ((thisLine = reader.readLine()) != null) {
        // skip empty lines, comments and metadata
        if (thisLine.isEmpty()
            || thisLine.charAt(0) == '#'
            || thisLine.charAt(0) == '%'
            || thisLine.charAt(0) == '@') {
          continue;
        }

        // record the bit position at which the current sequence starts
        sequencesSize.add(bitIndex);

        // split the sequence according to spaces into tokens
        String[] tokens = thisLine.split(" ");
        int[] transactionArray = new int[tokens.length];
        inMemoryDB.add(transactionArray);

        for (int i = 0; i < tokens.length; i++) {
          int item = Integer.parseInt(tokens[i]);
          transactionArray[i] = item;
          if (item == -1) { // end of an itemset: one more bit is needed in each bitmap
            bitIndex++;
          }
        }
      }
      // record the last bit position for the bitmaps
      lastBitIndex = bitIndex - 1;
    } catch (Exception e) {
      // NOTE(review): best-effort behaviour kept from the original; the run continues on the
      // partial data read so far.
      e.printStackTrace();
    }

    // Calculate the absolute minimum support by multiplying the relative value
    // with the number of sequences in this database (never below 1)
    minsup = (int) Math.ceil((minsupRel * sequencesSize.size()));
    if (minsup == 0) {
      minsup = 1;
    }

    // STEP1: SCAN THE DATABASE TO CREATE THE BITMAP VERTICAL DATABASE REPRESENTATION
    try (BufferedReader reader =
        new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))))) {
      String thisLine;
      int sid = 0; // to know which sequence we are scanning
      int tid = 0; // to know which itemset we are scanning

      // for each line (sequence) from the input file
      while ((thisLine = reader.readLine()) != null) {
        // Skip exactly the lines STEP 0 skipped. Otherwise a comment or metadata line would
        // reach Integer.parseInt, abort this scan, and leave the vertical database incomplete.
        if (thisLine.isEmpty()
            || thisLine.charAt(0) == '#'
            || thisLine.charAt(0) == '%'
            || thisLine.charAt(0) == '@') {
          continue;
        }
        // split the sequence according to spaces into tokens
        for (String token : thisLine.split(" ")) {
          if (token.equals("-1")) { // indicate the end of an itemset
            tid++;
          } else if (token.equals("-2")) { // indicate the end of a sequence
            sid++;
            tid = 0;
          } else { // indicate an item
            // Get the bitmap for this item. If none, create one.
            Integer item = Integer.parseInt(token);
            Bitmap bitmapItem = verticalDB.get(item);
            if (bitmapItem == null) {
              bitmapItem = new Bitmap(lastBitIndex);
              verticalDB.put(item, bitmapItem);
            }
            // Register the bit in the bitmap for this item
            bitmapItem.registerBit(sid, tid, sequencesSize);
          }
        }
      }
    } catch (Exception e) {
      // NOTE(review): best-effort behaviour kept from the original (see STEP 0).
      e.printStackTrace();
    }

    // STEP2: REMOVE INFREQUENT ITEMS FROM THE DATABASE BECAUSE THEY WILL NOT APPEAR IN ANY FREQUENT
    // SEQUENTIAL PATTERNS
    List<Integer> frequentItems = new ArrayList<Integer>();
    Iterator<Entry<Integer, Bitmap>> iter = verticalDB.entrySet().iterator();
    while (iter.hasNext()) {
      Entry<Integer, Bitmap> entry = iter.next();
      if (entry.getValue().getSupport() < minsup) {
        // infrequent: drop the item from the vertical database
        iter.remove();
      } else {
        // frequent: remember the item as a candidate for later extensions
        frequentItems.add(entry.getKey());
      }
    }

    // STEP 2.1: SORT ITEMS BY SUPPORT. The comparator orders items from the lowest support to
    // the highest (ascending).
    Collections.sort(
        frequentItems,
        new Comparator<Integer>() {

          @Override
          public int compare(Integer arg0, Integer arg1) {
            return Integer.compare(
                verticalDB.get(arg0).getSupport(), verticalDB.get(arg1).getSupport());
          }
        });

    // STEP 3.1  CREATE CMAP
    // coocMapEquals: item -> (item seen later in the same itemset -> number of sequences)
    // coocMapAfter:  item -> (item seen in a later itemset -> number of sequences)
    coocMapEquals = new HashMap<Integer, Map<Integer, Integer>>(frequentItems.size());
    coocMapAfter = new HashMap<Integer, Map<Integer, Integer>>(frequentItems.size());

    if (useLastPositionPruning) {
      lastItemPositionMap = new HashMap<Integer, Short>(frequentItems.size());
    }
    for (int[] transaction : inMemoryDB) {
      short itemsetCount = 0;

      // items whose "after" co-occurrences were already counted for this sequence
      Set<Integer> alreadyProcessed = new HashSet<Integer>();
      // per item: the same-itemset partners already counted for this sequence
      Map<Integer, Set<Integer>> equalProcessed = new HashMap<>();
      loopI:
      for (int i = 0; i < transaction.length; i++) {
        Integer itemI = transaction[i];

        // negative tokens are separators: count the itemset and move on
        if (itemI < 0) {
          itemsetCount++;
          continue;
        }

        // update lastItemMap with the highest itemset position at which the item was seen
        if (useLastPositionPruning) {
          Short last = lastItemPositionMap.get(itemI);
          if (last == null || last < itemsetCount) {
            lastItemPositionMap.put(itemI, itemsetCount);
          }
        }

        // infrequent items were removed from verticalDB and take no part in the CMAPs
        Bitmap bitmapOfItem = verticalDB.get(itemI);
        if (bitmapOfItem == null || bitmapOfItem.getSupport() < minsup) {
          continue;
        }

        Set<Integer> equalSet = equalProcessed.get(itemI);
        if (equalSet == null) {
          equalSet = new HashSet<Integer>();
          equalProcessed.put(itemI, equalSet);
        }

        // "after" partners already counted for this particular occurrence of itemI
        Set<Integer> alreadyProcessedB = new HashSet<Integer>();

        boolean sameItemset = true;
        for (int j = i + 1; j < transaction.length; j++) {
          Integer itemJ = transaction[j];

          // once a separator is crossed, the following items belong to later itemsets
          if (itemJ < 0) {
            sameItemset = false;
            continue;
          }

          Bitmap bitmapOfitemJ = verticalDB.get(itemJ);
          if (bitmapOfitemJ == null || bitmapOfitemJ.getSupport() < minsup) {
            continue;
          }

          Map<Integer, Integer> map = null;
          if (sameItemset) {
            // count each (itemI, itemJ) same-itemset pair at most once per sequence
            if (!equalSet.contains(itemJ)) {
              map = coocMapEquals.get(itemI);
              if (map == null) {
                map = new HashMap<Integer, Integer>();
                coocMapEquals.put(itemI, map);
              }
              Integer support = map.get(itemJ);
              if (support == null) {
                map.put(itemJ, 1);
              } else {
                map.put(itemJ, support + 1);
              }
              equalSet.add(itemJ);
            }
          } else if (!alreadyProcessedB.contains(itemJ)) {
            // an earlier occurrence of itemI already counted everything that follows it
            if (alreadyProcessed.contains(itemI)) {
              continue loopI;
            }
            map = coocMapAfter.get(itemI);
            if (map == null) {
              map = new HashMap<Integer, Integer>();
              coocMapAfter.put(itemI, map);
            }
            Integer support = map.get(itemJ);
            if (support == null) {
              map.put(itemJ, 1);
            } else {
              map.put(itemJ, support + 1);
            }
            alreadyProcessedB.add(itemJ);
          }
        }
        alreadyProcessed.add(itemI);
      }
    }

    // STEP3: WE PERFORM THE RECURSIVE DEPTH FIRST SEARCH
    // to find longer sequential patterns recursively

    // NOTE(review): with a maximum length of 1 the method returns before this method stores any
    // pattern - confirm this is intended.
    if (maximumPatternLength == 1) {
      return;
    }
    // for each frequent item
    for (Entry<Integer, Bitmap> entry : verticalDB.entrySet()) {
      Integer item = entry.getKey();

      // We create a prefix with that item
      PrefixVMSP prefix = new PrefixVMSP();
      prefix.addItemset(new Itemset(item));
      boolean itemIsEven = item % 2 == 0;
      if (itemIsEven) {
        prefix.sumOfEvenItems = item;
        prefix.sumOfOddItems = 0;
      } else {
        prefix.sumOfEvenItems = 0;
        prefix.sumOfOddItems = item;
      }

      // Depth-first search from that prefix, trying to append any of the frequent items.
      boolean hasExtension =
          dfsPruning(prefix, entry.getValue(), frequentItems, frequentItems, item, 2, item);

      // The single item is a candidate pattern only if the search reported no extension.
      if (!hasExtension) {
        savePatternSingleItem(item, entry.getValue(), itemIsEven);
      }
    }
  }