Ejemplo n.º 1
0
  /**
   * Entry point that runs the SPAM algorithm on a database and records the execution time.
   *
   * <p>A saver is selected from {@code outputFilePath}, the absolute minimum support is derived
   * from {@code minSupRelative} and the database size (never below 1), and the search itself is
   * delegated to {@code runSPAM}. The saver is always finished, even when the search fails.
   *
   * @param database Original database in which we want to search for the frequent patterns.
   * @param keepPatterns Flag indicating if we want to keep the output or not.
   * @param verbose Flag for debugging purposes.
   * @param outputFilePath Path of the file in which we want to store the frequent patterns. If this
   *     value is null, an in-memory saver is used instead of a file saver.
   * @throws IOException declared for the saver operations (creating or finishing the saver)
   */
  public void runAlgorithm(
      SequenceDatabase database, boolean keepPatterns, boolean verbose, String outputFilePath)
      throws IOException {
    // No file path means the results are kept in memory; otherwise they go to the given file.
    if (outputFilePath == null) {
      saver = new SaverIntoMemory();
    } else {
      saver = new SaverIntoFile(outputFilePath);
    }

    // Convert the relative support into a number of sequences, with a floor of 1.
    this.minSupAbsolute = (int) Math.ceil(minSupRelative * database.size());
    if (this.minSupAbsolute == 0) { // protection
      this.minSupAbsolute = 1;
    }
    // reset the stats about memory usage
    MemoryLogger.getInstance().reset();
    // keeping the starting time
    start = System.currentTimeMillis();
    try {
      // We run SPAM algorithm
      runSPAM(database, (long) minSupAbsolute, keepPatterns, verbose);
      // keeping the ending time
      end = System.currentTimeMillis();
    } finally {
      // Finish the saver even if the search threw, so a file-backed saver is not left open.
      saver.finish();
    }
  }
 /**
  * Collects every (postfix flag, item) pair occurring in a projected sequence database, together
  * with the IDs of the sequences in which each pair occurs.
  *
  * <p>No support threshold is applied here; callers filter the returned pairs themselves.
  *
  * @param prefix the current prefix (not read by this method)
  * @param sequences the set of sequences to scan
  * @return the distinct pairs found, each carrying the sequence IDs recorded for it
  */
 protected Set<Pair> findAllFrequentPairs(
     SequentialPattern prefix, List<PseudoSequence> sequences) {
   // Each pair is mapped to itself so that the first instance created can be looked up and reused.
   Map<Pair, Pair> registry = new HashMap<Pair, Pair>();
   for (PseudoSequence current : sequences) {
     for (int itemsetIdx = 0; itemsetIdx < current.size(); itemsetIdx++) {
       for (int itemIdx = 0; itemIdx < current.getSizeOfItemsetAt(itemsetIdx); itemIdx++) {
         String item = current.getItemAtInItemsetAt(itemIdx, itemsetIdx);
         Pair candidate = new Pair(current.isPostfix(itemsetIdx), item);
         // Reuse the registered instance when an equal pair was already seen.
         Pair registered = registry.get(candidate);
         if (registered == null) {
           registry.put(candidate, candidate);
           registered = candidate;
         }
         // Record that this sequence contains the pair.
         registered.getSequencesID().add(current.getId());
       }
     }
   }
   // check the memory for statistics
   MemoryLogger.getInstance().checkMemory();
   return registry.keySet();
 }
Ejemplo n.º 3
0
  /**
   * Print the statistics of the algorithm execution to System.out.
   *
   * <p>The report lists the elapsed time ({@code endTime - startTime}), the pattern count, the
   * maximum memory reported by {@code MemoryLogger}, the {@code minsup} value and the bitmap
   * intersection counter.
   */
  public void printStatistics() {
    StringBuilder r = new StringBuilder(200);
    r.append("=============  Algorithm VMSP - STATISTICS =============\n Total time ~ ");
    r.append(endTime - startTime);
    r.append(" ms\n");
    r.append(" Frequent sequences count : ").append(patternCount);
    r.append('\n');
    r.append(" Max memory (mb) : ");
    // Only the memory figure belongs on this line; appending the pattern count right after it
    // would fuse the two numbers into one unreadable value.
    r.append(MemoryLogger.getInstance().getMaxMemory());
    r.append('\n');
    r.append("minsup ").append(minsup);
    r.append('\n');
    r.append("Intersection count ").append(Bitmap.INTERSECTION_COUNT).append(" \n");
    r.append("===================================================\n");
    System.out.println(r.toString());
  }
Ejemplo n.º 4
0
  /**
   * Run the algorithm: reset the statistics, store the parameters, allocate the internal
   * structures, scan the database and start the rule generation, timing the whole run.
   *
   * @param k the value of k.
   * @param minConfidence the minimum confidence threshold.
   * @param database the database.
   */
  public void runAlgorithm(int k, double minConfidence, Database database) {
    // Statistics from any previous run are discarded.
    MemoryLogger.getInstance().reset();
    maxCandidateCount = 0;

    // Remember the parameters of this run.
    this.k = k;
    this.minConfidence = minConfidence;
    this.database = database;

    // Internal state: the support threshold starts at 1, and the per-item tables are sized so
    // that they can be indexed by any item id up to database.maxItem.
    this.minsuppRelative = 1;
    final int itemSlots = database.maxItem + 1;
    tableItemTids = new BitSet[itemSlots];
    tableItemCount = new int[itemSlots];
    kRules = new PriorityQueue<RuleG>();
    candidates = new RedBlackTree<RuleG>();

    timeStart = System.currentTimeMillis();

    // First database scan, then the rule generation itself.
    scanDatabase(database);
    start();

    timeEnd = System.currentTimeMillis();
  }
  /**
   * Recursively grows a sequential pattern: every pair of the projected database whose count
   * reaches {@code minsuppAbsolute} extends the prefix, is saved, and is explored in turn.
   *
   * @param prefix the current sequential pattern that we want to try to grow
   * @param database the current projected sequence database
   * @throws IOException exception if there is an error writing to the output file
   */
  private void recursion(SequentialPattern prefix, List<PseudoSequence> database)
      throws IOException {
    // A pair is an item plus a flag telling whether it occurs in a cut itemset (a postfix),
    // along with the IDs of the sequences where it appears.
    for (Pair candidate : findAllFrequentPairs(prefix, database)) {
      // Skip items that are not frequent in the current projected database.
      if (candidate.getCount() < minsuppAbsolute) {
        continue;
      }
      final boolean inPostfix = candidate.isPostfix();
      // A postfix item joins the last itemset of the prefix; any other item opens a new itemset.
      SequentialPattern extended =
          inPostfix
              ? appendItemToPrefixOfSequence(prefix, candidate.getItem())
              : appendItemToSequence(prefix, candidate.getItem());
      // Project the database on this item before the pattern is saved.
      List<PseudoSequence> projection =
          buildProjectedContext(candidate.getItem(), database, inPostfix);

      extended.setSequencesID(candidate.getSequencesID());
      savePattern(extended);
      // Try to grow the extended pattern further.
      recursion(extended, projection);
    }
    MemoryLogger.getInstance().checkMemory();
  }
Ejemplo n.º 6
0
  /**
   * Method to run the algorithm: opens the output file, runs {@code vmsp}, writes the result and
   * closes the file, recording the start and end times of the mining step.
   *
   * @param input path to an input file
   * @param outputFilePath path for writing the output file
   * @param minsupRel the minimum support as a relative value
   * @return the {@code maxPatterns} structure as it stands after the run
   * @throws IOException exception if error while writing the file or reading
   */
  public List<TreeSet<PatternVMSP>> runAlgorithm(
      String input, String outputFilePath, double minsupRel) throws IOException {
    Bitmap.INTERSECTION_COUNT = 0;
    // create an object to write the file
    writer = new BufferedWriter(new FileWriter(outputFilePath));
    try {
      // initialize the number of patterns found
      patternCount = 0;
      // to log the memory used
      MemoryLogger.getInstance().reset();

      // record start time
      startTime = System.currentTimeMillis();
      // RUN THE ALGORITHM
      vmsp(input, minsupRel);
      // record end time
      endTime = System.currentTimeMillis();
      // save result to the file
      writeResultTofile(outputFilePath);
    } finally {
      // Close the file even when mining or writing fails, so the handle is not leaked.
      writer.close();
    }
    return maxPatterns;
  }
 /**
  * Print statistics about the last algorithm execution to System.out: the rule count, the elapsed
  * time and the maximum memory reported by {@code MemoryLogger}.
  */
 public void printStats() {
   // The report is assembled first and then emitted line by line.
   String[] report = {
     "=============  TRULEGROWTH - STATS =============",
     "Sequential rules count: " + ruleCount,
     "Total time : " + (timeEnd - timeStart) + " ms",
     "Max memory (mb)" + MemoryLogger.getInstance().getMaxMemory(),
     "====================================="
   };
   for (String line : report) {
     System.out.println(line);
   }
 }
Ejemplo n.º 8
0
 /**
  * Print statistics about the last algorithm execution: the current {@code minsuppRelative}
  * value, the number of rules held in {@code kRules}, the maximum memory and the elapsed time.
  */
 public void printStats() {
   // The report is assembled first and then emitted line by line.
   String[] report = {
     "=============  TOP-K RULES - STATS =============",
     "Minsup : " + minsuppRelative,
     "Rules count: " + kRules.size(),
     "Memory : " + MemoryLogger.getInstance().getMaxMemory() + " mb",
     "Total time : " + (timeEnd - timeStart) + " ms",
     "==================================================="
   };
   for (String line : report) {
     System.out.println(line);
   }
 }
Ejemplo n.º 9
0
 /**
  * Print statistics about the algorithm execution to System.out: transaction count, maximum
  * memory, frequent itemset count and elapsed time.
  */
 public void printStats() {
   final String memoryLine =
       " Max memory usage: " + MemoryLogger.getInstance().getMaxMemory() + " mb \n";

   System.out.println("=============  FP-GROWTH 0.96r14 - STATS =============");
   System.out.println(" Transactions count from database : " + transactionCount);
   // This line carries its own "\n", so it is written with print rather than println.
   System.out.print(memoryLine);
   System.out.println(" Frequent itemsets count : " + itemsetCount);
   System.out.println(" Total time ~ " + (endTime - startTimestamp) + " ms");
   System.out.println("===================================================");
 }
 /**
  * Print statistics about the algorithm execution to System.out: database size, frequent itemset
  * count, elapsed time and maximum memory.
  */
 public void printStats() {
   long elapsedMillis = endTime - startTimestamp;
   // The report is assembled first and then emitted line by line.
   String[] report = {
     "=============  DECLAT vALTERNATE-Bitset v0.96r4- STATS =============",
     " Transactions count from database : " + database.size(),
     " Frequent itemsets count : " + itemsetCount,
     " Total time ~ " + elapsedMillis + " ms",
     " Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb",
     "==================================================="
   };
   for (String line : report) {
     System.out.println(line);
   }
 }
Ejemplo n.º 11
0
 /**
  * Print statistics about the algorithm execution to System.out: candidate count, the size at
  * which the search stopped ({@code k - 1}), frequent itemset count, maximum memory and elapsed
  * time.
  */
 public void printStats() {
   // The report is assembled first and then emitted line by line.
   String[] report = {
     "=============  APRIORI - STATS =============",
     " Candidates count : " + totalCandidateCount,
     " The algorithm stopped at size " + (k - 1) + ", because there is no candidate",
     " Frequent itemsets count : " + itemsetCount,
     " Maximum memory usage : " + MemoryLogger.getInstance().getMaxMemory() + " mb",
     " Total time ~ " + (endTimestamp - startTimestamp) + " ms",
     "==================================================="
   };
   for (String line : report) {
     System.out.println(line);
   }
 }
Ejemplo n.º 12
0
  /**
   * Register a given rule in the set of candidates for future expansions and update the
   * candidate-count statistic.
   *
   * @param expandLR if true the rule will be considered for left/right expansions otherwise only
   *     right.
   * @param rule the given rule
   */
  private void registerAsCandidate(boolean expandLR, RuleG rule) {
    // Tag the rule with its expansion mode before it joins the candidates.
    rule.expandLR = expandLR;
    candidates.add(rule);

    // Keep the largest candidate-set size observed so far, for statistics.
    maxCandidateCount = Math.max(maxCandidateCount, candidates.size());

    // check the memory usage
    MemoryLogger.getInstance().checkMemory();
  }
Ejemplo n.º 13
0
  /**
   * This is the recursive method to find all high utility itemsets. It writes the itemsets to the
   * output file.
   *
   * <p>For each extension X in {@code ULs}: pX is written out when {@code X.sumIutils} reaches
   * {@code minUtility}, and the extensions of pX are explored when {@code X.sumIutils +
   * X.sumRutils} reaches {@code minUtility}.
   *
   * @param prefix This is the current prefix. Initially, it is empty.
   * @param prefixLength The current prefix length
   * @param pUL This is the Utility List of the prefix. Initially, it is empty.
   * @param ULs The utility lists corresponding to each extension of the prefix.
   * @param minUtility The minUtility threshold.
   * @throws IOException declared for the output performed through {@code writeOut}
   */
  private void fhm(
      int[] prefix, int prefixLength, UtilityList pUL, List<UtilityList> ULs, int minUtility)
      throws IOException {

    // For each extension X of prefix P
    for (int i = 0; i < ULs.size(); i++) {
      UtilityList X = ULs.get(i);

      // If pX is a high utility itemset, we save the itemset pX to file.
      if (X.sumIutils >= minUtility) {
        writeOut(prefix, prefixLength, X.item, X.sumIutils);
      }

      // If the sum of the remaining utilities for pX
      // is higher than minUtility, we explore extensions of pX.
      // (this is the pruning condition)
      if (X.sumIutils + X.sumRutils >= minUtility) {
        // This list will contain the utility lists of pX extensions.
        List<UtilityList> exULs = new ArrayList<UtilityList>();

        // The pair map of X does not depend on Y, so it is looked up once per X instead of
        // once per (X, Y) combination.
        Map<Integer, Long> mapTWUF = mapFMAP.get(X.item);

        // For each extension of p appearing
        // after X according to the ascending order
        for (int j = i + 1; j < ULs.size(); j++) {
          UtilityList Y = ULs.get(j);

          // ======================== NEW OPTIMIZATION USED IN FHM
          // Skip the pair {X, Y} when its recorded value is known and below minUtility.
          if (mapTWUF != null) {
            Long twuF = mapTWUF.get(Y.item);
            if (twuF != null && twuF < minUtility) {
              continue;
            }
          }
          candidateCount++;
          // =========================== END OF NEW OPTIMIZATION

          // we construct the extension pXY
          // and add it to the list of extensions of pX
          UtilityList temp = construct(pUL, X, Y, minUtility);
          if (temp != null) {
            exULs.add(temp);
          }
        }
        // We create new prefix pX
        itemsetBuffer[prefixLength] = X.item;
        // We make a recursive call to discover all itemsets with the prefix pXY
        fhm(itemsetBuffer, prefixLength + 1, X, exULs, minUtility);
      }
    }
    MemoryLogger.getInstance().checkMemory();
  }
Ejemplo n.º 14
0
 /**
  * Print statistics about the algorithm execution to System.out: elapsed time, pattern count and
  * maximum memory.
  *
  * @param size the size of the database (not read by this method)
  */
 public void printStatistics(int size) {
   String report =
       "=============  Algorithm BIDE2 - STATISTICS =============\n Total time ~ "
           + (endTime - startTime)
           + " ms\n"
           + " Closed sequential pattern count : "
           + patternCount
           + "\n"
           + " Max memory (mb):"
           + MemoryLogger.getInstance().getMaxMemory()
           + "\n"
           + "===================================================\n";
   System.out.println(report);
 }
Ejemplo n.º 15
0
 /**
  * Print statistics about the algorithm execution to System.out: elapsed time ({@code endTime -
  * startTime}), pattern count and the maximum memory reported by {@code MemoryLogger}.
  */
 public void printStatistics() {
   StringBuilder r = new StringBuilder(200);
   r.append("=============  LAPIN - STATISTICS =============\n Total time ~ ");
   r.append(endTime - startTime);
   r.append(" ms\n");
   r.append(" Frequent sequences count : ").append(patternCount);
   r.append('\n');
   r.append(" Max memory (mb) : ");
   // Only the memory figure belongs on this line; appending the pattern count right after it
   // would fuse the two numbers into one unreadable value.
   r.append(MemoryLogger.getInstance().getMaxMemory());
   r.append('\n');
   r.append("===================================================");
   System.out.println(r.toString());
 }
Ejemplo n.º 16
0
 /**
  * Builds the statistics report of the last execution: running time, number of frequent
  * patterns, maximum memory and whatever {@code saver.print()} returns.
  *
  * @return the report as a single string (nothing is written to System.out here)
  */
 public String printStatistics() {
   return "=============  Algorithm - STATISTICS =============\n Total time ~ "
       + getRunningTime()
       + " ms\n"
       + " Frequent sequences count : "
       + numberOfFrequentPatterns
       + "\n"
       + " Max memory (mb):"
       + MemoryLogger.getInstance().getMaxMemory()
       + "\n"
       + saver.print()
       + "\n===================================================\n";
 }
Ejemplo n.º 17
0
  /**
   * Main method to run the algorithm: opens the output file, runs {@code lapin} and closes the
   * file, recording the start and end times.
   *
   * @param input an input file path
   * @param outputFilePath an output file path
   * @param minsupRel the relative minimum support threshold, passed on to {@code lapin}
   * @throws IOException exception when writing result to a file
   */
  public void runAlgorithm(String input, String outputFilePath, double minsupRel)
      throws IOException {
    this.input = input;
    // prepare file writer for saving result to file
    writer = new BufferedWriter(new FileWriter(outputFilePath));
    try {
      patternCount = 0;
      // reset tool to calculate max. memory usage
      MemoryLogger.getInstance().reset();

      startTime = System.currentTimeMillis();

      // launch the algorithm!
      lapin(input, minsupRel);

      endTime = System.currentTimeMillis();
    } finally {
      // Close the file even when the algorithm fails, so the handle is not leaked.
      writer.close();
    }
  }
Ejemplo n.º 18
0
 /**
  * Print the statistics of the algorithm execution to System.out: elapsed time ({@code endTime -
  * startTime}), pattern count, maximum memory reported by {@code MemoryLogger}, the {@code
  * minsup} value and the bitmap intersection counter.
  */
 public void printStatistics() {
   StringBuilder r = new StringBuilder(200);
   r.append("=============  Algorithm - STATISTICS =============\n Total time ~ ");
   r.append(endTime - startTime);
   r.append(" ms\n");
   r.append(" Frequent sequences count : ").append(patternCount);
   r.append('\n');
   r.append(" Max memory (mb) : ");
   // Only the memory figure belongs on this line; appending the pattern count right after it
   // would fuse the two numbers into one unreadable value.
   r.append(MemoryLogger.getInstance().getMaxMemory());
   r.append('\n');
   r.append("minsup ").append(minsup);
   r.append('\n');
   r.append("Intersection count ").append(Bitmap.INTERSECTION_COUNT).append(" \n");
   r.append("===================================================\n");
   System.out.println(r.toString());
 }
Ejemplo n.º 19
0
 /**
  * Print statistics about the algorithm execution to System.out: elapsed time, pattern count and
  * maximum memory.
  *
  * @param size the size of the database (not read by this method)
  */
 public void printStatistics(int size) {
   String report =
       "=============  Algorithm MaxSP - STATISTICS =============\n Total time ~ "
           + (endTime - startTime)
           + " ms\n"
           + " Maximal sequential pattern count : "
           + patternCount
           + "\n"
           + " Max memory (mb):"
           + MemoryLogger.getInstance().getMaxMemory()
           + "\n"
           + "===================================================\n";
   System.out.println(report);
 }
  /**
   * Run the algorithm: reset the statistics, store the minimum support, run {@code prefixSpan}
   * and close the output writer if there is one.
   *
   * @param database : a sequence database
   * @param outputFilePath : the path of the output file to save the result or null if you want the
   *     result to be saved into memory
   * @param minsup : the minimum support as an integer
   * @return return the result, if saved into memory, otherwise null
   * @throws IOException exception if error while writing the file
   */
  public SequentialPatterns runAlgorithm(
      SequenceDatabase database, String outputFilePath, int minsup) throws IOException {
    // initialize variables for statistics
    patternCount = 0;
    MemoryLogger.getInstance().reset(); // to check the memory usage

    // keep the minimum support because we will need it
    this.minsuppAbsolute = minsup;
    // save the start time
    startTime = System.currentTimeMillis();
    try {
      // run the algorithm
      prefixSpan(database, outputFilePath);
      // save the end time
      endTime = System.currentTimeMillis();
    } finally {
      // Close the output file (if any) even when the algorithm fails, so it is not leaked.
      if (writer != null) {
        writer.close();
      }
    }
    return patterns;
  }
Ejemplo n.º 21
0
  /**
   * Method to run the algorithm: opens the output file, runs {@code spam} and closes the file,
   * recording the start and end times.
   *
   * @param input path to an input file
   * @param outputFilePath path for writing the output file
   * @param minsupRel the minimum support as a relative value
   * @param outputSequenceIdentifiers if true, sequence ids will be shown with each output pattern
   * @throws IOException exception if error while writing the file or reading
   */
  public void runAlgorithm(
      String input, String outputFilePath, double minsupRel, boolean outputSequenceIdentifiers)
      throws IOException {
    this.outputSequenceIdentifiers = outputSequenceIdentifiers;

    Bitmap.INTERSECTION_COUNT = 0;
    // create an object to write the file
    writer = new BufferedWriter(new FileWriter(outputFilePath));
    try {
      // initialize the number of patterns found
      patternCount = 0;
      // to log the memory used
      MemoryLogger.getInstance().reset();

      // record start time
      startTime = System.currentTimeMillis();
      // RUN THE ALGORITHM
      spam(input, minsupRel);
      // record end time
      endTime = System.currentTimeMillis();
    } finally {
      // Close the file even when the algorithm fails, so the handle is not leaked.
      writer.close();
    }
  }
Ejemplo n.º 22
0
  /**
   * Print statistics about the latest execution to System.out: elapsed time, maximum memory,
   * high-utility itemset count and candidate count. When {@code DEBUG} is set, a size estimate of
   * {@code mapFMAP} and the number of item pairs it holds are printed as well.
   *
   * @throws IOException presumably raised by {@code getObjectSize} in the DEBUG branch
   */
  public void printStats() throws IOException {
    // Header and general figures, assembled first and then emitted line by line.
    String[] summary = {
      "=============  FHM ALGORITHM v0.96r18 - STATS =============",
      " Total time ~ " + (endTimestamp - startTimestamp) + " ms",
      " Memory ~ " + MemoryLogger.getInstance().getMaxMemory() + " MB",
      " High-utility itemsets count : " + huiCount,
      " Candidate count : " + candidateCount
    };
    for (String line : summary) {
      System.out.println(line);
    }

    if (DEBUG) {
      // Sum the size of the outer map, of every outer key, and of every inner key/value pair.
      int pairsSeen = 0;
      double footprint = getObjectSize(mapFMAP);
      for (Entry<Integer, Map<Integer, Long>> outer : mapFMAP.entrySet()) {
        footprint += getObjectSize(outer.getKey());
        for (Entry<Integer, Long> inner : outer.getValue().entrySet()) {
          pairsSeen++;
          footprint += getObjectSize(inner.getKey()) + getObjectSize(inner.getValue());
        }
      }
      System.out.println("CMAP size " + footprint + " MB");
      System.out.println("PAIR COUNT " + pairsSeen);
    }
    System.out.println("===================================================");
  }
Ejemplo n.º 23
0
  /**
   * Run the algorithm: store the minimum support, reset the statistics, run {@code bide} and
   * close the output writer if there is one.
   *
   * @param database a sequence database
   * @param outputPath an output file path
   * @param minsup a minimum support as an integer representing a number of sequences
   * @return return the result, if saved into memory, otherwise null
   * @throws IOException exception if error while writing the file
   */
  public SequentialPatterns runAlgorithm(SequenceDatabase database, String outputPath, int minsup)
      throws IOException {

    // save minsup
    this.minsuppAbsolute = minsup;
    // reset the counter for the number of patterns found
    patternCount = 0;
    // reset the stats about memory usage
    MemoryLogger.getInstance().reset();
    // save the start time
    startTime = System.currentTimeMillis();
    try {
      // start the algorithm
      bide(database, outputPath);
      // save the end time
      endTime = System.currentTimeMillis();
    } finally {
      // Close the output file (if any) even when the algorithm fails, so it is not leaked.
      if (writer != null) {
        writer.close();
      }
    }
    return patterns;
  }
Ejemplo n.º 24
0
  /**
   * Collects the pairs occurring in a projected sequence database. A pair is an item together
   * with two flags (whether its itemset is cut at right, and whether it is a postfix); the ID of
   * each sequence is registered for every pair found in it via {@code addPairWithoutCheck}.
   *
   * <p>No support threshold is applied in this method itself.
   *
   * @param prefix the current prefix (not read by this method)
   * @param sequences the set of sequences to scan
   * @return the key set of the map filled while scanning, i.e. the distinct pairs found
   */
  protected Set<PairBIDE> findAllFrequentPairs(
      SequentialPattern prefix, List<PseudoSequenceBIDE> sequences) {
    // The map is keyed and valued by the pair so that a registered instance can be retrieved.
    Map<PairBIDE, PairBIDE> registry = new HashMap<PairBIDE, PairBIDE>();

    for (PseudoSequenceBIDE current : sequences) {
      for (int itemsetIdx = 0; itemsetIdx < current.size(); itemsetIdx++) {
        for (int itemIdx = 0; itemIdx < current.getSizeOfItemsetAt(itemsetIdx); itemIdx++) {
          Integer item = current.getItemAtInItemsetAt(itemIdx, itemsetIdx);
          // Describe this occurrence, then register the sequence ID for it.
          PairBIDE occurrence =
              new PairBIDE(current.isCutAtRight(itemsetIdx), current.isPostfix(itemsetIdx), item);
          addPairWithoutCheck(registry, current.getId(), occurrence);
        }
      }
    }
    // check the memory usage
    MemoryLogger.getInstance().checkMemory();
    return registry.keySet();
  }
Ejemplo n.º 25
0
  /**
   * The actual method for extracting frequent sequences: the frequent items of the database
   * become the members of a root equivalence class, which is then explored by a {@code
   * FrequentPatternEnumeration_SPAM}.
   *
   * @param database The original database
   * @param minSupportAbsolute the absolute minimum support. NOTE(review): this parameter is not
   *     read in the body; the {@code minSupAbsolute} field is passed to the enumeration instead.
   * @param keepPatterns flag indicating if we are interested in keeping the output of the algorithm
   * @param verbose Flag for debugging purposes
   */
  protected void runSPAM(
      SequenceDatabase database, long minSupportAbsolute, boolean keepPatterns, boolean verbose) {

    // Equivalence classes formed by the frequent 1-patterns, and the patterns they hold.
    frequentItems = database.frequentItems();
    Collection<Pattern> singletons = getPatterns(frequentItems);
    if (keepPatterns) {
      // Every frequent 1-pattern is part of the output.
      for (Pattern singleton : singletons) {
        saver.savePattern(singleton);
      }
    }

    // The database is not used past this point; drop the local reference.
    database = null;

    // Root class whose members are the classes of the frequent 1-patterns.
    EquivalenceClass root = new EquivalenceClass(null);
    for (EquivalenceClass member : frequentItems) {
      root.addClassMember(member);
    }

    // Object in charge of finding the frequent patterns, and the search itself.
    FrequentPatternEnumeration_SPAM enumerator =
        new FrequentPatternEnumeration_SPAM(minSupAbsolute, saver);
    enumerator.execute(root, keepPatterns, verbose);

    // Keep the number of frequent patterns found by the search.
    numberOfFrequentPatterns = enumerator.getFrequentPatterns();
    // check the memory usage for statistics
    MemoryLogger.getInstance().checkMemory();
  }
Ejemplo n.º 26
0
  /**
   * Run the LAPIN algorithm
   *
   * @param input the input file path
   * @param minsupRel the minsup threshold as a percentage
   */
  private void lapin(String input, double minsupRel) throws IOException {

    if (DEBUG) {
      System.out.println(
          "=== First database scan to count number of sequences and support of single items ===");
    }

    // FIRST DATABASE SCAN: SCAN THE DATABASE TO COUNT
    //  - THE NUMBER OF SEQUENCES
    //  - THE SUPPORT OF EACH SINGLE ITEM
    // - THE LARGEST ITEM ID
    int sequenceCount = 0;
    int largestItemID = 0;
    // This map will store for each item (key) the first position where the item appears in each
    // sequence where it appears (value)
    Map<Integer, List<Position>> mapItemFirstOccurrences = new HashMap<Integer, List<Position>>();
    try {
      // Read the input file
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))));
      String thisLine;
      // for each sequence of the input fiel
      while ((thisLine = reader.readLine()) != null) {
        // we use a set to remember which item have been seen already
        Set<Integer> itemsAlreadySeen = new HashSet<Integer>();
        // to know the itemset number
        short itemsetID = 0;
        // for each token in this line
        for (String integer : thisLine.split(" ")) {
          // if it is the end of an itemset
          if ("-1".equals(integer)) {
            itemsetID++;
          } else if ("-2".equals(integer)) { // if it is the end of line
            // nothing to do here
          } else {
            // otherwise, it is an item
            Integer item = Integer.valueOf(integer);
            // if this item was not seen already in that sequence
            if (itemsAlreadySeen.contains(item) == false) {
              // Get the list of positions of that item
              List<Position> list = mapItemFirstOccurrences.get(item);
              // if that list is null, create a new list
              if (list == null) {
                list = new ArrayList<Position>();
                mapItemFirstOccurrences.put(item, list);
              }
              // Add the position of the item in that sequence to the list of first positions
              // of that item
              Position position = new Position(sequenceCount, itemsetID);
              list.add(position);
              // Remember that we have seen this item
              itemsAlreadySeen.add(item);
              // Check if the item is the largest item until now
              if (item > largestItemID) {
                largestItemID = item;
              }
            }
          }
        }
        // Increase the count of sequences from the input file
        sequenceCount++;
      }
      reader.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
    ;

    // Initialize the list of tables
    tables = new Table[sequenceCount];

    // Calculate absolute minimum support  as a number of sequences
    minsup = (int) Math.ceil(minsupRel * sequenceCount);
    if (minsup == 0) {
      minsup = 1;
    }

    if (DEBUG) {
      System.out.println("Number of items: " + mapItemFirstOccurrences.size());
      System.out.println("Sequence count:  " + sequenceCount);
      System.out.println("Abs. minsup: " + minsup + " sequences");
      System.out.println("Rel. minsup: " + minsupRel + " %");

      System.out.println("=== Determining the frequent items ===");
    }

    //		// For each frequent item,  save it and add it to the list of frequent items
    List<Integer> frequentItems = new ArrayList<Integer>();
    for (Entry<Integer, List<Position>> entry : mapItemFirstOccurrences.entrySet()) {
      // Get the border created by this item
      List<Position> itemBorder = entry.getValue();
      // if the item is frequent
      if (itemBorder.size() >= minsup) {
        // Output the item and add it to the list of frequent items
        Integer item = entry.getKey();
        savePattern(item, itemBorder.size());
        frequentItems.add(item);
        if (DEBUG) {
          System.out.println(" Item " + item + " is frequent with support = " + itemBorder.size());
        }
      }
    }

    if (DEBUG) {
      System.out.println("=== Second database scan to construct item-is-exist tables ===");
    }
    // sort the frequent items (useful when generating 2-IE-sequences, later on).
    Collections.sort(frequentItems);

    // SECOND DATABASE SCAN:
    // Now we will read the database again to create the Item-is-exist-table
    // and SE-position-lists and count support of 2-IE-sequences
    matrixPairCount = new SparseTriangularMatrix(largestItemID + 1);

    // Initialise the IE position lists and SE position lists
    sePositionList = new SEPositionList[sequenceCount];
    iePositionList = new IEPositionList[sequenceCount];

    try {
      // Prepare to read the file
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))));
      String thisLine;
      // For each sequence in the file
      int currentSequenceID = 0;
      while ((thisLine = reader.readLine()) != null) {

        // (1) ------- PARSE THE SEQUENCE BACKWARD TO CREATE THE ITEM-IS-EXIST TABLE FOR THATS
        // SEQUENCE
        // AND COUNT THE SUPPORT OF 2-IE-Sequences

        // We will also use a structure to remember in which sequence we have seen each pair of
        // items
        // Note that in this structure, we will add +1 to the sid because by default the matrix is
        // filled with 0
        // and we don't want to think that the first sequence was already seen for all pairs.
        AbstractTriangularMatrix matrixPairLastSeenInSID =
            new SparseTriangularMatrix(largestItemID + 1);

        // We count the number of positions (number of itemsets).
        // To do that we count the number of "-" symbols in the file.
        // We need to subtract 1 because the end of line "-2" contains "-".
        int positionCount = -1;
        for (char caracter : thisLine.toCharArray()) {
          if (caracter == '-') {
            positionCount++;
          }
        }

        // Now we will scan the sequence again.
        // This time we will remember which item were seen already
        Set<Integer> itemsAlreadySeen = new HashSet<Integer>();

        // During this scan, we will create the table for this sequence
        Table table = new Table();

        // To do that, we first create an initial position vector for that table
        BitSet currentBitset = new BitSet(mapItemFirstOccurrences.size()); // OK ?

        // This variable will be used to remember if a new item appeared in the current itemset
        boolean seenNewItem = false;

        // We will scan the sequence backward, starting from the end because
        // we should not create a bit vector for all positions but for only
        // the positions that are different from the previous one.
        String[] tokens = thisLine.split(" ");
        // This is the number of itemsets
        int currentPosition = positionCount;
        // to keep the current itemset in memory
        List<Integer> currentItemset = new ArrayList<Integer>();
        // For each token in that sequence
        for (int i = tokens.length - 1; i >= 0; i--) {
          // get the token
          String token = tokens[i];

          // if we reached the end of an itemset
          if ("-1".equals(token)) {
            // update the triangular matrix for counting 2-IE-sequences
            // by comparing each pairs of items in the current itemset
            for (int k = 0; k < currentItemset.size(); k++) {
              Integer item1 = currentItemset.get(k);
              for (int m = k + 1; m < currentItemset.size(); m++) {
                Integer item2 = currentItemset.get(m);

                // if that pair is frequent
                int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2);
                // and if we have not seen this sequence yet
                if (sid != currentSequenceID + 1) {
                  // increment support count of this pair
                  matrixPairCount.incrementCount(item1, item2);
                  // remember that we have seen this pair so that we don't count it again
                  matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID + 1);
                }
              }
            }
            currentItemset.clear();
            // Decrease the current index of the position (itemset) in the sequence
            currentPosition--;
            // if the bit vector has changed since previous position, then
            // we need to add a new bit vector to the table
            if (seenNewItem) {
              // create the position vector and add it to the item-is-exist table
              PositionVector vector =
                  new PositionVector(currentPosition, (BitSet) currentBitset.clone());
              table.add(vector);
            }

          } else if ("-2".equals(token)) { // if end of sequence, nothing to do

          } else {
            // otherwise, it is an item
            Integer item = Integer.valueOf(token);
            if (mapItemFirstOccurrences.get(item).size() >= minsup) { // only for frequent items
              // if first time that we see this item
              if (itemsAlreadySeen.contains(item) == false) {
                // remember that we have seen a new item
                seenNewItem = true;
                // remember that we have seen this item
                itemsAlreadySeen.add(item);
                // add this item to the current bit vector
                currentBitset.set(item);
              }
              // add this item to the current itemset
              currentItemset.add(item);
            }
          }
        }

        // Lastly,
        // update the triangular matrix for counting 2-IE-sequences one more time
        // for the case where the pair is in first position of the sequence
        // by considering each pair of items in the last itemset.
        // This is done like it was done above, so I will not comment this part of the code again.
        for (int k = 0; k < currentItemset.size(); k++) {
          Integer item1 = currentItemset.get(k);
          for (int m = k + 1; m < currentItemset.size(); m++) {
            Integer item2 = currentItemset.get(m);
            // if th
            int sid = matrixPairLastSeenInSID.getSupportForItems(item1, item2);
            if (sid != currentSequenceID + 1) {
              matrixPairCount.incrementCount(item1, item2);
              matrixPairLastSeenInSID.setSupport(item1, item2, currentSequenceID + 1);
            }
          }
        }

        // If a new item was seen
        // Add an extra row to the item-is-exist table that will be called -1  with all items in
        // this sequence
        if (seenNewItem) {
          PositionVector vector = new PositionVector(-1, (BitSet) currentBitset.clone());
          table.add(vector);
        }
        //
        //
        //				// Initialize the IE lists and SE lists for that sequence
        // which will be filled with the next database scan.
        sePositionList[currentSequenceID] = new SEPositionList(itemsAlreadySeen);
        iePositionList[currentSequenceID] = new IEPositionList();

        if (DEBUG) {
          System.out.println("Table for sequence " + currentSequenceID + " : " + thisLine);
          System.out.println(table.toString());
        }
        // put the current table in the array of item-is-exist-tables
        tables[currentSequenceID] = table;
        // we will process the next sequence id
        currentSequenceID++;
      }
      reader.close();
    } catch (Exception e) {
      e.printStackTrace();
    }

    // THIRD SCAN TO
    //  PARSE THE SEQUENCE FORWARD TO CREATE THE SE-POSITION LIST OF THAT SEQUENCE
    // AND IEPositionList for frequent 2-IE-SEQUENCES
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))));
      String thisLine;
      // For each sequence
      int currentSequenceID = 0;
      while ((thisLine = reader.readLine()) != null) {

        // We will scan the sequence backward, starting from the end.
        String[] tokens = thisLine.split(" ");
        // to keep the current itemset in memory
        List<Integer> currentItemset = new ArrayList<Integer>();

        // this variable will be used to remember which itemset we are visiting
        short itemsetID = 0;
        // empty the object to track the current itemset (if it was used for the previous sequence)
        currentItemset.clear();

        // for each token of the current sequence
        for (int i = 0; i < tokens.length; i++) {
          String token = tokens[i];

          // if we reached the end of an itemset
          if ("-1".equals(token)) {
            // if the current itemset contains more than one item
            if (currentItemset.size() > 1) {
              // update the position list for 2-IE-sequences
              for (int k = 0; k < currentItemset.size(); k++) {
                Integer item1 = currentItemset.get(k);
                for (int m = k + 1; m < currentItemset.size(); m++) {
                  Integer item2 = currentItemset.get(m);
                  // if the pair is frequent
                  int support = matrixPairCount.getSupportForItems(item1, item2);
                  if (support >= minsup) {
                    iePositionList[currentSequenceID].register(item1, item2, itemsetID);
                  }
                }
              }
            }
            // increase itemsetID
            itemsetID++;
            // clear itemset
            currentItemset.clear();
          } else if ("-2".equals(token)) {
            // if the end of a sequence, nothing special to do

          } else {
            // otherwise, the current token is an item
            Integer item = Integer.valueOf(token);
            // if the item is frequent
            if (mapItemFirstOccurrences.get(item).size() >= minsup) {
              // we add the current position to the item SE-position list
              sePositionList[currentSequenceID].register(item, itemsetID);
              // we add the item to the current itemset
              currentItemset.add(item);
            }
          }
        }

        if (DEBUG) {
          System.out.println("SE Position list for sequence " + currentSequenceID);
          System.out.println(sePositionList[currentSequenceID]);
          System.out.println("IE Position list for sequence " + currentSequenceID);
          System.out.println(iePositionList[currentSequenceID]);
        }

        iePositionList[currentSequenceID].sort(); // sort the IE-position list
        // update the sequence id for the next sequence
        currentSequenceID++;
      }
      reader.close();
    } catch (Exception e) {
      e.printStackTrace();
    }

    if (DEBUG) {
      System.out.println("=== Starting sequential pattern generation ===");
    }

    // For each frequent item,  call the recursive method to explore larger patterns
    for (int i = 0; i < frequentItems.size(); i++) {
      // Get the item
      int item1 = frequentItems.get(i);
      // Get the border for that item
      List<Position> item1Border = mapItemFirstOccurrences.get(item1);
      if (DEBUG) {
        System.out.println("=== Considering item " + item1);
        System.out.println("  Border of " + item1);
        for (Position pos : item1Border) {
          System.out.println("    seq: " + pos.sid + "    itemset: " + pos.position);
        }
      }
      // if the border contains at least minsup sequence (if the item is frequent)
      if (item1Border.size() >= minsup) {
        // Create an object prefix to represent the sequential pattern containing the item
        Prefix prefix = new Prefix();
        List<Integer> itemset = new ArrayList<Integer>(1);
        itemset.add(item1);
        prefix.itemsets.add(itemset);
        // make a recursive call to find s-extensions of this prefix
        genPatterns(
            prefix,
            item1Border,
            frequentItems,
            frequentItems,
            item1,
            true); // true, to disallow I-extension because we explore 2-IE sequences separately
      }

      // For each frequent 2-IE sequences stating with item1, we will explore 2-IE sequences
      // by considering each frequent item larger than item1
      for (int k = i + 1; k < frequentItems.size(); k++) {
        // We consider item2
        int item2 = frequentItems.get(k);
        // Get the support of item1, item2
        int support = matrixPairCount.getSupportForItems(item1, item2);

        // if the pair {item1, item2} is frequent
        if (support >= minsup) {
          // get the list of position of item2
          List<Position> item2Border = mapItemFirstOccurrences.get(item2);
          // Create the border by using the 2-IE position list
          List<Position> ie12Border = new ArrayList<Position>();

          // We will loop over the border of item1 or item2 (the smallest one)
          List<Position> borderToUse;
          if (item2Border.size() < item1Border.size()) {
            borderToUse = item2Border;
          } else {
            borderToUse = item1Border;
          }
          // For each sequence of the border that we consider
          for (Position sequenceToUse : borderToUse) {
            // Get the sequence id
            int sid = sequenceToUse.sid;
            // For this sequence, we will get the position list of each item
            List<Short> listPosition1 = sePositionList[sid].getListForItem(item1);
            List<Short> listPosition2 = sePositionList[sid].getListForItem(item2);
            // if one of them is null, that means that both item1 and item2 do not appear in that
            // sequence
            // so we continue to the next sequence
            if (listPosition1 == null || listPosition2 == null) {
              continue;
            }
            // otherwise
            // find the first common position of item1 and item2 in the sequence
            int index1 = 0;
            int index2 = 0;

            // we do that by the following while loop
            while (index1 < listPosition1.size() && index2 < listPosition2.size()) {
              short position1 = listPosition1.get(index1);
              short position2 = listPosition2.get(index2);
              if (position1 < position2) {
                index1++;
              } else if (position1 > position2) {
                index2++;
              } else {
                // we have found the position, so we add it to the new border and
                // then stop because we do not want to add more than one position for
                // the same sequence in the new border
                ie12Border.add(new Position(sid, position1));
                break;
              }
            }
          }
          if (DEBUG) {
            System.out.println(
                "=== Considering the 2-IE sequence {"
                    + item1
                    + ","
                    + item2
                    + "}  with support "
                    + support);
            System.out.println("  Border of {" + item1 + "," + item2 + "}");
            for (Position pos : ie12Border) {
              System.out.println("    seq: " + pos.sid + "    itemset: " + pos.position);
            }
          }

          // finally, we create the prefix for the pattern  {item1, item2}
          Prefix prefix = new Prefix();
          List<Integer> itemset = new ArrayList<Integer>(2);
          itemset.add(item1);
          itemset.add(item2);
          prefix.itemsets.add(itemset);
          // save the pattern
          savePattern(prefix, support);
          // perform recursive call to extend that pattern
          genPatterns(
              prefix,
              ie12Border,
              frequentItems,
              frequentItems,
              item2,
              false); // false, to allow I-extension
        }
      }
    }

    // Record the maximum memory usage
    MemoryLogger.getInstance().checkMemory();
    writer.close();
  }
Ejemplo n.º 27
0
  /**
   * Depth-first exploration of the extensions of a prefix (the dfsPruning procedure described in
   * the SPAM paper).
   *
   * <p>The method first tries an s-step (item appended as a new itemset) with every item of
   * {@code sn}, then an i-step (item added to the last itemset) with every item of {@code in}
   * that is greater than {@code hasToBeGreaterThanForIStep}. Candidates that pass the support
   * test are collected and then extended recursively while {@code maximumPatternLength > m}. An
   * extended pattern is written through {@code savePatternMultipleItems} only when its own
   * recursive call reports no frequent extension (or when the recursion is not performed).
   *
   * @param prefix the current prefix
   * @param prefixBitmap the bitmap corresponding to the current prefix
   * @param sn a list of items to be considered for s-steps
   * @param in a list of items to be considered for i-steps
   * @param hasToBeGreaterThanForIStep an i-step is attempted only with items strictly greater
   *     than this value
   * @param m size of the current prefix in terms of items
   * @param lastAppendedItem the last appended item to the prefix; it is the key used to look up
   *     the co-occurrence maps for CMAP pruning
   * @throws IOException if there is an error writing a pattern to the output file
   * @return {@code true} if at least one s-step or i-step candidate was retained for this prefix,
   *     or if {@code useStrategyForwardExtensionChecking} is disabled (in which case the result
   *     is always {@code true}).
   */
  boolean dfsPruning(
      PrefixVMSP prefix,
      Bitmap prefixBitmap,
      List<Integer> sn,
      List<Integer> in,
      int hasToBeGreaterThanForIStep,
      int m,
      Integer lastAppendedItem)
      throws IOException {
    // becomes true as soon as one retained s-step or i-step candidate is processed
    boolean atLeastOneFrequentExtension = false;

    //  ======  S-STEPS ======
    // Temporary variables (as described in the paper): the retained s-step items and,
    // at the same index, the bitmap obtained for each of them
    List<Integer> sTemp = new ArrayList<Integer>();
    List<Bitmap> sTempBitmaps = new ArrayList<Bitmap>();

    // for CMAP pruning, we will only check against the last appended item
    // (may be null when the map has no entry for that item)
    Map<Integer, Integer> mapSupportItemsAfter = coocMapAfter.get(lastAppendedItem);

    // for each item in sn
    loopi:
    for (Integer i : sn) {

      // LAST POSITION PRUNING (currently disabled)
      /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) {
      continue loopi;
      }*/

      // CMAP PRUNING
      // we only check with the last appended item: the candidate is skipped when the map
      // records no support for it, or a support below minsup
      if (useCMAPPruning) {
        if (mapSupportItemsAfter == null) {
          continue loopi;
        }
        Integer support = mapSupportItemsAfter.get(i);
        if (support == null || support < minsup) {
          continue loopi;
        }
      }

      // perform the S-STEP with that item to get a new bitmap
      // (the static counter keeps track of how many bitmap intersections were performed)
      Bitmap.INTERSECTION_COUNT++;
      Bitmap newBitmap =
          prefixBitmap.createNewBitmapSStep(verticalDB.get(i), sequencesSize, lastBitIndex, maxGap);
      // if the support is no less than minsup
      // NOTE(review): getSupportWithoutGapTotal() presumably ignores the maxGap constraint,
      // unlike getSupport() used below -- confirm in Bitmap
      if (newBitmap.getSupportWithoutGapTotal() >= minsup) {
        // record that item and pattern in temporary variables
        sTemp.add(i);
        sTempBitmaps.add(newBitmap);
      }
    }
    // for each pattern recorded for the s-step
    for (int k = 0; k < sTemp.size(); k++) {
      // every retained candidate counts as an extension of the prefix, even if the
      // getSupport() test below later rejects it
      atLeastOneFrequentExtension = true;

      int item = sTemp.get(k);
      // create the new prefix: a copy of the current one plus a new itemset holding the item
      PrefixVMSP prefixSStep = prefix.cloneSequence();
      prefixSStep.addItemset(new Itemset(item));
      // maintain the running sums of the even and of the odd items of the pattern
      // NOTE(review): these sums are not read in this method; presumably a cheap pattern
      // signature used when saving patterns -- confirm
      if (item % 2 == 0) {
        prefixSStep.sumOfEvenItems = item + prefix.sumOfEvenItems;
        prefixSStep.sumOfOddItems = prefix.sumOfOddItems;
      } else {
        prefixSStep.sumOfEvenItems = prefix.sumOfEvenItems;
        prefixSStep.sumOfOddItems = item + prefix.sumOfOddItems;
      }

      // get the bitmap computed above for this candidate
      Bitmap newBitmap = sTempBitmaps.get(k);

      // the pattern is extended (and possibly saved) only if its support reaches minsup
      if (newBitmap.getSupport() >= minsup) {

        boolean hasFrequentExtension = false;
        // recursively try to extend that pattern, unless the maximum length is reached;
        // the retained s-step items serve as both s-step and i-step candidates
        if (maximumPatternLength > m) {
          hasFrequentExtension =
              dfsPruning(prefixSStep, newBitmap, sTemp, sTemp, item, m + 1, item);
        }

        // save the pattern only when the recursive call reported no frequent extension
        if (hasFrequentExtension == false) {
          savePatternMultipleItems(prefixSStep, newBitmap, m);
        }
      }
    }

    // for CMAP pruning of i-steps, again keyed by the last appended item (may be null)
    Map<Integer, Integer> mapSupportItemsEquals = coocMapEquals.get(lastAppendedItem);
    // ========  I STEPS =======
    // Temporary variables: the retained i-step items and, at the same index, their bitmaps
    List<Integer> iTemp = new ArrayList<Integer>();
    List<Bitmap> iTempBitmaps = new ArrayList<Bitmap>();

    // for each item in in
    loop2:
    for (Integer i : in) {

      // the item has to be greater than the largest item
      // already in the last itemset of prefix.
      if (i > hasToBeGreaterThanForIStep) {

        // LAST POSITION PRUNING (currently disabled)
        /*if (useLastPositionPruning && lastItemPositionMap.get(i) < prefixBitmap.firstItemsetID) {
        continue loop2;
        }*/

        // CMAP PRUNING: skip the candidate when the map records no support for it,
        // or a support below minsup
        if (useCMAPPruning) {
          if (mapSupportItemsEquals == null) {
            continue loop2;
          }
          Integer support = mapSupportItemsEquals.get(i);
          if (support == null || support < minsup) {
            continue loop2;
          }
        }

        // Perform an i-step with this item and the current prefix.
        // This creates a new bitmap
        Bitmap.INTERSECTION_COUNT++;
        Bitmap newBitmap =
            prefixBitmap.createNewBitmapIStep(verticalDB.get(i), sequencesSize, lastBitIndex);
        // If the support is no less than minsup
        if (newBitmap.getSupport() >= minsup) {
          // record that item and pattern in temporary variables
          iTemp.add(i);
          iTempBitmaps.add(newBitmap);
        }
      }
    }
    // for each pattern recorded for the i-step
    for (int k = 0; k < iTemp.size(); k++) { // every retained candidate counts as an extension
      atLeastOneFrequentExtension = true;

      int item = iTemp.get(k);
      // create the new prefix: a copy of the current one with the item added to its last itemset
      PrefixVMSP prefixIStep = prefix.cloneSequence();
      prefixIStep.getItemsets().get(prefixIStep.size() - 1).addItem(item);
      // maintain the running sums of the even and of the odd items, as for s-steps
      if (item % 2 == 0) {
        prefixIStep.sumOfEvenItems = item + prefix.sumOfEvenItems;
        prefixIStep.sumOfOddItems = prefix.sumOfOddItems;
      } else {
        prefixIStep.sumOfEvenItems = prefix.sumOfEvenItems;
        prefixIStep.sumOfOddItems = item + prefix.sumOfOddItems;
      }
      // get the bitmap computed above for this candidate
      Bitmap newBitmap = iTempBitmaps.get(k);

      // recursively try to extend that pattern, unless the maximum length is reached;
      // s-step candidates come from sTemp and i-step candidates from iTemp
      boolean hasFrequentExtension = false;
      if (maximumPatternLength > m) {
        hasFrequentExtension = dfsPruning(prefixIStep, newBitmap, sTemp, iTemp, item, m + 1, item);
      }

      // save the pattern only when the recursive call reported no frequent extension
      if (hasFrequentExtension == false) {
        // save the pattern
        savePatternMultipleItems(prefixIStep, newBitmap, m);
      }
    }
    // check the memory usage
    MemoryLogger.getInstance().checkMemory();

    // when forward-extension checking is disabled, the method always reports true
    return atLeastOneFrequentExtension || useStrategyForwardExtensionChecking == false;
  }
Ejemplo n.º 28
0
  /**
   * Run the algorithm: read the transaction database, build the utility lists of the promising
   * items and mine the database recursively, writing the result to the output file.
   *
   * <p>The input file is read twice. Empty lines and lines starting with {@code #}, {@code %} or
   * {@code @} are skipped. Every other line is expected to have the form {@code
   * items:transactionUtility:itemUtilities}, where {@code items} and {@code itemUtilities} are
   * space-separated integers of the same length.
   *
   * <p>Errors raised while reading or parsing the input are printed to the standard error stream
   * and the run continues with whatever was read so far. The output file is always closed, even
   * when the run is aborted by an exception.
   *
   * @param input the input file path
   * @param output the output file path
   * @param minUtility the minimum utility threshold
   * @throws IOException exception if error while writing the file, or while closing the input or
   *     output file
   */
  public void runAlgorithm(String input, String output, int minUtility) throws IOException {
    // reset maximum
    MemoryLogger.getInstance().reset();

    // initialize the buffer for storing the current itemset
    itemsetBuffer = new int[BUFFERS_SIZE];

    mapFMAP = new HashMap<Integer, Map<Integer, Long>>();

    startTimestamp = System.currentTimeMillis();

    writer = new BufferedWriter(new FileWriter(output));

    // From here on the output file is open: everything runs inside try/finally so that the
    // writer is closed (and flushed) even if closing the input or the mining step throws.
    try {
      //  We create a  map to store the TWU of each item
      mapItemToTWU = new HashMap<Integer, Long>();

      // We scan the database a first time to calculate the TWU of each item.
      BufferedReader myInput = null;
      String thisLine;
      try {
        // prepare the object for reading the file
        myInput = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))));
        // for each line (transaction) until the end of file
        while ((thisLine = myInput.readLine()) != null) {
          // if the line is  a comment, is  empty or is a
          // kind of metadata
          if (thisLine.isEmpty()
              || thisLine.charAt(0) == '#'
              || thisLine.charAt(0) == '%'
              || thisLine.charAt(0) == '@') {
            continue;
          }

          // split the transaction according to the : separator
          String[] split = thisLine.split(":");
          // the first part is the list of items
          String[] items = split[0].split(" ");
          // the second part is the transaction utility
          int transactionUtility = Integer.parseInt(split[1]);
          // for each item, we add the transaction utility to its TWU
          for (int i = 0; i < items.length; i++) {
            // convert item to integer
            Integer item = Integer.parseInt(items[i]);
            // get the current TWU of that item
            Long twu = mapItemToTWU.get(item);
            // add the utility of the current transaction to its twu
            twu = (twu == null) ? transactionUtility : twu + transactionUtility;
            mapItemToTWU.put(item, twu);
          }
        }
      } catch (Exception e) {
        // catches exception if error while reading the input file
        e.printStackTrace();
      } finally {
        if (myInput != null) {
          myInput.close();
        }
      }

      // CREATE A LIST TO STORE THE UTILITY LIST OF ITEMS WITH TWU  >= MIN_UTILITY.
      List<UtilityList> listOfUtilityLists = new ArrayList<UtilityList>();
      // CREATE A MAP TO STORE THE UTILITY LIST FOR EACH ITEM.
      // Key : item    Value :  utility list associated to that item
      Map<Integer, UtilityList> mapItemToUtilityList = new HashMap<Integer, UtilityList>();

      // For each item
      for (Map.Entry<Integer, Long> entry : mapItemToTWU.entrySet()) {
        // if the item is promising  (TWU >= minutility)
        if (entry.getValue() >= minUtility) {
          Integer item = entry.getKey();
          // create an empty Utility List that we will fill later.
          UtilityList uList = new UtilityList(item);
          mapItemToUtilityList.put(item, uList);
          // add the item to the list of high TWU items
          listOfUtilityLists.add(uList);
        }
      }
      // SORT THE LIST OF HIGH TWU ITEMS ACCORDING TO THE ORDER DEFINED BY compareItems
      Collections.sort(
          listOfUtilityLists,
          new Comparator<UtilityList>() {
            @Override
            public int compare(UtilityList o1, UtilityList o2) {
              return compareItems(o1.item, o2.item);
            }
          });

      // The same item order is used to sort every revised transaction; the comparator does not
      // depend on the transaction, so it is created once instead of once per line.
      final Comparator<Pair> pairOrder =
          new Comparator<Pair>() {
            @Override
            public int compare(Pair o1, Pair o2) {
              return compareItems(o1.item, o2.item);
            }
          };

      // SECOND DATABASE PASS TO CONSTRUCT THE UTILITY LISTS
      // OF 1-ITEMSETS  HAVING TWU  >= minutil (promising items)
      try {
        // prepare object for reading the file
        myInput = new BufferedReader(new InputStreamReader(new FileInputStream(new File(input))));
        // variable to count the number of transaction
        int tid = 0;
        // for each line (transaction) until the end of file
        while ((thisLine = myInput.readLine()) != null) {
          // if the line is  a comment, is  empty or is a
          // kind of metadata
          if (thisLine.isEmpty()
              || thisLine.charAt(0) == '#'
              || thisLine.charAt(0) == '%'
              || thisLine.charAt(0) == '@') {
            continue;
          }

          // split the line according to the separator
          String[] split = thisLine.split(":");
          // get the list of items
          String[] items = split[0].split(" ");
          // get the list of utility values corresponding to each item
          // for that transaction
          String[] utilityValues = split[2].split(" ");

          // Copy the transaction into lists but
          // without items with TWU < minutility

          // utility of the kept items that have not been processed yet
          int remainingUtility = 0;

          // utility of the transaction once unpromising items are removed
          long newTWU = 0;

          // Create a list to store items
          List<Pair> revisedTransaction = new ArrayList<Pair>();
          // for each item
          for (int i = 0; i < items.length; i++) {
            // convert values to integers
            Pair pair = new Pair();
            pair.item = Integer.parseInt(items[i]);
            pair.utility = Integer.parseInt(utilityValues[i]);
            // if the item has enough utility
            if (mapItemToTWU.get(pair.item) >= minUtility) {
              // add it
              revisedTransaction.add(pair);
              remainingUtility += pair.utility;
              newTWU += pair.utility;
            }
          }

          // sort the transaction
          Collections.sort(revisedTransaction, pairOrder);

          // for each item left in the transaction
          for (int i = 0; i < revisedTransaction.size(); i++) {
            Pair pair = revisedTransaction.get(i);

            // subtract the utility of this item from the remaining utility
            remainingUtility = remainingUtility - pair.utility;

            // get the utility list of this item
            UtilityList utilityListOfItem = mapItemToUtilityList.get(pair.item);

            // Add a new Element to the utility list of this item corresponding to this
            // transaction
            Element element = new Element(tid, pair.utility, remainingUtility);

            utilityListOfItem.addElement(element);

            // For every item that follows this one in the sorted transaction, accumulate the
            // revised transaction utility in mapFMAP[item][itemAfter]
            Map<Integer, Long> mapFMAPItem = mapFMAP.get(pair.item);
            if (mapFMAPItem == null) {
              mapFMAPItem = new HashMap<Integer, Long>();
              mapFMAP.put(pair.item, mapFMAPItem);
            }

            for (int j = i + 1; j < revisedTransaction.size(); j++) {
              Pair pairAfter = revisedTransaction.get(j);
              Long twuSum = mapFMAPItem.get(pairAfter.item);
              if (twuSum == null) {
                mapFMAPItem.put(pairAfter.item, newTWU);
              } else {
                mapFMAPItem.put(pairAfter.item, twuSum + newTWU);
              }
            }
          }
          tid++; // increase tid number for next transaction
        }
      } catch (Exception e) {
        // to catch error while reading the input file
        e.printStackTrace();
      } finally {
        if (myInput != null) {
          myInput.close();
        }
      }

      // check the memory usage
      MemoryLogger.getInstance().checkMemory();

      // Mine the database recursively
      fhm(itemsetBuffer, 0, null, listOfUtilityLists, minUtility);

      // check the memory usage again
      MemoryLogger.getInstance().checkMemory();
    } finally {
      // close output file, whether or not the run completed
      writer.close();
    }
    // record end time
    endTimestamp = System.currentTimeMillis();
  }
Ejemplo n.º 29
0
  /**
   * This is the main method for the BIDE+ algorithm.
   *
   * @param database a sequence database
   * @throws IOException exception if some error occurs while writing the output file.
   */
  private void bide(SequenceDatabase database, String outputFilePath) throws IOException {
    // Pick the output mode: a null path keeps the patterns in memory,
    // any other value sends them to that file.
    final boolean keepInMemory = (outputFilePath == null);
    if (keepInMemory) {
      writer = null;
      patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
    } else {
      patterns = null;
      writer = new BufferedWriter(new FileWriter(outputFilePath));
    }

    // First database scan. Key: item, Value: IDs of the sequences containing that item.
    Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);

    // Build the initial pseudo-database: every sequence is copied without its
    // infrequent items, and copies that end up empty are left out.
    initialDatabase = new ArrayList<PseudoSequenceBIDE>();
    for (Sequence sequence : database.getSequences()) {
      Sequence pruned = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
      if (pruned.size() == 0) {
        continue;
      }
      initialDatabase.add(new PseudoSequenceBIDE(pruned, 0, 0));
    }

    // Grow patterns from each frequent single item.
    for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) {
      Set<Integer> sequenceIds = entry.getValue();
      if (sequenceIds.size() < minsuppAbsolute) {
        // infrequent item: nothing to explore
        continue;
      }

      // Project the initial database on this item.
      Integer item = entry.getKey();
      List<PseudoSequenceBIDE> projectedContext =
          buildProjectedContextSingleItem(item, initialDatabase, false, sequenceIds);

      // The prefix is the single-item pattern, tagged with the IDs of its sequences.
      SequentialPattern prefix = new SequentialPattern();
      prefix.addItemset(new Itemset(item));
      prefix.setSequenceIDs(sequenceIds);

      // A forward extension exists when the recursion reports a successor whose
      // support equals the support of the prefix itself.
      boolean hasForwardExtension = false;
      if (projectedContext.size() >= minsuppAbsolute) {
        int successorSupport = 0;
        // The recursion is skipped when the backscan pruning check succeeds.
        if (!checkBackScanPruning(prefix, sequenceIds)) {
          successorSupport = recursion(prefix, projectedContext);
        }
        hasForwardExtension = (successorSupport == sequenceIds.size());
      }

      // With neither a forward nor a backward extension the pattern is closed: save it.
      if (!hasForwardExtension && !checkBackwardExtension(prefix, sequenceIds)) {
        savePattern(prefix);
      }
    }

    // Record the memory usage for the statistics.
    MemoryLogger.getInstance().checkMemory();
  }
Ejemplo n.º 30
0
  /**
   * The main recursive method of LAPIN. It first tries to extend the prefix with every item of
   * {@code sn} by s-extension, then (unless disabled) with the items of {@code in} by
   * i-extension. Each frequent extension is saved and extended recursively.
   *
   * @param prefix the current prefix
   * @param prefixBorder a list of positions that is the prefix border
   * @param sn items that could be appended by s-extension
   * @param in items that could be appended by i-extension; this list is searched with a binary
   *     search, so it has to be sorted in ascending order
   * @param hasToBeGreaterThanForIStep the item used to locate where the i-extension candidates
   *     start in {@code in}: the scan begins at the position of this value or, if the value is
   *     absent from {@code in}, at the first larger item. It is also passed on to the border
   *     recalculation for i-extensions.
   * @param doNotPerformIExtensions if true, the method returns right after the s-steps without
   *     trying any i-extension
   * @throws IOException if error while writing to file
   */
  private void genPatterns(
      Prefix prefix,
      List<Position> prefixBorder,
      List<Integer> sn,
      List<Integer> in,
      int hasToBeGreaterThanForIStep,
      boolean doNotPerformIExtensions)
      throws IOException {

    //  ======  S-STEPS ======
    // Temporary variables: the frequent s-extension items and, at the same index, their support
    List<Integer> sTemp = new ArrayList<Integer>();
    List<Integer> sTempSupport = new ArrayList<Integer>();

    // for each item in sn
    for (Integer item : sn) {
      // perform the S-STEP
      int support = calculateSupportSStep(item, prefixBorder);
      // if the support is no less than minsup
      if (support >= minsup) {
        // record that item and its support in the temporary variables
        sTemp.add(item);
        sTempSupport.add(support);
      }
    }

    // for each pattern recorded for the s-step
    for (int k = 0; k < sTemp.size(); k++) {
      int item = sTemp.get(k);
      // create the new prefix: a copy of the current one followed by a new itemset {item}
      Prefix prefixSStep = prefix.cloneSequence();
      List<Integer> itemset = new ArrayList<Integer>(1);
      itemset.add(item);
      prefixSStep.itemsets.add(itemset);

      // save the pattern
      savePattern(prefixSStep, sTempSupport.get(k));

      // recalculate the border for the extended prefix
      List<Position> newBorder = recalculateBorderForSExtension(prefixBorder, item);

      // Recursive call: the frequent s-step items are the candidates of both steps
      genPatterns(prefixSStep, newBorder, sTemp, sTemp, item, false);
    }

    if (doNotPerformIExtensions) {
      return;
    }

    // ========  I STEPS =======
    // Temporary variables: the frequent i-extension items and, at the same index, their border
    List<Integer> iTemp = new ArrayList<Integer>();
    List<List<Position>> iTempBorder = new ArrayList<List<Position>>();

    // Locate where the i-extension candidates start in "in".
    int index = Collections.binarySearch(in, hasToBeGreaterThanForIStep);
    if (index < 0) {
      // The key is absent: binarySearch returned (-(insertion point) - 1). Using that value
      // directly would make in.get(i) fail with a negative index, so start from the
      // insertion point instead, i.e. the first item greater than the key.
      index = -(index + 1);
    }
    for (int i = index; i < in.size(); i++) {
      Integer item = in.get(i);

      List<Integer> lastItemset = prefix.itemsets.get(prefix.itemsets.size() - 1);
      boolean willAddSecondItem = lastItemset.size() == 1;

      // perform the I-STEP (this is only an estimation of the support)
      int support = estimateSupportIStep(item, prefixBorder);

      // if the estimated support is no less than minsup
      if (support >= minsup) {

        // recalculate the border
        // in this case, the method takes the prefix border as input
        List<Position> newBorder =
            recalculateBorderForIExtension(
                lastItemset, prefixBorder, hasToBeGreaterThanForIStep, item, willAddSecondItem);

        // the size of the new border is used as the support of the extension:
        // record the item and its border only if it is frequent
        if (newBorder.size() >= minsup) {
          iTemp.add(item);
          iTempBorder.add(newBorder);
        }
      }
    }

    // for each pattern recorded for the i-step
    for (int k = 0; k < iTemp.size(); k++) {
      int item = iTemp.get(k);
      // create the new prefix: a copy of the current one with item added to its last itemset
      Prefix prefixIStep = prefix.cloneSequence();
      prefixIStep.itemsets.get(prefixIStep.size() - 1).add(item);

      // save the pattern
      List<Position> newBorder = iTempBorder.get(k);
      savePattern(prefixIStep, newBorder.size());
      // recursively try to extend that pattern
      genPatterns(prefixIStep, newBorder, sTemp, iTemp, item, false);
    }

    // check the memory usage
    MemoryLogger.getInstance().checkMemory();
  }