   * This method calculates the frequency of each item in one database pass. Then it removes all
   * items that are not frequent.
   * @param database : a sequence database
   * @return A map such that key = item and value = a map where key = tid and value = Occurence.
   *     This map allows knowing the frequency of each item and its first and last occurrence in
   *     each sequence.
  private Map<String, Map<Integer, Occurence>> removeItemsThatAreNotFrequent(
      SequenceDatabase database) {
    // Two passes over the database: the first builds, for every item, a map from sequence id
    // to an Occurence object; the second walks every itemset again and compares each item's
    // support against minsuppRelative.
    // (1) Count the support of each item in the database in one database pass
    // The map is stored in mapItemCount (assigned here, not declared locally) and the same
    // object is returned at the end of the method.
    mapItemCount = new HashMap<String, Map<Integer, Occurence>>(); // <item, Map<tid, occurrence>>

    // for each sequence
    for (Sequence sequence : database.getSequences()) {
      // for each itemset
      // NOTE(review): j is a short, so this loop only works for sequences with fewer than
      // 32768 itemsets -- confirm that this limit is acceptable.
      for (short j = 0; j < sequence.getItemsets().size(); j++) {
        List<String> itemset = sequence.get(j);
        // for each item
        for (int i = 0; i < itemset.size(); i++) {
          String itemI = itemset.get(i);
          // get the <tid, occurrence> map of this item, creating it on first sight of the item
          Map<Integer, Occurence> occurences = mapItemCount.get(itemI);
          if (occurences == null) {
            occurences = new HashMap<Integer, Occurence>();
            mapItemCount.put(itemI, occurences);
          // get the occurrence record of this item for the current sequence, creating it on
          // first sight of the item in this sequence
          Occurence occurence = occurences.get(sequence.getId());
          if (occurence == null) {
            occurence = new Occurence(sequence.getId());
            occurences.put(sequence.getId(), occurence);
    //		System.out.println("NUMBER OF DIFFERENT ITEMS : " + mapItemCount.size());
    // (2) remove all items that are not frequent from the database
    for (Sequence sequence : database.getSequences()) {
      // index-based while loops are used instead of for-each loops for both levels
      int i = 0;
      while (i < sequence.getItemsets().size()) {
        List<String> itemset = sequence.getItemsets().get(i);
        int j = 0;
        while (j < itemset.size()) {
          // the support of an item is the number of distinct sequence ids recorded for it
          double count = mapItemCount.get(itemset.get(j)).size();

          // an item is infrequent when its support is below minsuppRelative
          if (count < minsuppRelative) {
          } else {
    // return the <item, <tid, occurrence>> map built in step (1)
    return mapItemCount;
  * For each item, collect the IDs of the sequences containing that item.
  * @param contexte the current sequence database
  * @return a map from each item to the set of IDs of the sequences that contain it
 private Map<String, Set<Integer>> findSequencesContainingItems(SequenceDatabase contexte) {
   // We use a map to store the sequence IDs where an item appears
   // Key : item   Value :  a set of sequence IDs
   Map<String, Set<Integer>> mapSequenceID =
       new HashMap<
           String, Set<Integer>>(); // used to keep the sequence IDs: <item ID, set of sequence
   // IDs>
   // for each sequence in the current database
   for (Sequence sequence : contexte.getSequences()) {
     // for each itemset in this sequence
     for (List<String> itemset : sequence.getItemsets()) {
       // for each item
       for (String item : itemset) {
         // get the set of sequence IDs for this item until now
         Set<Integer> sequenceIDs = mapSequenceID.get(item);
         if (sequenceIDs == null) {
           // if the set does not exist, create one
           // and register it in the map so that later occurrences of the item reuse it
           sequenceIDs = new HashSet<Integer>();
           mapSequenceID.put(item, sequenceIDs);
         // add the sequence ID of the current sequence to the
         // set of sequences IDs of this item
         //					}
   // the map is returned unfiltered: it contains an entry for every item seen in the database
   return mapSequenceID;
   * This is the main method of the PrefixSpan algorithm; it is called to start the algorithm.
   * @param outputFilePath an output file path if the result should be saved to a file, or null if
   *     the result should be kept in memory.
   * @param database a sequence database
   * @throws IOException if an error occurs while writing the output file
  private void prefixSpan(SequenceDatabase database, String outputFilePath) throws IOException {
    // Exactly one of the two result sinks (patterns / writer) is non-null after this block.
    // if the user wants to keep the result in memory
    if (outputFilePath == null) {
      writer = null;
      patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
    } else { // if the user wants to save the result to a file
      patterns = null;
      // NOTE(review): FileWriter(String) encodes with the default charset -- confirm that this
      // is the intended encoding of the output file.
      writer = new BufferedWriter(new FileWriter(outputFilePath));

    // We have to scan the database to find all frequent patterns of size 1.
    // We note the sequences in which these patterns appear.
    Map<String, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);


    // Create a list of pseudosequence
    List<PseudoSequence> initialContext = new ArrayList<PseudoSequence>();
    // for each sequence in  the database
    for (Sequence sequence : database.getSequences()) {
      // remove infrequent items
      // (presumably cloneSequenceMinusItems drops the items whose sequence-ID set is smaller
      // than minsuppAbsolute -- TODO confirm against its implementation)
      Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
      if (optimizedSequence.size() != 0) {
        // if the size is > 0, create a pseudo sequence with this sequence
        // (sequences that became empty are left out of the initial context)
        initialContext.add(new PseudoSequence(optimizedSequence, 0, 0));

    // For each item
    for (Entry<String, Set<Integer>> entry : mapSequenceID.entrySet()) {
      // if the item is frequent  (has a support >= minsup)
      // the support of an item is the number of sequence IDs recorded for it
      if (entry.getValue().size() >= minsuppAbsolute) { // if the item is frequent
        // build the projected context
        // (the meaning of the third argument, false, is defined by buildProjectedContext)
        String item = entry.getKey();
        List<PseudoSequence> projectedContext = buildProjectedContext(item, initialContext, false);

        // Create the prefix for the projected context.
        // The prefix is a pattern made of a single itemset that contains only this item.
        SequentialPattern prefix = new SequentialPattern(0);
        prefix.addItemset(new Itemset(item));

        // The prefix is a frequent sequential pattern.
        // We save it in the result.
        savePattern(prefix); // we found a sequence.

        // Recursive call !
        // Try to grow the prefix using the sequences of its projected context.
        recursion(prefix, projectedContext);
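   * This method searches for items to expand the right side of a rule I --> J with any item c.
   * This results in rules of the form I --> J U {c}. The method makes sure that: - c is not
   * already included in I or J - c appears at least minsup times in tidsIJ after the first
   * occurrence of I - c is lexically bigger than all items in J
   * @param itemsetI the items of the left side I of the rule
   * @param itemsetJ the items of the right side J of the rule
   * @param tidsI the IDs of the sequences containing I (its size is sup(I) in the confidence)
   * @param tidsJ the IDs of the sequences containing J
   * @param tidsIJ the IDs of the sequences containing the rule I --> J
   * @throws IOException if an I/O error is raised by one of the called methods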
  private void expandRight(
      String[] itemsetI,
      String[] itemsetJ,
      Set<Integer> tidsI,
      Collection<Integer> tidsJ,
      Collection<Integer> tidsIJ // ,
      //    						Map<Integer, Occurence> occurencesI,
      //    						Map<Integer, Occurence> occurencesJ
      ) throws IOException {

    // Step 1: scan the sequences containing I --> J with a sliding window and collect, for each
    // candidate item c, the set of sequence ids in which c can extend the right side.
    // map-key: item   map-value: set of tids containing the item
    Map<String, Set<Integer>> frequentItemsC = new HashMap<String, Set<Integer>>();

    // for each sequence containing I-->J
    for (Integer tid : tidsIJ) {
      // the tid is used as a position in the list of sequences of the database
      Sequence sequence = database.getSequences().get(tid);

      // per-sequence bookkeeping: item -> itemset index for the items of I and of J seen in the
      // window, and item -> list of itemset indexes for the items of I
      LinkedHashMap<String, Integer> mapMostRightFromI = new LinkedHashMap<String, Integer>();
      LinkedHashMap<String, Integer> mapMostRightFromJ = new LinkedHashMap<String, Integer>();
      LinkedHashMap<String, LinkedList<Integer>> mapMostLeftFromI =
          new LinkedHashMap<String, LinkedList<Integer>>();

      // index of the last itemset already scanned for candidate items c in this sequence;
      // MIN_VALUE means that nothing has been scanned yet
      int lastItemsetScannedForC = Integer.MIN_VALUE;

      // For each itemset starting from the first...
      int k = 0;
      do {
        // the current window covers the itemsets [k - windowSize + 1, k]
        final int firstElementOfWindow = k - windowSize + 1;
        int lastElementOfWindow = k;

        // remove items from I that fall outside the time window
        int previousISize = mapMostRightFromI.size();
        removeElementOutsideWindowER(mapMostRightFromI, firstElementOfWindow);
        // important: if I was all there, but become smaller we need to clear the
        // hashmap for items of J.
        int currentISize = mapMostRightFromI.size();
        // NOTE(review): the comment above talks about I being complete, but the size is compared
        // with itemsetJ.length rather than itemsetI.length -- confirm which one is intended.
        if (previousISize == itemsetJ.length && previousISize != currentISize) {

        // remove items from J that fall outside the time window
        removeElementOutsideWindowER(mapMostRightFromJ, firstElementOfWindow);

        // For each item of the current itemset
        for (String item : sequence.get(k)) {
          // record the first position until now of each item in I or J
          // An item of J is only recorded once every item of I is present in the window;
          // otherwise the item is checked against I.
          if (mapMostRightFromI.size() == itemsetI.length && contains(itemsetJ, item)) {
            addToLinked(mapMostRightFromJ, item, k);
          } else if (contains(itemsetI, item)) {
            addToLinked(mapMostRightFromI, item, k);
            // make sure that a position list exists for this item of I
            LinkedList<Integer> list = mapMostLeftFromI.get(item);
            if (list == null) {
              list = new LinkedList<Integer>();
              addToLinked(mapMostLeftFromI, item, list);

        // if all the items of IJ are in the current window
        if (mapMostRightFromI.size() == itemsetI.length
            && mapMostRightFromJ.size() == itemsetJ.length) {

          // remove items from mostLeft that fall outside the time window.
          // at the same time, calculate the minimum index for items of I.
          // NOTE(review): minimum starts at 1 and is only ever raised (to last + 1 when
          // last > minimum), which looks more like a maximum than a minimum -- confirm.
          int minimum = 1;
          for (LinkedList<Integer> list : mapMostLeftFromI.values()) {
            while (true) {
              // NOTE(review): getLast() throws NoSuchElementException on an empty list --
              // confirm that a list can never be empty when it is read here.
              Integer last = list.getLast();
              if (last < firstElementOfWindow) {
              } else {
                if (last > minimum) {
                  minimum = last + 1;

          // we need to scan for items C to extend the rule...
          // Such item c has to appear in the window before the last occurrence of J (before
          // "minimum")
          // and if it was scanned before, it should not be scanned again.
          int itemsetC = minimum;
          // resume right after the last itemset that was already scanned for this sequence
          if (itemsetC < lastItemsetScannedForC) {
            itemsetC = lastItemsetScannedForC + 1;

          // scan the itemsets from itemsetC up to the end of the current window
          for (; itemsetC <= lastElementOfWindow; itemsetC++) {
            for (String itemC : sequence.get(itemsetC)) {
              // if lexical order is not respected or c is included in the rule
              // already.
              if (containsLEX(itemsetI, itemC) || containsLEXPlus(itemsetJ, itemC)) {
              // get the set of tids of this candidate item, creating it on first sight
              Set<Integer> tidsItemC = frequentItemsC.get(itemC);
              if (tidsItemC == null) {
                tidsItemC = new HashSet<Integer>();
                frequentItemsC.put(itemC, tidsItemC);
          // remember how far this sequence has been scanned for candidate items
          lastItemsetScannedForC = lastElementOfWindow;
      // stop at the end of the sequence, or as soon as its last itemset has been scanned for c
      } while (k < sequence.size() && lastItemsetScannedForC < sequence.size() - 1);

    // Step 2: turn each candidate item c with enough support into a rule I --> JC.
    // for each item c found, we create a rule
    for (Entry<String, Set<Integer>> entry : frequentItemsC.entrySet()) {
      Set<Integer> tidsI_JC = entry.getValue();

      // if the support is enough      Sup(R)  =  sup(I --> JC)
      if (tidsI_JC.size() >= minsuppRelative) {
        String itemC = entry.getKey();
        // build JC as a copy of J with c appended in the last position
        String[] itemsetJC = new String[itemsetJ.length + 1];
        System.arraycopy(itemsetJ, 0, itemsetJC, 0, itemsetJ.length);
        itemsetJC[itemsetJ.length] = itemC;
        //     			Itemset itemsetJC = new Itemset(ruleIJ.getItemset2());
        // 				itemsetJC.addItem(itemC);

        // compute the tids of JC by re-scanning, with a sliding window, the sequences
        // that contain J
        Set<Integer> tidsJC = new HashSet<Integer>();
        for (Integer tid : tidsJ) {
          Sequence sequence = database.getSequences().get(tid);
          // MAP: item : itemset index
          LinkedHashMap<String, Integer> mapAlreadySeenFromJC =
              new LinkedHashMap<String, Integer>();

          // For each itemset
          for (int k = 0; k < sequence.size(); k++) {
            // For each item
            for (String item : sequence.get(k)) {
              if (contains(itemsetJC, item)) { // record the last position of each item in JC
                addToLinked(mapAlreadySeenFromJC, item, k);
            // remove items that fall outside the time window
            // (the window ending at itemset k starts at index k - windowSize + 1)
            Iterator<Entry<String, Integer>> iter = mapAlreadySeenFromJC.entrySet().iterator();
            while (iter.hasNext()) {
              Entry<String, Integer> entryMap = iter.next();
              if (entryMap.getValue() < k - windowSize + 1) {
              } else {
            // if all the items of JC are inside the current window, then record the tid
            if (mapAlreadySeenFromJC.keySet().size() == itemsetJC.length) {
              continue loop1;
        // ----  ----

        // Create rule and calculate its confidence:  Conf(r) = sup(I-->JC) /  sup(I)
        double confI_JC = ((double) tidsI_JC.size()) / tidsI.size();
        //				Rule ruleI_JC = new Rule(ruleIJ.getItemset1(), itemsetJC, confI_JC, tidsI_JC.size());

        // if the confidence is enough
        if (confI_JC >= minconf) {
          saveRule(tidsI_JC, confI_JC, itemsetI, itemsetJC);

        // recursive call to expand the right side of the new rule I --> JC further
        expandRight(itemsetI, itemsetJC, tidsI, tidsJC, tidsI_JC); //

        // recursive call to expand left and right side of the rule
        expandLeft(itemsetI, itemsetJC, tidsI, tidsI_JC); // occurencesJ