コード例 #1
ファイル: AlgoBIDEPlus.java プロジェクト: vikasmb/DataZoomer
  * For each item, collect the IDs of the sequences that contain that item.
  * @param database the current sequence database
  * @return a map from each item to the set of IDs of the sequences that contain it
 private Map<Integer, Set<Integer>> findSequencesContainingItems(SequenceDatabase database) {
   // Purpose: scan every itemset of every sequence in the database and build a map
   // from each item to a set of sequence IDs.
   // NOTE(review): this excerpt is truncated. The closing braces are missing, the type
   // arguments of the HashMap instantiation below are garbled, and the statement that the
   // "add the current sequence id" comment refers to is not present. Restore them from the
   // original file before compiling.
   // We use a map to store the sequence IDs where an item appears
   // Key : item   Value :  a set of sequence IDs
   Map<Integer, Set<Integer>> mapSequenceID =
       new HashMap<
               Integer>>(); // keeps the sequence IDs: <item ID, set of
                            // sequence IDs>
   // for each sequence
   for (Sequence sequence : database.getSequences()) {
     // for each itemset in that sequence
     for (List<Integer> itemset : sequence.getItemsets()) {
       // for each item
       for (Integer item : itemset) {
         // get the set of sequence ids for that item
         Set<Integer> sequenceIDs = mapSequenceID.get(item);
         if (sequenceIDs == null) {
           // first time this item is seen: create its set and register it in the map
           sequenceIDs = new HashSet<Integer>();
           mapSequenceID.put(item, sequenceIDs);
         // add the current sequence id to this set
         // (the corresponding statement is missing from this excerpt)
   return mapSequenceID;
コード例 #2
ファイル: Utility.java プロジェクト: pichitpr/Trimmed_SPMF
 public static SequenceDatabase load(List<List<Integer>>[] aryListDB) {
   // Purpose: build a SequenceDatabase from an array in which each non-null element is
   // the list of itemsets of one sequence.
   // NOTE(review): this excerpt is truncated. The body of the inner loop and the closing
   // braces are missing, so how itemsets are attached to the sequence and how the sequence
   // is added to the database is not visible here.
   SequenceDatabase db = new SequenceDatabase();
   Sequence seq;
   for (int i = 0; i < aryListDB.length; i++) {
     // null slots are skipped
     if (aryListDB[i] != null) {
       // the array index is passed to the Sequence constructor
       seq = new Sequence(i);
       // for each itemset of this sequence
       for (List<Integer> itemset : aryListDB[i]) {
   return db;
コード例 #3
ファイル: Utility.java プロジェクト: pichitpr/Trimmed_SPMF
 public static SequenceDatabase load(String strDB) {
   // Purpose: parse a textual sequence database. Each line is one sequence whose fields
   // are separated by '|'. The first field is parsed as an integer and passed to the
   // Sequence constructor. Each following field is one itemset whose items are separated
   // by whitespace.
   // NOTE(review): this excerpt is truncated. The body of the innermost loop and the
   // closing braces are missing, so the item conversion and the insertion into the
   // database are not visible here.
   SequenceDatabase db = new SequenceDatabase();
   Sequence seq;
   List<Integer> iset;
   // one sequence per line
   String[] sequences = strDB.split("\\n");
   String[] itemsets;
   String[] items;
   for (String seqStr : sequences) {
     // split on '|' with optional surrounding whitespace
     itemsets = seqStr.trim().split("\\s*\\|\\s*");
     // field 0 is the integer given to the Sequence constructor
     seq = new Sequence(Integer.valueOf(itemsets[0]));
     // fields 1..n are the itemsets
     for (int i = 1; i < itemsets.length; i++) {
       items = itemsets[i].split("\\s+");
       iset = new ArrayList<Integer>();
       // for each item token of this itemset
       for (String itemStr : items) {
   return db;
コード例 #4
ファイル: AlgoBIDEPlus.java プロジェクト: vikasmb/DataZoomer
   * This is the main method for the BIDE+ algorithm.
   * @param database a sequence database
   * @param outputFilePath the path of the output file, or null to keep the result in memory
   * @throws IOException if an error occurs while writing the output file.
  private void bide(SequenceDatabase database, String outputFilePath) throws IOException {
    // Purpose: entry point of the mining run. It chooses the output mode, indexes the
    // sequences containing each item, builds a reduced initial database, and then
    // processes every item whose number of containing sequences reaches minsuppAbsolute.
    // NOTE(review): this excerpt is truncated. Many closing braces and the statements
    // that save a closed pattern or follow the recursion are missing. Restore them from
    // the original file before compiling.

    // if the user wants to keep the result in memory
    if (outputFilePath == null) {
      writer = null;
      patterns = new SequentialPatterns("FREQUENT SEQUENTIAL PATTERNS");
    } else { // if the user wants to save the result to a file
      patterns = null;
      writer = new BufferedWriter(new FileWriter(outputFilePath));

    // The algorithm first scans the database to find all frequent items.
    // The algorithm notes the sequences in which these items appear.
    // This is stored in a map:  Key: item  Value : IDs of sequences containing the item
    Map<Integer, Set<Integer>> mapSequenceID = findSequencesContainingItems(database);


    //		coocMapBefore = new HashMap<Integer, Map<Integer,
    // Integer>>(mapSequenceID.entrySet().size());

    // we create a database
    initialDatabase = new ArrayList<PseudoSequenceBIDE>();
    // for each sequence of the original database
    for (Sequence sequence : database.getSequences()) {
      // we make a copy of the sequence while removing infrequent items
      Sequence optimizedSequence = sequence.cloneSequenceMinusItems(mapSequenceID, minsuppAbsolute);
      if (optimizedSequence.size() != 0) {
        // if this sequence has size >0, we add it to the new database
        initialDatabase.add(new PseudoSequenceBIDE(optimizedSequence, 0, 0));

      //			// update COOC map
      //			HashSet<Integer> alreadySeen = new HashSet<Integer>();
      //			for(List<Integer> itemset : optimizedSequence.getItemsets()) {
      //				for(Integer item : itemset) {
      //					Map<Integer, Integer> mapCoocItem = coocMapBefore.get(item);
      //					if(mapCoocItem == null) {
      //						mapCoocItem = new HashMap<Integer, Integer>();
      //						coocMapBefore.put(item, mapCoocItem);
      //					}
      //					for(Integer itemSeen : alreadySeen) {
      //						if(itemSeen != item) {
      //							Integer frequency = mapCoocItem.get(itemSeen);
      //							if(frequency == null) {
      //								mapCoocItem.put(itemSeen, 1);
      //							}else {
      //								mapCoocItem.put(itemSeen, frequency+1);
      //							}
      //						}
      //					}
      //					alreadySeen.add(item);
      //				}
      //			}

    // For each item found in the database
    for (Entry<Integer, Set<Integer>> entry : mapSequenceID.entrySet()) {
      // if the item is frequent (it appears in at least minsuppAbsolute sequences)
      if (entry.getValue().size() >= minsuppAbsolute) {
        //				Map<Integer, Integer> mapCoocItem = coocMapBefore.get(entry.getKey());
        //				if(mapCoocItem != null) {
        //					for(Integer supportCoocBefore : mapCoocItem.values()) {
        //						if(supportCoocBefore >= entry.getValue().size()) {
        //							continue loop1;
        //						}
        //					}
        //				}

        // build the projected database with this item
        Integer item = entry.getKey();
        List<PseudoSequenceBIDE> projectedContext =
            buildProjectedContextSingleItem(item, initialDatabase, false, entry.getValue());

        // Create the prefix with this item
        SequentialPattern prefix = new SequentialPattern();
        prefix.addItemset(new Itemset(item));
        // set the sequence IDs of this prefix
        // (the corresponding statement is missing from this excerpt)

        // variable to store the largest support of patterns
        // that will be found starting with this prefix
        if (projectedContext.size() >= minsuppAbsolute) {
          int successorSupport = 0;

          // recurse on the prefix only when back-scan pruning does not apply
          if (!checkBackScanPruning(prefix, entry.getValue())) {
            successorSupport = recursion(prefix, projectedContext); // recursion

          // Finally, because this prefix has support > minsup
          // and passed the backscan pruning,
          // we check if it has no successor with support >= minsup
          // (a forward extension)
          // IF no forward extension
          if (successorSupport != entry.getValue().size()) { // ######### MODIFICATION ####
            // IF there is also no backward extension
            if (!checkBackwardExtension(prefix, entry.getValue())) {
              // the pattern is closed and we save it
              // (the save statement is missing from this excerpt)
        } else {
          if (!checkBackwardExtension(prefix, entry.getValue())) {
            // the pattern is closed and we save it
            // (the save statement is missing from this excerpt)
    // check the memory usage for statistics
コード例 #5
   * This method generates statistics for a sequence database (a file)
   * @param path the path to the file
   * @throws IOException exception if there is a problem while reading the file.
  public void getStats(String path) throws IOException {

    //  (1) First we will read the sequence database into memory.
    // (actually, we don't really need to read it into memory because it
    //  just require a single pass, but the code is more simple like that
    //  - it could be optimized, if necessary).

    List<Sequence> sequences =
        new ArrayList<Sequence>(); //  A sequence database is stored as a list of sequences
    int maxItem = 0; // the largest id for items in the database

    String thisLine; // a temporary variable to read each line from the file

    BufferedReader myInput = null;
    try {
      // we read the file line by line
      FileInputStream fin = new FileInputStream(new File(path));
      myInput = new BufferedReader(new InputStreamReader(fin));
      int i = 0; // used to count the lines.

      // for each line until the end of the file
      while ((thisLine = myInput.readLine()) != null) {
        // we split the line according to spaces into tokens
        String tokens[] = thisLine.split(" ");
        // we create a new sequence object to store the sequence that correspond to this line.
        Sequence sequence = new Sequence(i++);
        // we create a list of integer to store the current itemset from the sequence
        // that correspond to this line.
        List<Integer> itemset = new ArrayList<Integer>();
        // For each token
        for (String token : tokens) {
          // if the token starts with "<" it means that it is a timestamp
          if (token.codePointAt(0) == '<') {
            // we just ignore it for statistics..
          // if the token is "-1" it means that it is the end of an itemset
          else if (token.equals("-1")) {
            // we add the itemset to the sequence
            // we reset the variable itemset to read the next itemset
            itemset = new ArrayList<Integer>();
          // if the token is "-2", it indicates the end of this sequence and the
          // end of the line
          else if (token.equals("-2")) {
            // we add the sequence to the list of sequences
          // otherwise, it means that the token is an item
          else {
            // we convert to an integer
            Integer item = Integer.parseInt(token);
            // we check if it has the largest value because we
            // want to keep this information
            if (item >= maxItem) {
              maxItem = item;
            // we add the item to the current itemset.
    } catch (Exception e) {
    } finally {
      if (myInput != null) {

    //  We finished reading the database into memory.
    //  We will calculate statistics on this sequence database.

    System.out.println("============  SEQUENCE DATABASE STATS ==========");
    System.out.println("Number of sequences : " + sequences.size());

    // we initialize some variables that we will use to generate the statistics
    java.util.Set<Integer> items = new java.util.HashSet<Integer>(); // the set of all items
    List<Integer> sizes = new ArrayList<Integer>(); // the lengths of each sequence
    List<Integer> itemsetsizes = new ArrayList<Integer>(); // the lengths of each itemset
    List<Integer> differentitems =
        new ArrayList<Integer>(); // the number of different item for each sequence
    List<Integer> appearXtimesbySequence =
        new ArrayList<
            Integer>(); // the average number of times that items appearing in a sequence, appears
                        // in this sequence.
    // Loop on sequences from the database
    for (Sequence sequence : sequences) {
      // we add the size of this sequence to the list of sizes

      // this map is used to calculate the number of times that each item
      // appear in this sequence.
      // the key is an item
      // the value is the number of occurences of the item until now for this sequence
      HashMap<Integer, Integer> mapIntegers = new HashMap<Integer, Integer>();

      // Loop on itemsets from this sequence
      for (List<Integer> itemset : sequence.getItemsets()) {
        // we add the size of this itemset to the list of itemset sizes
        // Loop on items from this itemset
        for (Integer item : itemset) {
          // If the item is not in the map already, we set count to 0
          Integer count = mapIntegers.get(item);
          if (count == null) {
            count = 0;
          // otherwise we set the count to count +1
          count = count + 1;
          mapIntegers.put(item, count);
          // finally, we add the item to the set of items
      // we add all items found in this sequence to the global list
      // of different items for the database

      // for each item appearing in this sequence,
      // we put  the number of times in a global list "appearXtimesbySequence"
      // previously described.
      for (Entry<Integer, Integer> entry : mapIntegers.entrySet()) {

    // we print the statistics
    System.out.println("File " + path);
    System.out.println("Number of distinct items: " + items.size());
    System.out.println("Largest item id: " + maxItem);
        "Average number of itemsets per sequence : "
            + calculateMean(sizes)
            + " standard deviation: "
            + calculateStdDeviation(sizes)
            + " variance: "
            + calculateVariance(sizes));
        "Average number of distinct item per sequence : "
            + calculateMean(differentitems)
            + " standard deviation: "
            + calculateStdDeviation(differentitems)
            + " variance: "
            + calculateVariance(differentitems));
        "Average number of occurences in a sequence for each item appearing in a sequence : "
            + calculateMean(appearXtimesbySequence)
            + " standard deviation: "
            + calculateStdDeviation(appearXtimesbySequence)
            + " variance: "
            + calculateVariance(appearXtimesbySequence));
        "Average number of items per itemset : "
            + calculateMean(itemsetsizes)
            + " standard deviation: "
            + calculateStdDeviation(itemsetsizes)
            + " variance: "
            + calculateVariance(itemsetsizes));