Example #1
0
  /**
   * Reads all the information in a database file and loads it into memory.
   *
   * <p>The header is parsed first through {@code parseHeader}. Every remaining line returned by
   * the parser is then turned into an {@code Instance}, and the resulting instances are stored in
   * {@code instanceSet}. When {@code isTrain} is set and there is exactly one output attribute, the
   * attribute statistics are initialised before the data lines are read and finished once the
   * data has been read without logged errors.
   *
   * @param fileName is the database file name.
   * @param isTrain is a flag that indicates whether the database is a train set or a test set.
   * @throws DatasetException if the error logger holds at least one error after all the data
   *     lines have been read; the exception carries the logged errors.
   * @throws HeaderFormatException if there is any lexical or syntactical error in the header of
   *     the input file
   */
  public void readSet(String fileName, boolean isTrain)
      throws DatasetException, HeaderFormatException {
    System.out.println("Opening the file: " + fileName + ".");

    // A fresh error keeper is installed for this load; it is inspected once reading is done.
    errorLogger = new FormatErrorKeeper();

    InstanceParser parser = new InstanceParser(fileName, isTrain);

    // Reading information in the header, i.e., @relation, @attribute, @inputs and @outputs
    parseHeader(parser, isTrain);

    System.out.println(
        " The number of output attributes is: " + Attributes.getOutputNumAttributes());

    // The attribute statistics are initialised only in train mode with a single output.
    if (isTrain && Attributes.getOutputNumAttributes() == 1) {
      Attributes.initStatistics();
    }

    System.out.println("\n\n  > Reading the data ");

    // Temporary growable buffer: the number of instances is unknown until the parser is drained.
    // Each instance receives its position in the buffer as third constructor argument.
    Vector<Instance> tempSet = new Vector<Instance>(1000, 100000);
    String line;
    while ((line = parser.getLine()) != null) {
      tempSet.addElement(new Instance(line, isTrain, tempSet.size()));
    }

    // The buffer is converted to a fixed-size array of instances.
    System.out.println("    > Number of instances read: " + tempSet.size());
    instanceSet = new Instance[tempSet.size()];
    tempSet.copyInto(instanceSet);

    // Any error logged so far aborts the load: every error is printed, then all are thrown.
    if (errorLogger.getNumErrors() > 0) {
      System.out.println(
          "There has been " + errorLogger.getAllErrors().size() + " errors in the Dataset format.");
      for (int k = 0; k < errorLogger.getNumErrors(); k++) {
        errorLogger.getError(k).print();
      }
      throw new DatasetException(
          "There has been " + errorLogger.getAllErrors().size() + " errors in the Dataset format",
          errorLogger.getAllErrors());
    }

    System.out.println(
        "\n  > Finishing the statistics: (isTrain)"
            + isTrain
            + ", (# out attributes)"
            + Attributes.getOutputNumAttributes());
    // If being on a train dataset with a single output, the statistics are finished.
    if (isTrain && Attributes.getOutputNumAttributes() == 1) {
      Attributes.finishStatistics();
    }

    System.out.println("  >> File LOADED CORRECTLY!!");
  } // end of readSet