Example No. 1
  public Instances getInstances(List<ImageData> data) {

    CSVLoader loader = new CSVLoader();
    Instances instances;
    File tempFile = null;

    try {
      // Create a temporary CSV file to round-trip the data through CSVLoader
      tempFile = new File("tmp");

      PrintWriter pw;

      try {
        pw = new PrintWriter(tempFile);
      } catch (FileNotFoundException e) {
        throw new Error(e);
      }

      // Load the data into the csv file
      for (int i = 0; i < Reader.featureSize; i++) {
        pw.print(i + ",");
      }

      pw.println("class");

      for (int i = 0; i < data.size(); i++) {

        List<Double> features = data.get(i).getFeatures();
        for (int j = 0; j < features.size(); j++) {
          pw.print(features.get(j) + ",");
        }

        pw.println(data.get(i).getClassType());
      }

      pw.flush();
      pw.close(); // release the file handle before the loader reads it

      // Load the instances from the temp CSV file
      loader.setSource(tempFile);
      instances = loader.getDataSet();
      instances.setClass(instances.attribute("class"));

      return instances;

    } catch (IOException e) {
      throw new Error(e);
    } finally {
      if (tempFile != null) {
        tempFile.delete();
      }
    }
  }
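A note on Example No. 1: the temp-file round trip works, but Weka can also build the Instances in memory and skip the CSV entirely. A minimal sketch, assuming Weka 3.7+ and that the set of class labels (classLabels below) is known up front; names beyond those in the example are illustrative:

  // Sketch: construct the Instances directly instead of writing a temp CSV.
  // Assumes classLabels is a List<String> of the possible class values and
  // that getClassType() returns the label as a String.
  ArrayList<Attribute> attrs = new ArrayList<>();
  for (int i = 0; i < Reader.featureSize; i++) {
    attrs.add(new Attribute(Integer.toString(i))); // numeric feature column
  }
  attrs.add(new Attribute("class", classLabels)); // nominal class attribute
  Instances built = new Instances("images", attrs, data.size());
  built.setClassIndex(built.numAttributes() - 1);
  for (ImageData d : data) {
    double[] vals = new double[built.numAttributes()];
    List<Double> features = d.getFeatures();
    for (int j = 0; j < features.size(); j++) {
      vals[j] = features.get(j);
    }
    vals[vals.length - 1] = classLabels.indexOf(d.getClassType()); // class as value index
    built.add(new DenseInstance(1.0, vals));
  }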
Example No. 2
  /**
   * Loads results from a set of instances contained in the supplied file.
   *
   * @param f a value of type 'File'
   */
  protected void setInstancesFromFile(File f) {

    String fileType = f.getName();
    try {
      m_FromLab.setText("Reading from file...");
      if (f.getName().toLowerCase().endsWith(Instances.FILE_EXTENSION)) {
        fileType = "arff";
        Reader r = new BufferedReader(new FileReader(f));
        setInstances(new Instances(r));
        r.close();
      } else if (f.getName().toLowerCase().endsWith(CSVLoader.FILE_EXTENSION)) {
        fileType = "csv";
        CSVLoader cnv = new CSVLoader();
        cnv.setSource(f);
        Instances inst = cnv.getDataSet();
        setInstances(inst);
      } else {
        throw new Exception("Unrecognized file type");
      }
    } catch (Exception ex) {
      m_FromLab.setText("File '" + f.getName() + "' not recognised as an " + fileType + " file.");
      JOptionPane.showOptionDialog(
          ResultsPanel.this,
          "File '"
              + f.getName()
              + "' not recognised as an "
              + fileType
              + " file.\n"
              + "Reason:\n"
              + ex.getMessage(),
          "Load Instances",
          JOptionPane.YES_NO_OPTION,
          JOptionPane.ERROR_MESSAGE,
          null,
          new String[] {"OK"},
          null);
    }
  }
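A side note on Example No. 2: weka.core.converters.ConverterUtils.DataSource picks an appropriate loader from the file extension, which would collapse the ARFF/CSV branching above into one call. A minimal sketch, not the panel's actual code:

  // Sketch: let DataSource choose the loader (ARFF, CSV, ...) by extension.
  DataSource source = new DataSource(f.getAbsolutePath());
  Instances inst = source.getDataSet();
  setInstances(inst);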
Example No. 3
  public static void test_NHBS_old() throws Exception {
    // load the data
    CSVLoader loader = new CSVLoader();
    // these must come before the getDataSet()
    // loader.setEnclosureCharacters(",\'\"S");
    // loader.setNominalAttributes("16,71"); //zip code, drug name
    // loader.setStringAttributes("");
    // loader.setDateAttributes("0,1");
    // loader.setSource(new File("hcv/data/NHBS/IDU2_HCV_model_012913_cleaned_for_weka.csv"));
    loader.setSource(new File("/home/sasha/hcv/code/data/IDU2_HCV_model_012913_cleaned.csv"));
    Instances nhbs_data = loader.getDataSet();
    loader.setMissingValue("NOVALUE");
    // loader.setMissingValue("");

    nhbs_data.deleteAttributeAt(12); // zip code
    nhbs_data.deleteAttributeAt(1); // date - redundant with age
    nhbs_data.deleteAttributeAt(0); // date
    System.out.println("classifying attribute:");
    nhbs_data.setClassIndex(1); // new index  3->2->1
    nhbs_data.attribute(1).getMetadata().toString(); // HCVEIARSLT1

    // wishlist: perhaps it would be smarter to throw out unclassified instance?  they interfere
    // with the scoring
    nhbs_data.deleteWithMissingClass();
    // nhbs_data.setClass(new Attribute("HIVRSLT"));//.setClassIndex(1); //2nd column.  all are
    // mostly negative
    // nhbs_data.setClass(new Attribute("HCVEIARSLT1"));//.setClassIndex(2); //3rd column

    // #14, i.e. rds_fem, should be made numeric
    System.out.println("NHBS IDU 2009 Dataset");
    System.out.println("Summary of input:");
    // System.out.println(nhbs_data.toSummaryString());
    System.out.println("  Num of classes: " + nhbs_data.numClasses());
    System.out.println("  Num of attributes: " + nhbs_data.numAttributes());
    for (int idx = 0; idx < nhbs_data.numAttributes(); ++idx) {
      Attribute attr = nhbs_data.attribute(idx);
      System.out.println("" + idx + ": " + attr.toString());
      System.out.println("     distinct values:" + nhbs_data.numDistinctValues(idx));
      // System.out.println("" + attr.enumerateValues());
    }

    // System.exit(0);
    // nhbs_data.deleteAttributeAt(0); //response ID
    // nhbs_data.deleteAttributeAt(16); //zip

    // Classifier classifier = new NNge(); //best nearest-neighbor classifier: 40.00
    // Classifier classifier = new MINND();
    // Classifier classifier = new CitationKNN();
    // Classifier classifier = new LibSVM(); //requires LibSVM classes. only gets 37.7%
    // Classifier classifier = new SMOreg();
    // Classifier classifier = new LinearNNSearch();

    // LinearRegression: Cannot handle multi-valued nominal class!
    // Classifier classifier = new LinearRegression();

    RandomForest classifier = new RandomForest();
    String[] options = {
      "-I", "100", "-K", "4"
    }; // -I trees, -K features per tree.  generally, might want to optimize (or not:
       // https://cwiki.apache.org/confluence/display/MAHOUT/Random+Forests)
    classifier.setOptions(options); // declared as RandomForest so setOptions is available
    // Classifier classifier = new Logistic();

    // KStar classifier = new KStar();
    // classifier.setGlobalBlend(20); //the amount of not greedy, in percent

    // does poorly
    // Classifier classifier = new AdaBoostM1();
    // Classifier classifier = new MultiBoostAB();
    // Classifier classifier = new Stacking();

    // building a C45 tree classifier
    // J48 classifier = new J48(); // new instance of tree
    // String[] options = new String[1];
    // options[0] = "-U"; // unpruned tree
    // classifier.setOptions(options); // set the options
    // classifier.buildClassifier(nhbs_data); // build classifier

    // wishlist: remove infrequent values
    // weka.filters.unsupervised.instance.RemoveFrequentValues()
    Filter f1 = new RemoveUseless();
    f1.setInputFormat(nhbs_data);
    nhbs_data = Filter.useFilter(nhbs_data, f1);

    // evaluation
    Evaluation eval = new Evaluation(nhbs_data);
    eval.crossValidateModel(classifier, nhbs_data, 10, new Random(1));
    System.out.println(eval.toSummaryString("\nResults\n\n", false));
    System.out.println(eval.toClassDetailsString());
    // System.out.println(eval.toCumulativeMarginDistributionString());
  }
Example No. 4
  public static void main(String[] args) {

    if (args.length < 1) {
      System.out.println("usage: C4_5TweetTopicCategorization <root_path>");
      System.exit(-1);
    }

    String rootPath = args[0];
    File dataFolder = new File(rootPath + "/data");
    String resultFolderPath = rootPath + "/results/C4_5/";

    CrisisMailer crisisMailer = CrisisMailer.getCrisisMailer();
    Logger logger = Logger.getLogger(C4_5TweetTopicCategorization.class);
    PropertyConfigurator.configure(Constants.LOG4J_PROPERTIES_FILE_PATH);

    File resultFolder = new File(resultFolderPath);
    if (!resultFolder.exists()) resultFolder.mkdirs(); // mkdirs creates missing parents too

    CSVLoader csvLoader = new CSVLoader();

    try {
      for (File dataSetName : dataFolder.listFiles()) {

        Instances data = null;
        try {
          csvLoader.setSource(dataSetName);
          csvLoader.setStringAttributes("2");
          data = csvLoader.getDataSet();
        } catch (IOException ioe) {
          logger.error(ioe);
          crisisMailer.sendEmailAlert(ioe);
          System.exit(-1);
        }

        data.setClassIndex(data.numAttributes() - 1);
        data.deleteWithMissingClass();

        Instances vectorizedData = null;
        StringToWordVector stringToWordVectorFilter = new StringToWordVector();
        try {
          stringToWordVectorFilter.setInputFormat(data);
          stringToWordVectorFilter.setAttributeIndices("2");
          stringToWordVectorFilter.setIDFTransform(true);
          stringToWordVectorFilter.setLowerCaseTokens(true);
          stringToWordVectorFilter.setOutputWordCounts(false);
          stringToWordVectorFilter.setUseStoplist(true);

          vectorizedData = Filter.useFilter(data, stringToWordVectorFilter);
          vectorizedData.deleteAttributeAt(0);
          // System.out.println(vectorizedData);
        } catch (Exception exception) {
          logger.error(exception);
          crisisMailer.sendEmailAlert(exception);
          System.exit(-1);
        }

        J48 j48Classifier = new J48();

        /*
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(stringToWordVectorFilter);
        filteredClassifier.setClassifier(j48Classifier);
        */

        try {
          Evaluation eval = new Evaluation(vectorizedData);
          eval.crossValidateModel(
              j48Classifier, vectorizedData, 5, new Random(System.currentTimeMillis()));

          FileOutputStream resultOutputStream =
              new FileOutputStream(new File(resultFolderPath + dataSetName.getName()));

          resultOutputStream.write(eval.toSummaryString("=== Summary ===", false).getBytes());
          resultOutputStream.write(eval.toMatrixString().getBytes());
          resultOutputStream.write(eval.toClassDetailsString().getBytes());
          resultOutputStream.close();

        } catch (Exception exception) {
          logger.error(exception);
          crisisMailer.sendEmailAlert(exception);
          System.exit(-1);
        }
      }
    } catch (Exception exception) {
      logger.error(exception);
      crisisMailer.sendEmailAlert(exception);
      System.exit(-1);
    }
  }
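A note on Example No. 4: the commented-out FilteredClassifier block hints at a cleaner setup. Applying StringToWordVector once, before cross-validation, lets the test fold's vocabulary influence training; wrapping the filter and classifier together rebuilds the filter inside each fold. A minimal sketch of that variant:

  // Sketch: the filter is re-applied per fold, so no vocabulary leaks across folds.
  FilteredClassifier filteredClassifier = new FilteredClassifier();
  filteredClassifier.setFilter(stringToWordVectorFilter);
  filteredClassifier.setClassifier(j48Classifier);

  Evaluation eval = new Evaluation(data); // note: the raw string data, not vectorizedData
  eval.crossValidateModel(filteredClassifier, data, 5, new Random(1));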
Example No. 5
  public static void main(String[] args) throws Exception {

    /*
     * First we load our predictions from the CSV formatted file.
     */
    CSVLoader predictCsvLoader = new CSVLoader();
    predictCsvLoader.setSource(new File("predict.csv"));

    /*
     * Since we are not using the ARFF format here, we have to give the
     * loader a little bit of information about the data types. Columns
     * 3,8,10 need to be of type string and columns 1,4,11 are nominal
     * types.
     */
    predictCsvLoader.setStringAttributes("3,8,10");
    predictCsvLoader.setNominalAttributes("1,4,11");
    Instances predictDataSet = predictCsvLoader.getDataSet();

    /*
     * Here we set the attribute we want to test the predictions with.
     */
    Attribute testAttribute = predictDataSet.attribute(0);
    predictDataSet.setClass(testAttribute);

    /*
     * We still have to remove all string attributes before we can test
     */
    predictDataSet.deleteStringAttributes();

    /*
     * Next we load the training data from our ARFF file
     */
    ArffLoader trainLoader = new ArffLoader();
    trainLoader.setSource(new File("train.arff"));
    trainLoader.setRetrieval(Loader.BATCH);
    Instances trainDataSet = trainLoader.getDataSet();

    /*
     * Now we tell the data set which attribute we want to classify, in our
     * case, we want to classify the first column: survived
     */
    Attribute trainAttribute = trainDataSet.attribute(0);
    trainDataSet.setClass(trainAttribute);

    /*
     * The RandomForest implementation cannot handle columns of type string,
     * so we remove them for now.
     */
    trainDataSet.deleteStringAttributes();

    /*
     * Now we read in the serialized model from disk
     */
    Classifier classifier = (Classifier) SerializationHelper.read("titanic.model");

    /*
     * Next we will use an Evaluation class to evaluate the performance of
     * our Classifier.
     */
    Evaluation evaluation = new Evaluation(trainDataSet);
    evaluation.evaluateModel(classifier, predictDataSet, new Object[] {});

    /*
     * After we evaluate the Classifier, we write out the summary
     * information to the screen.
     */
    System.out.println(classifier);
    System.out.println(evaluation.toSummaryString());
  }
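For completeness, titanic.model in Example No. 5 would have been produced by an earlier training run. A minimal sketch of that step, assuming the same train.arff preprocessing as above:

  // Sketch: train and serialize the model that Example No. 5 reads back.
  Classifier rf = new RandomForest();
  rf.buildClassifier(trainDataSet);
  SerializationHelper.write("titanic.model", rf);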
Example No. 6
  public void processFolder(File folder) throws Exception {

    if (!folder.isDirectory()) {
      // manipulate file here
      String fileName = folder.getName();
      System.out.println(fileName);
      // String extension = getFileExtension(fileName);
      // Parse ARFF test data only in the .arff branch below; attempting it
      // unconditionally would fail for CSV/JSON/XRFF files.
      if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) {
        CSVLoader loader = new CSVLoader();
        loader.setSource(new File(folder.getAbsolutePath()));
        traindata = loader.getDataSet();
        System.out.println(traindata.toSummaryString());
        this.chooseClassifier();
      } else if (!fileName.startsWith(".") && fileName.contains(".txt")) {

        TextDirectoryLoader loader = new TextDirectoryLoader();
        System.out.println("About to load text file " + fileName);
        System.out.println("Name of path " + folder.getAbsolutePath());
        loader.setSource(folder); // note: TextDirectoryLoader expects a directory, not a single .txt file
        traindata = loader.getDataSet();
        System.out.println(traindata.toSummaryString());
        this.chooseClassifier();

      } else if (!fileName.startsWith(".") && fileName.contains(".json")) {
        JSONLoader loader = new JSONLoader();
        loader.setSource(new File(folder.getAbsolutePath()));
        traindata = loader.getDataSet();
        System.out.println(traindata.toSummaryString());
        this.chooseClassifier();
      } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) {
        XRFFLoader loader = new XRFFLoader();
        loader.setSource(new File(folder.getAbsolutePath()));
        traindata = loader.getDataSet();
        System.out.println(traindata.toSummaryString());
        this.chooseClassifier();
      } else if (!fileName.startsWith(".") && fileName.contains(".arff")) {
        traindata = new Instances(new BufferedReader(new FileReader(folder.getAbsolutePath())));
        testdata = new Instances(new BufferedReader(new FileReader(folder)));
        System.out.println(traindata.toSummaryString());
        this.chooseClassifier();
      } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) {
        DatabaseConnection loader = new DatabaseConnection();
        loader.connectToDatabase();
        InstanceQuery query = new InstanceQuery();
        query.setUsername("lamogha");
        query.setPassword("l@mmyPHD");
        query.setQuery("select * from customers");
        // You can declare that your data set is sparse
        // query.setSparseData(true);
        Instances data = query.retrieveInstances();
        System.out.println(data.toSummaryString());
        this.chooseClassifier();
      }
    } else {

      for (final File fileEntry : folder.listFiles()) {
        if (fileEntry.isDirectory()) {
          this.processFolder(fileEntry);
        } else {
          // manipulate file here
          String fileName = fileEntry.getName();
          System.out.println(fileName);

          if (!fileName.startsWith(".")
              && (fileName.contains(".csv") || fileName.contains(".xls"))) {
            CSVLoader loader = new CSVLoader();
            loader.setSource(new File(fileEntry.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
          } else if (!fileName.startsWith(".") && fileName.contains(".txt")) {

            TextDirectoryLoader loader = new TextDirectoryLoader();
            System.out.println("About to load text file " + fileName);
            System.out.println("Name of path " + fileEntry.getAbsolutePath());
            loader.setSource(folder); // TextDirectoryLoader reads the whole directory, not the single file
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();

          } else if (!fileName.startsWith(".") && fileName.contains(".json")) {
            JSONLoader loader = new JSONLoader();
            loader.setSource(new File(fileEntry.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
          } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) {
            XRFFLoader loader = new XRFFLoader();
            loader.setSource(new File(fileEntry.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
          } else if (!fileName.startsWith(".")) {
            traindata =
                new Instances(new BufferedReader(new FileReader(fileEntry.getAbsolutePath())));
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
          } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) {
            DatabaseConnection loader = new DatabaseConnection();
            loader.connectToDatabase();
            InstanceQuery query = new InstanceQuery();
            query.setUsername("lamogha");
            query.setPassword("l@mmyPHD");
            query.setQuery("select * from customers");
            // You can declare that your data set is sparse
            // query.setSparseData(true);
            Instances data = query.retrieveInstances();
            System.out.println(data.toSummaryString());
            this.chooseClassifier();
          }
        }
      }
      // System.exit(0);
    }
  }
Example No. 7
  public static Instances loadCSV(String fileName) throws IOException {
    CSVLoader algebraFeaturesCSVLoader = new CSVLoader();
    algebraFeaturesCSVLoader.setFile(new File(fileName));
    return algebraFeaturesCSVLoader.getDataSet();
  }
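A hypothetical call site for loadCSV (the file name is illustrative): the returned Instances still needs a class attribute set before it can be used for training.

  // Sketch: load a CSV and mark the last column as the class.
  Instances algebra = loadCSV("algebra_features.csv");
  algebra.setClassIndex(algebra.numAttributes() - 1);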