예제 #1
0
  /**
   * Cross-validates a classifier on the NHBS learning data and prints the outcome to stdout.
   *
   * <p>The method loads the learning records through {@code RawLoader}, copies them into a Weka
   * {@code Instances} dataset whose class attribute is {@code hcv_state}, runs the
   * {@code RemoveUseless} filter over it, prints a per-attribute overview, and finally evaluates a
   * {@code Logistic} classifier with 10-fold cross-validation.
   *
   * @param rng_seed seed handed to {@code RawLoader} and to the classifier through the "-Q" option;
   *     the cross-validation fold shuffling uses its own fixed {@code Random(1)}
   * @throws Exception if loading, filtering, option parsing, or evaluation fails
   */
  public static void analyze_accuracy_NHBS(int rng_seed) throws Exception {
    HashMap<String, Object> defaults = load_defaults(null);
    RawLoader rawLoader = new RawLoader(defaults, true, false, rng_seed);
    List<DrugUser> users = rawLoader.getLearningData();

    // Build the Weka dataset, one instance per drug user.
    Instances dataset = new Instances("learning_instances", DrugUser.getAttInfo(), users.size());
    for (DrugUser user : users) {
      dataset.add(user.getInstance());
    }
    System.out.println(dataset.toSummaryString());
    dataset.setClass(DrugUser.getAttribMap().get("hcv_state"));

    // Wishlist: also drop infrequent values
    // (weka.filters.unsupervised.instance.RemoveFrequentValues).
    Filter uselessRemover = new RemoveUseless();
    uselessRemover.setInputFormat(dataset);
    dataset = Filter.useFilter(dataset, uselessRemover);

    System.out.println("NHBS IDU 2009 Dataset");
    System.out.println("Summary of input:");
    System.out.println("  Num of classes: " + dataset.numClasses());
    System.out.println("  Num of attributes: " + dataset.numAttributes());
    int attrIndex = 0;
    while (attrIndex < dataset.numAttributes()) {
      Attribute attribute = dataset.attribute(attrIndex);
      System.out.println(attrIndex + ": " + attribute.toString());
      System.out.println("     distinct values:" + dataset.numDistinctValues(attrIndex));
      attrIndex++;
    }

    // Notes kept from earlier experiments with alternative classifiers:
    //   NNge          - best nearest-neighbor classifier: 40.00, ROC=0.60
    //   LibSVM        - requires LibSVM classes; only gets 37.7%
    //   Logistic      - ROC=0.686 (the classifier used below)
    //   RandomForest  - options "-I 100 -K 4" (-I trees, -K features per tree), ROC=0.673;
    //                   might want to optimize, see
    //                   https://cwiki.apache.org/confluence/display/MAHOUT/Random+Forests
    //   KStar         - setGlobalBlend(20) (the amount of not greedy, in percent), ROC=0.633
    //   AdaBoostM1    - ROC=0.66
    //   MultiBoostAB  - ROC=0.67
    //   Stacking      - ROC=0.495
    //   J48           - C45 tree, ROC=0.585; "-U" builds an unpruned tree
    //   LinearRegression - cannot handle multi-valued nominal class
    //   MINND, CitationKNN, SMOreg, LinearNNSearch - listed, no result recorded
    // Removing attribute 0 (response ID) and attribute 16 (zip) was also considered.
    Classifier model = new Logistic();
    String[] classifierOptions = {"-Q", Integer.toString(rng_seed)};
    model.setOptions(classifierOptions);

    // No explicit buildClassifier() call is needed before cross-validation:
    // http://weka.wikispaces.com/Use+WEKA+in+your+Java+code
    Evaluation evaluation = new Evaluation(dataset);
    evaluation.crossValidateModel(model, dataset, 10, new Random(1)); // 10 folds
    System.out.println(evaluation.toSummaryString("\nResults\n\n", false));
    System.out.println(evaluation.toClassDetailsString());
  }
  /**
   * Loads every supported data file found at {@code folder} and runs {@code chooseClassifier()}
   * once per loaded file.
   *
   * <p>If {@code folder} is a regular file it is processed directly; if it is a directory, its
   * entries are processed and sub-directories are visited recursively. The file name decides the
   * loader: {@code .csv}/{@code .xls} (CSVLoader), {@code .txt} (TextDirectoryLoader),
   * {@code .json} (JSONLoader), {@code .xrff} (XRFFLoader), {@code .arff} (ARFF reader) and
   * {@code .mdf} (database query). Names starting with "." and names matching none of these
   * extensions are skipped.
   *
   * @param folder a data file or a directory containing data files
   * @throws Exception if a directory cannot be listed, or if loading or classification fails
   */
  public void processFolder(File folder) throws Exception {
    if (!folder.isDirectory()) {
      // A single file was passed in: it is its own text-loader source, and an ARFF file is
      // additionally loaded as the test set.
      processDataFile(folder, folder, true);
      return;
    }

    // listFiles() returns null when the directory cannot be read; fail with a clear message
    // instead of a NullPointerException in the loop below.
    File[] entries = folder.listFiles();
    if (entries == null) {
      throw new java.io.IOException("Unable to list directory: " + folder.getAbsolutePath());
    }
    for (final File entry : entries) {
      if (entry.isDirectory()) {
        this.processFolder(entry);
      } else {
        processDataFile(entry, folder, false);
      }
    }
  }

  /**
   * Loads one data file with the loader matching its name, prints a summary, and runs
   * {@code chooseClassifier()}.
   *
   * @param file the file to load
   * @param textSource the source handed to {@code TextDirectoryLoader} for {@code .txt} files
   * @param loadTestData whether an {@code .arff} file is also read into {@code testdata}
   * @throws Exception if loading or classification fails
   */
  private void processDataFile(File file, File textSource, boolean loadTestData)
      throws Exception {
    String fileName = file.getName();
    System.out.println(fileName);
    if (fileName.startsWith(".")) {
      return; // dot-files are never loaded
    }

    if (fileName.contains(".csv") || fileName.contains(".xls")) {
      CSVLoader loader = new CSVLoader();
      loader.setSource(file.getAbsoluteFile());
      traindata = loader.getDataSet();
    } else if (fileName.contains(".txt")) {
      // NOTE(review): when a single file is passed to processFolder, textSource is that file,
      // not a directory. TextDirectoryLoader presumably needs a directory - confirm.
      TextDirectoryLoader loader = new TextDirectoryLoader();
      System.out.println("About to load text file " + fileName);
      System.out.println("Name of path " + file.getAbsolutePath());
      loader.setSource(textSource);
      traindata = loader.getDataSet();
    } else if (fileName.contains(".json")) {
      JSONLoader loader = new JSONLoader();
      loader.setSource(file.getAbsoluteFile());
      traindata = loader.getDataSet();
    } else if (fileName.contains(".xrff")) {
      XRFFLoader loader = new XRFFLoader();
      loader.setSource(file.getAbsoluteFile());
      traindata = loader.getDataSet();
    } else if (fileName.contains(".arff")) {
      traindata = readArffFile(file);
      if (loadTestData) {
        testdata = readArffFile(file);
      }
    } else if (fileName.contains(".mdf")) {
      DatabaseConnection loader = new DatabaseConnection();
      loader.connectToDatabase();
      InstanceQuery query = new InstanceQuery();
      // NOTE(review): credentials are hard-coded in source; move them to configuration.
      query.setUsername("lamogha");
      query.setPassword("l@mmyPHD");
      query.setQuery("select * from customers");
      // You can declare that your data set is sparse
      // query.setSparseData(true);
      // NOTE(review): the query result is only summarized and never stored in traindata, so
      // chooseClassifier() presumably runs on whatever was loaded before - confirm intent.
      Instances data = query.retrieveInstances();
      System.out.println(data.toSummaryString());
      this.chooseClassifier();
      return;
    } else {
      return; // unsupported extension
    }

    System.out.println(traindata.toSummaryString());
    this.chooseClassifier();
  }

  /**
   * Reads an ARFF file into a dataset and closes the underlying reader afterwards.
   *
   * @param file the ARFF file to read
   * @return the instances parsed from {@code file}
   * @throws Exception if the file cannot be opened or parsed
   */
  private Instances readArffFile(File file) throws Exception {
    BufferedReader reader = new BufferedReader(new FileReader(file));
    try {
      return new Instances(reader);
    } finally {
      reader.close();
    }
  }