Example #1
0
  /**
   * Runs the CSV loader against a freshly written input file and verifies the outcome.
   *
   * <p>Steps performed:
   *
   * <ol>
   *   <li>Writes every element of {@code my_data} as one line to {@code path_csv}.
   *   <li>Enables {@code CSVLoader.testMode} and invokes {@code CSVLoader.main(my_options)}.
   *   <li>Selects all rows of table BLAH and records the row count.
   *   <li>Calls {@code @ValidatePartitioning} and asserts that no row is mispartitioned.
   *   <li>Parses the loader's report file for the inserted / not-inserted counters and compares
   *       them with the table row count and with the expected values.
   * </ol>
   *
   * <p>A failure while writing the CSV file is only printed to stderr, not rethrown; the
   * subsequent assertions are then expected to expose the problem.
   *
   * @param my_options command-line arguments handed verbatim to {@code CSVLoader.main}
   * @param my_data lines to write to the CSV input file (a newline is appended to each)
   * @param invalidLineCnt expected value of the report's "could not be inserted" counter
   * @param validLineCnt expected value of the report's "successfully inserted" counter
   * @throws Exception if the loader, a client call, or reading/parsing the report fails
   */
  public void test_Interface(
      String[] my_options, String[] my_data, int invalidLineCnt, int validLineCnt)
      throws Exception {
    try {
      BufferedWriter out_csv = new BufferedWriter(new FileWriter(path_csv));
      try {
        for (String aMy_data : my_data) {
          out_csv.write(aMy_data + "\n");
        }
        out_csv.flush();
      } finally {
        // Release the file handle even when a write fails part-way through.
        out_csv.close();
      }
    } catch (Exception e) {
      System.err.print(e.getMessage());
    }

    CSVLoader.testMode = true;
    CSVLoader.main(my_options);
    // do the test

    VoltTable modCount = client.callProcedure("@AdHoc", "SELECT * FROM BLAH;").getResults()[0];
    System.out.println("data inserted to table BLAH:\n" + modCount);
    int rowct = modCount.getRowCount();

    // Call validate partitioning to check if we are good.
    VoltTable valTable = client.callProcedure("@ValidatePartitioning", null, null).getResults()[0];
    System.out.println("Validate for BLAH:\n" + valTable);
    while (valTable.advanceRow()) {
      long miscnt = valTable.getLong("MISPARTITIONED_ROWS");
      // Expected value goes first so a failure message reads correctly.
      assertEquals(0, miscnt);
    }

    String promptMsg = "Number of rows successfully inserted:";
    String promptFailMsg = "Number of rows that could not be inserted:";
    int lineCount = 0;
    int invalidlinecnt = 0;

    BufferedReader csvreport = new BufferedReader(new FileReader(CSVLoader.pathReportfile));
    try {
      String line;
      while ((line = csvreport.readLine()) != null) {
        // The two prompts differ after "Number of rows ", so a line matches at most one of them.
        if (line.startsWith(promptMsg)) {
          String num = line.substring(promptMsg.length());
          lineCount = Integer.parseInt(num.replaceAll("\\s", ""));
        } else if (line.startsWith(promptFailMsg)) {
          String num = line.substring(promptFailMsg.length());
          invalidlinecnt = Integer.parseInt(num.replaceAll("\\s", ""));
        }
      }
    } finally {
      // Close the report even if a counter line cannot be parsed.
      csvreport.close();
    }

    System.out.println(String.format("The rows infected: (%d,%s)", lineCount, rowct));
    assertEquals(lineCount, rowct);
    // assert validLineCnt specified equals the successfully inserted lineCount
    assertEquals(validLineCnt, lineCount);
    assertEquals(invalidLineCnt, invalidlinecnt);
  }
Example #2
0
  /**
   * Command-line entry point: loads a CSV dataset, builds a classifier and evaluates it.
   *
   * <p>Two invocation forms are supported:
   *
   * <ul>
   *   <li>{@code <dataFile>} &mdash; the whole dataset is passed to {@code evaluateClassifier(c,
   *       data, 10)} (the {@code crossValidation} path; 10 is presumably the fold count &mdash;
   *       TODO confirm against {@code evaluateClassifier}).
   *   <li>{@code <dataFile> <trainSplitFile> <testSplitFile>} &mdash; the two split files are read
   *       with {@code readAsList}; the resulting value sets must be disjoint and are used to
   *       select train and test instances by the first attribute.
   * </ul>
   *
   * <p>The first and last CSV columns are loaded as nominal attributes (id and label). The last
   * attribute becomes the class attribute when none is set. Any exception is reported on stderr
   * and terminates the JVM with exit status 1, as does an unsupported number of arguments.
   *
   * @param args {@code <dataFile>} or {@code <dataFile> <trainSplitFile> <testSplitFile>};
   *     arguments beyond the third are ignored
   */
  public static void main(String[] args) {

    final boolean precomputed = false;

    // Fail fast with a readable message: previously zero or two arguments surfaced as an
    // ArrayIndexOutOfBoundsException (for two arguments only after the dataset had been loaded).
    if (args.length != 1 && args.length < 3) {
      System.err.println(
          "ERROR: expected arguments <dataFile> or <dataFile> <trainSplitFile> <testSplitFile>");
      System.exit(1);
      return;
    }

    Instances trainData = null, testData = null;
    final boolean crossValidation = args.length == 1;

    try {
      String dataFile = args[0];
      System.err.println("INFO: Loading dataset from '" + dataFile + "' ...");

      CSVLoader csvLoader = new CSVLoader();
      // csvLoader.setStringAttributes("first"); // id
      // csvLoader.setNominalAttributes("last"); // label
      csvLoader.setNominalAttributes("first,last"); // id, label

      Instances data;
      FileInputStream dataStream = new FileInputStream(dataFile);
      try {
        csvLoader.setSource(dataStream);
        data = csvLoader.getDataSet();
      } finally {
        // The dataset is fully materialised by getDataSet(); release the file handle.
        dataStream.close();
      }

      if (!crossValidation) {
        System.err.println(
            "INFO: Loading splits from '" + args[1] + "' (train), '" + args[2] + "' (test) ...");
        Set<String> trainLabels =
            Collections.unmodifiableSet(new HashSet<String>(readAsList(args[1])));
        Set<String> testLabels =
            Collections.unmodifiableSet(new HashSet<String>(readAsList(args[2])));
        {
          // Train and test must not share any entry.
          Set<String> intersection = new HashSet<String>(trainLabels);
          intersection.retainAll(testLabels);
          if (!intersection.isEmpty()) {
            throw new IllegalStateException(
                "Train and test sets intersect: " + Arrays.toString(intersection.toArray()));
          }
        }
        RemoveWithStringValues rwsv = new RemoveWithStringValues();
        rwsv.setAttributeIndex("first");
        trainData = rwsv.setValues(trainLabels).split(data, true, false);
        testData = rwsv.setValues(testLabels).split(data, true, false);

        if (trainData.classIndex() == -1) {
          trainData.setClassIndex(trainData.numAttributes() - 1);
        }
        if (testData.classIndex() == -1) {
          testData.setClassIndex(testData.numAttributes() - 1);
        }

        // The full dataset is no longer needed once the splits exist.
        data = null;
      } else if (!precomputed) {
        if (data.classIndex() == -1) {
          data.setClassIndex(data.numAttributes() - 1);
        }
        data.deleteWithMissingClass();
      }

      // feature selection
      // if (crossValidation)
      // {
      // InfoGainFeatureSelection igfs = new InfoGainFeatureSelection(10,
      // true);
      // igfs.build(data);
      //
      // System.err.println("INFO: Selected " +
      // igfs.selectedAttributes().size() +" of " +
      // (data.numAttributes()-2) + " attributes.");
      //
      // Remove r = new Remove();
      // r.setAttributeIndices("first," + igfs.selectedAttributeRange()
      // +",last");
      // r.setInvertSelection(true);
      //
      // r.setInputFormat(data);
      // data = Filter.useFilter(data, r);
      // if (data.classIndex() == -1)
      // data.setClassIndex(data.numAttributes() - 1);
      // }

      // standardize
      // {
      // Standardize st = new Standardize();
      //
      // st.setInputFormat(data);
      // data = Filter.useFilter(data, st);
      // if (data.classIndex() == -1)
      // data.setClassIndex(data.numAttributes() - 1);
      // }

      // weighting to balance classes
      // {
      // System.err.println("INFO: Weighting instances...");
      // // collect statistics
      // int[] classSupport = new int[data.numClasses()];
      // int hasClass = 0;
      // for (int i = 0; i < data.numInstances(); i++) {
      // if (data.instance(i).classIsMissing())
      // continue;
      // classSupport[(int) data.instance(i).classValue()]++;
      // hasClass++;
      // }
      //
      // // calculate weights
      // double[] classWeight = new double[data.numClasses()];
      // final int smoothFactor = 2;
      // double expectedFrequency = 1.0d / data.numClasses();
      // for (int i = 0; i < data.numClasses(); i++) {
      // final double frequency = classSupport[i]
      // / (double) hasClass;
      // final double ratio = expectedFrequency / frequency;
      // classWeight[i] = (smoothFactor + ratio)/(smoothFactor + 1);
      // System.err.println("INFO: Class '" +
      // data.classAttribute().value(i) + "' instance weight set to " +
      // (float)classWeight[i] );
      // }
      //
      // for (int i = 0; i < data.numInstances(); i++) {
      // if (data.instance(i).classIsMissing())
      // continue;
      // Instance inst = data.instance(i);
      // inst.setWeight(classWeight[(int) inst.classValue()]);
      // }
      //
      // }

      if (crossValidation) {
        // Every attribute except the first one (id) and the class must be numeric.
        for (int i = 0; i < data.numAttributes(); i++) {
          if (i == 0 || i == data.classIndex()) {
            continue;
          }
          Attribute a = data.attribute(i);
          if (!a.isNumeric()) {
            throw new IllegalStateException("attribute is not numeric: " + a);
          }
        }
      }

      final Classifier c;
      {
        if (!precomputed) {
          // classifier
          SMO smo = new SMO();
          // c = smo;

          // NOTE(review): smoRegex, smoRegexHacked, pc and v are constructed but the classifier
          // selected below is wrapRemoveFirst(smo) only. They are kept because building them has
          // side effects (pc is built from a results file, presumably read here) and so that
          // switching to "c = v" stays a one-line change.
          FilteredClassifier smoRegex = prefixFilteredClassifier(new SMO(), "regex_");

          HackedClassifier smoRegexHacked = new HackedClassifier();
          smoRegexHacked.setSureClasses(Collections.singleton(Label.DM_Dermatology.code));
          smoRegexHacked.setClassifier(smoRegex);

          PrecomputedClassifier pc =
              new PrecomputedClassifier(
                  Maps.readStringStringMapFromFile(
                      new File("resources/classificationResults_sampleRun.train+test.csv"),
                      Charset.forName("US-ASCII"),
                      ",",
                      new HashMap<String, String>(),
                      true,
                      1));
          pc.setSmoothFactor(0.7);
          weka.classifiers.meta.Vote v = new Vote();
          v.setOptions(new String[] {"-R", "AVG"});
          v.setClassifiers(
              new Classifier[] {wrapRemoveFirst(smo), wrapRemoveFirst(smoRegexHacked), pc});

          // c = v;
          c = wrapRemoveFirst(smo);

          // GaussianProcesses gp = new GaussianProcesses ();
          // c = gp;

          // LADTree t = new LADTree();
          // c = t;

          // LibSVM svm = new LibSVM();
          // c = svm;

          // JRip jrip = new JRip();
          // c = jrip;

          // LogitBoost lb = new LogitBoost();
          // lb.setClassifier(new SMOreg());
          // lb.setDebug(true);
          // c = lb;
          // BayesNet bn = new BayesNet();
          // // c = bn;
          //
          // J48 j48 = new J48();
          // // j48.setMinNumObj(10);
          // // c = j48;
          //
          // ClassificationViaRegression cvr = new
          // ClassificationViaRegression();
          // cvr.setClassifier(new SMOreg());
          // c = cvr;

          // Vote v = new Vote();
          // v.setClassifiers(new Classifier[] {smo, bn, j48});
          // c = v;

          // // HIERARCHY -- groups
          // HierachyNode root = new HierachyNode("ROOT");
          // Map<Group, HierachyNode> groupNodes = new HashMap<Group,
          // HierachyNode>();
          // for (Group g : Group.values()) {
          // HierachyNode groupNode = new HierachyNode(g.name());
          // root.addChild(groupNode);
          // groupNodes.put(g, groupNode);
          // }
          // for (Label l : Label.values()) {
          // if (l.code != null) {
          // HierachyNode labelNode = new HierachyNode(l.code);
          // groupNodes.get(l.group).addChild(labelNode);
          // } else
          // System.err
          // .println("WARNING: Skipping label without code  "
          // + l);
          // }
          // System.err.println(root.toString());
          //
          // HierarchicalClassifier hc = new
          // HierarchicalClassifier(smo,
          // root);
          // c = hc;

          // // HIERARCHY -- custom
          // HierachyNode root = new HierachyNode("ROOT");
          // HierachyNode radiology = new HierachyNode("RADIOLOGY");
          // HierachyNode graphic = new HierachyNode("GRAPHIC");
          // HierachyNode photo = new HierachyNode("PHOTO");
          // root.addChild(radiology);
          // root.addChild(graphic);
          // root.addChild(photo);
          // for (Label l : Label.values()) {
          // if (l.code != null) {
          // HierachyNode labelNode = new HierachyNode(l.code);
          //
          // switch (l) {
          // case _3D_ThreeDee:
          // graphic.addChild(labelNode);
          // break;
          // case AN_Angiography:
          // radiology.addChild(labelNode);
          // break;
          // case CM_CompoundFigure:
          // graphic.addChild(labelNode);
          // break;
          // case CT_ComputedTomography:
          // radiology.addChild(labelNode);
          // break;
          // case DM_Dermatology:
          // graphic.addChild(labelNode);
          // break;
          // case DR_Drawing:
          // graphic.addChild(labelNode);
          // break;
          // case EM_ElectronMicroscopy:
          // photo.addChild(labelNode);
          // break;
          // case EN_Endoscope:
          // photo.addChild(labelNode);
          // break;
          // case FL_Fluorescense:
          // radiology.addChild(labelNode);
          // break;
          // case GL_Gel:
          // graphic.addChild(labelNode);
          // break;
          // case GR_GrossPathology:
          // photo.addChild(labelNode);
          // break;
          // case GX_Graphs:
          // graphic.addChild(labelNode);
          // break;
          // case HX_Histopathology:
          // radiology.addChild(labelNode);
          // break;
          // case MR_MagneticResonance:
          // radiology.addChild(labelNode);
          // break;
          // case PX_Photo:
          // photo.addChild(labelNode);
          // break;
          // case RN_Retinograph:
          // radiology.addChild(labelNode);
          // break;
          // case US_Ultrasound:
          // radiology.addChild(labelNode);
          // break;
          // case XR_XRay:
          // radiology.addChild(labelNode);
          // break;
          // default:
          // throw new IllegalStateException(l.toString());
          // }
          //
          // } else
          // System.err
          // .println("WARNING: Skipping label without code  "+ l);
          // }
          // System.err.println(root.toString());
          //
          // HierarchicalClassifier hc = new
          // HierarchicalClassifier(smo,
          // root);
          // c = hc;

          // Cluster membership

          // EM em = new EM();
          // em.setNumClusters(Label.values().length); // TODO
          // heuristics
          //
          // ClusterMembership cm = new AddClusterMembership();
          // cm.setDensityBasedClusterer(em);
          // cm.setIgnoredAttributeIndices(String.valueOf(data.classIndex()+1));
          // // ignore class label
          //
          // FilteredClassifier fc = new FilteredClassifier();
          // fc.setClassifier(hc);
          // fc.setFilter(cm);
          //
          // c = fc;

          // ASEvaluation ae = new InfoGainAttributeEval();
          //
          // Ranker ranker = new Ranker();
          // ranker.setNumToSelect(data.numAttributes()/2);
          // AttributeSelectedClassifier asc = new
          // AttributeSelectedClassifier();
          // asc.setClassifier(smo);
          // asc.setEvaluator(ae);
          // asc.setSearch(ranker);
          //
          // c = asc;
        } else {
          c =
              new PrecomputedClassifier(
                  Maps.readStringStringMapFromFile(
                      new File("resources/classificationResults_sampleRun.txt"),
                      Charset.forName("US-ASCII"),
                      " ",
                      new HashMap<String, String>(),
                      true));
        }
      }

      c.setDebug(true);

      if (precomputed || crossValidation) {
        evaluateClassifier(c, data, 10);
      } else {
        // split
        evaluateClassifier(c, trainData, testData);
      }

    } catch (Exception e) {
      System.err.println("ERROR: " + e.getMessage());
      e.printStackTrace();
      System.exit(1);
    }
  }