public void test_Interface( String[] my_options, String[] my_data, int invalidLineCnt, int validLineCnt) throws Exception { try { BufferedWriter out_csv = new BufferedWriter(new FileWriter(path_csv)); for (String aMy_data : my_data) { out_csv.write(aMy_data + "\n"); } out_csv.flush(); out_csv.close(); } catch (Exception e) { System.err.print(e.getMessage()); } CSVLoader.testMode = true; CSVLoader.main(my_options); // do the test VoltTable modCount; modCount = client.callProcedure("@AdHoc", "SELECT * FROM BLAH;").getResults()[0]; System.out.println("data inserted to table BLAH:\n" + modCount); int rowct = modCount.getRowCount(); // Call validate partitioning to check if we are good. VoltTable valTable; valTable = client.callProcedure("@ValidatePartitioning", null, null).getResults()[0]; System.out.println("Validate for BLAH:\n" + valTable); while (valTable.advanceRow()) { long miscnt = valTable.getLong("MISPARTITIONED_ROWS"); assertEquals(miscnt, 0); } BufferedReader csvreport = new BufferedReader(new FileReader(CSVLoader.pathReportfile)); int lineCount = 0; String line; String promptMsg = "Number of rows successfully inserted:"; String promptFailMsg = "Number of rows that could not be inserted:"; int invalidlinecnt = 0; while ((line = csvreport.readLine()) != null) { if (line.startsWith(promptMsg)) { String num = line.substring(promptMsg.length()); lineCount = Integer.parseInt(num.replaceAll("\\s", "")); } if (line.startsWith(promptFailMsg)) { String num = line.substring(promptFailMsg.length()); invalidlinecnt = Integer.parseInt(num.replaceAll("\\s", "")); } } csvreport.close(); System.out.println(String.format("The rows infected: (%d,%s)", lineCount, rowct)); assertEquals(lineCount, rowct); // assert validLineCnt specified equals the successfully inserted lineCount assertEquals(validLineCnt, lineCount); assertEquals(invalidLineCnt, invalidlinecnt); }
public static void main(String[] args) { final boolean precomputed = false; Instances trainData = null, testData = null; final boolean crossValidation = args.length == 1; try { String dataFile = args[0]; System.err.println("INFO: Loading dataset from '" + dataFile + "' ..."); CSVLoader csvLoader = new CSVLoader(); // csvLoader.setStringAttributes("first"); // id // csvLoader.setNominalAttributes("last"); // label csvLoader.setNominalAttributes("first,last"); // id, label csvLoader.setSource(new FileInputStream(dataFile)); Instances data = csvLoader.getDataSet(); if (!crossValidation) { System.err.println( "INFO: Loading splits from '" + args[1] + "' (train), '" + args[2] + "' (test) ..."); Set<String> trainLabels = Collections.unmodifiableSet(new HashSet<String>(readAsList(args[1]))); Set<String> testLabels = Collections.unmodifiableSet(new HashSet<String>(readAsList(args[2]))); { Set<String> intersection = new HashSet<String>(trainLabels); intersection.retainAll(testLabels); if (!intersection.isEmpty()) throw new IllegalStateException( "Train and test sets intersect: " + Arrays.toString(intersection.toArray())); } RemoveWithStringValues rwsv = new RemoveWithStringValues(); rwsv.setAttributeIndex("first"); trainData = rwsv.setValues(trainLabels).split(data, true, false); testData = rwsv.setValues(testLabels).split(data, true, false); if (trainData.classIndex() == -1) trainData.setClassIndex(trainData.numAttributes() - 1); if (testData.classIndex() == -1) testData.setClassIndex(testData.numAttributes() - 1); data = null; } else if (!precomputed) { if (data.classIndex() == -1) data.setClassIndex(data.numAttributes() - 1); data.deleteWithMissingClass(); } // feature selection // if (crossValidation) // { // InfoGainFeatureSelection igfs = new InfoGainFeatureSelection(10, // true); // igfs.build(data); // // System.err.println("INFO: Selected " + // igfs.selectedAttributes().size() +" of " + // (data.numAttributes()-2) + " attributes."); // // Remove r = new Remove(); // r.setAttributeIndices("first," + igfs.selectedAttributeRange() // +",last"); // r.setInvertSelection(true); // // r.setInputFormat(data); // data = Filter.useFilter(data, r); // if (data.classIndex() == -1) // data.setClassIndex(data.numAttributes() - 1); // } // standardize // { // Standardize st = new Standardize(); // // st.setInputFormat(data); // data = Filter.useFilter(data, st); // if (data.classIndex() == -1) // data.setClassIndex(data.numAttributes() - 1); // } // weighting to balance classes // { // System.err.println("INFO: Weighting instances..."); // // collect statistics // int[] classSupport = new int[data.numClasses()]; // int hasClass = 0; // for (int i = 0; i < data.numInstances(); i++) { // if (data.instance(i).classIsMissing()) // continue; // classSupport[(int) data.instance(i).classValue()]++; // hasClass++; // } // // // calculate weights // double[] classWeight = new double[data.numClasses()]; // final int smoothFactor = 2; // double expectedFrequency = 1.0d / data.numClasses(); // for (int i = 0; i < data.numClasses(); i++) { // final double frequency = classSupport[i] // / (double) hasClass; // final double ratio = expectedFrequency / frequency; // classWeight[i] = (smoothFactor + ratio)/(smoothFactor + 1); // System.err.println("INFO: Class '" + // data.classAttribute().value(i) + "' instance weight set to " + // (float)classWeight[i] ); // } // // for (int i = 0; i < data.numInstances(); i++) { // if (data.instance(i).classIsMissing()) // continue; // Instance inst = data.instance(i); // inst.setWeight(classWeight[(int) inst.classValue()]); // } // // } if (crossValidation) { for (int i = 0; i < data.numAttributes(); i++) { if (i == 0 || i == data.classIndex()) continue; Attribute a = data.attribute(i); if (!a.isNumeric()) throw new IllegalStateException("attribute is not numeric: " + a); } } final Classifier c; { if (!precomputed) { // classifier SMO smo = new SMO(); // c = smo; FilteredClassifier smoRegex = prefixFilteredClassifier(new SMO(), "regex_"); HackedClassifier smoRegexHacked = new HackedClassifier(); smoRegexHacked.setSureClasses(Collections.singleton(Label.DM_Dermatology.code)); smoRegexHacked.setClassifier(smoRegex); PrecomputedClassifier pc = new PrecomputedClassifier( Maps.readStringStringMapFromFile( new File("resources/classificationResults_sampleRun.train+test.csv"), Charset.forName("US-ASCII"), ",", new HashMap<String, String>(), true, 1)); pc.setSmoothFactor(0.7); weka.classifiers.meta.Vote v = new Vote(); v.setOptions(new String[] {"-R", "AVG"}); v.setClassifiers( new Classifier[] {wrapRemoveFirst(smo), wrapRemoveFirst(smoRegexHacked), pc}); // c = v; c = wrapRemoveFirst(smo); // GaussianProcesses gp = new GaussianProcesses (); // c = gp; // LADTree t = new LADTree(); // c = t; // LibSVM svm = new LibSVM(); // c = svm; // JRip jrip = new JRip(); // c = jrip; // LogitBoost lb = new LogitBoost(); // lb.setClassifier(new SMOreg()); // lb.setDebug(true); // c = lb; // BayesNet bn = new BayesNet(); // // c = bn; // // J48 j48 = new J48(); // // j48.setMinNumObj(10); // // c = j48; // // ClassificationViaRegression cvr = new // ClassificationViaRegression(); // cvr.setClassifier(new SMOreg()); // c = cvr; // Vote v = new Vote(); // v.setClassifiers(new Classifier[] {smo, bn, j48}); // c = v; // // HIERARCHY -- groups // HierachyNode root = new HierachyNode("ROOT"); // Map<Group, HierachyNode> groupNodes = new HashMap<Group, // HierachyNode>(); // for (Group g : Group.values()) { // HierachyNode groupNode = new HierachyNode(g.name()); // root.addChild(groupNode); // groupNodes.put(g, groupNode); // } // for (Label l : Label.values()) { // if (l.code != null) { // HierachyNode labelNode = new HierachyNode(l.code); // groupNodes.get(l.group).addChild(labelNode); // } else // System.err // .println("WARNING: Skipping label without code " // + l); // } // System.err.println(root.toString()); // // HierarchicalClassifier hc = new // HierarchicalClassifier(smo, // root); // c = hc; // // HIERARCHY -- custom // HierachyNode root = new HierachyNode("ROOT"); // HierachyNode radiology = new HierachyNode("RADIOLOGY"); // HierachyNode graphic = new HierachyNode("GRAPHIC"); // HierachyNode photo = new HierachyNode("PHOTO"); // root.addChild(radiology); // root.addChild(graphic); // root.addChild(photo); // for (Label l : Label.values()) { // if (l.code != null) { // HierachyNode labelNode = new HierachyNode(l.code); // // switch (l) { // case _3D_ThreeDee: // graphic.addChild(labelNode); // break; // case AN_Angiography: // radiology.addChild(labelNode); // break; // case CM_CompoundFigure: // graphic.addChild(labelNode); // break; // case CT_ComputedTomography: // radiology.addChild(labelNode); // break; // case DM_Dermatology: // graphic.addChild(labelNode); // break; // case DR_Drawing: // graphic.addChild(labelNode); // break; // case EM_ElectronMicroscopy: // photo.addChild(labelNode); // break; // case EN_Endoscope: // photo.addChild(labelNode); // break; // case FL_Fluorescense: // radiology.addChild(labelNode); // break; // case GL_Gel: // graphic.addChild(labelNode); // break; // case GR_GrossPathology: // photo.addChild(labelNode); // break; // case GX_Graphs: // graphic.addChild(labelNode); // break; // case HX_Histopathology: // radiology.addChild(labelNode); // break; // case MR_MagneticResonance: // radiology.addChild(labelNode); // break; // case PX_Photo: // photo.addChild(labelNode); // break; // case RN_Retinograph: // radiology.addChild(labelNode); // break; // case US_Ultrasound: // radiology.addChild(labelNode); // break; // case XR_XRay: // radiology.addChild(labelNode); // break; // default: // throw new IllegalStateException(l.toString()); // } // // } else // System.err // .println("WARNING: Skipping label without code "+ l); // } // System.err.println(root.toString()); // // HierarchicalClassifier hc = new // HierarchicalClassifier(smo, // root); // c = hc; // Cluster membership // EM em = new EM(); // em.setNumClusters(Label.values().length); // TODO // heuristics // // ClusterMembership cm = new AddClusterMembership(); // cm.setDensityBasedClusterer(em); // cm.setIgnoredAttributeIndices(String.valueOf(data.classIndex()+1)); // // ignore class label // // FilteredClassifier fc = new FilteredClassifier(); // fc.setClassifier(hc); // fc.setFilter(cm); // // c = fc; // ASEvaluation ae = new InfoGainAttributeEval(); // // Ranker ranker = new Ranker(); // ranker.setNumToSelect(data.numAttributes()/2); // AttributeSelectedClassifier asc = new // AttributeSelectedClassifier(); // asc.setClassifier(smo); // asc.setEvaluator(ae); // asc.setSearch(ranker); // // c = asc; } else { c = new PrecomputedClassifier( Maps.readStringStringMapFromFile( new File("resources/classificationResults_sampleRun.txt"), Charset.forName("US-ASCII"), " ", new HashMap<String, String>(), true)); } } c.setDebug(true); if (precomputed) evaluateClassifier(c, data, 10); else if (crossValidation) evaluateClassifier(c, data, 10); else // split evaluateClassifier(c, trainData, testData); } catch (Exception e) { System.err.println("ERROR: " + e.getMessage()); e.printStackTrace(); System.exit(1); } }