/**
 * SVM trainer: tunes a LibSVM classifier with cross-validated parameter
 * selection, then evaluates it on held-out data.
 *
 * @param dataTrain training instances (class attribute already set)
 * @param dataTest  test instances used for evaluation
 */
public static void trainModelLibSVM(Instances dataTrain, Instances dataTest) {
  try {
    LibSVM classifier = new LibSVM();
    CVParameterSelection ps = new CVParameterSelection();
    ps.setClassifier(classifier);
    ps.setNumFolds(5); // using 5-fold CV
    // ps.addCVParameter("C 0.1 0.5 5");

    // build and output best options
    ps.buildClassifier(dataTrain);

    Evaluation eval = new Evaluation(dataTrain);
    eval.evaluateModel(ps, dataTest);

    System.out.println("Results of the set :::::::::::::::::::::: ");
    System.out.println("Percentage of correctly classified instances : " + eval.pctCorrect() + "\n"
        + "Percentage of incorrectly classified instances : " + eval.pctIncorrect());
    System.out.println("No of correct predictions : " + eval.correct());

    System.out.println("TRUTHFUL");
    System.out.println("Precision : " + eval.precision(0) + "\n"
        + "Recall : " + eval.recall(0) + "\n"
        + "F measure/score : " + eval.fMeasure(0));

    System.out.println("DECEPTIVE");
    System.out.println("Precision : " + eval.precision(1) + "\n"
        + "Recall : " + eval.recall(1) + "\n"
        + "F measure/score : " + eval.fMeasure(1));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
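/*
 * A minimal usage sketch for trainModelLibSVM above, assuming two hypothetical
 * ARFF files ("reviews-train.arff", "reviews-test.arff") whose last attribute
 * is the binary truthful/deceptive class. DataSource here is
 * weka.core.converters.ConverterUtils.DataSource; the method name and file
 * paths are illustrative, not part of the original code.
 */
public static void runLibSvmExperiment() throws Exception {
  DataSource trainSource = new DataSource("reviews-train.arff"); // hypothetical path
  Instances dataTrain = trainSource.getDataSet();
  dataTrain.setClassIndex(dataTrain.numAttributes() - 1); // class is the last attribute

  DataSource testSource = new DataSource("reviews-test.arff"); // hypothetical path
  Instances dataTest = testSource.getDataSet();
  dataTest.setClassIndex(dataTest.numAttributes() - 1);

  trainModelLibSVM(dataTrain, dataTest);
}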
public static void execSVM(String expName) {
  try {
    FileWriter outFile = new FileWriter(expName + "-SVM.results");
    PrintWriter out = new PrintWriter(outFile);
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;

    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + expName + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t" + expName + "***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("4-Generate classifier " + dateFormat.format(new Date()));

    Classifier cls = null;
    DataSource sourceTrain = new DataSource(expName + "-train.arff");
    Instances dataTrain = sourceTrain.getDataSet();
    if (dataTrain.classIndex() == -1)
      dataTrain.setClassIndex(dataTrain.numAttributes() - 1);

    // Train the classifier. Despite the method name, ComplementNaiveBayes is
    // used here; the LibSVM alternative is kept commented out.
    // cls = new weka.classifiers.functions.LibSVM();
    int clase = dataTrain.numAttributes() - 1;
    cls = new weka.classifiers.bayes.ComplementNaiveBayes();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);

    ObjectOutputStream oos =
        new ObjectOutputStream(new FileOutputStream(expName + "-SVM.classifier"));
    oos.writeObject(cls);
    oos.flush();
    oos.close();

    DataSource sourceTest = new DataSource(expName + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    dataTest.setClassIndex(clase);
    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);

    // Now compute precision, recall and F-measure, and print the confusion matrices.
    float precision = 0;
    float recall = 0;
    float fmeasure = 0;
    int topeTopics = 8;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;

    System.out.println("++++++++++++++ CNB ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ CNB ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");

    // Another classifier: ZeroR
    cls = new weka.classifiers.rules.ZeroR();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
out.println("++++++++++++++ ZEROR ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); // OTRO CLASIFICADOR J48 /* cls = new weka.classifiers.trees.J48(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ J48 ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ J48 ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); //OTRO SMO cls = new weka.classifiers.functions.SMO(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ SMO ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ SMO ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); */ out.flush(); out.close(); dataTest.delete(); dataTrain.delete(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void exec(PrintWriter printer) {
  try {
    FileWriter outFile = null;
    PrintWriter out = null;
    if (printer == null) {
      outFile = new FileWriter(id + ".results");
      out = new PrintWriter(outFile);
    } else {
      out = printer;
    }
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;

    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + id + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println("Train size:" + traincorpus.size());
    System.out.println("Test size:" + testcorpus.size());
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("Train size:" + traincorpus.size());
    out.println("Test size:" + testcorpus.size());

    String cloneID = "";
    boolean clonar = false;

    if (baseline) {
      System.out.println("***************************************");
      System.out.println("***\tEXECUTING TEST BASELINE\t***");
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.println("Train size:" + traincorpus.size());
      System.out.println("Test size:" + testcorpus.size());
      out.println("***************************************");
      out.println("***\tEXECUTING TEST BASELINE\t***");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("Train size:" + traincorpus.size());
      out.println("Test size:" + testcorpus.size());
      BaselineClassifier base = new BaselineClassifier(testcorpus, 8);
      precision = base.getPrecision();
      recall = base.getRecall();
      fmeasure = base.getFmeasure();
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      out.flush();
      out.close();
      return;
    } else {
      System.out.println("Stemming: " + stemming);
      System.out.println("Lemmatization:" + lematization);
      System.out.println("URLs:" + urls);
      System.out.println("Hashtags:" + hashtags);
      System.out.println("Mentions:" + mentions);
      System.out.println("Unigrams:" + unigrams);
      System.out.println("Bigrams:" + bigrams);
      System.out.println("TF:" + tf);
      System.out.println("TF-IDF:" + tfidf);
      out.println("Stemming: " + stemming);
      out.println("Lemmatization:" + lematization);
      out.println("URLs:" + urls);
      out.println("Hashtags:" + hashtags);
      out.println("Mentions:" + mentions);
      out.println("Unigrams:" + unigrams);
      out.println("Bigrams:" + bigrams);
      out.println("TF:" + tf);
      out.println("TF-IDF:" + tfidf);
    }

    // If the tweets are already processed, skip reprocessing.
    System.out.println("1-Process tweets " + dateFormat.format(new Date()));
    out.println("1-Process tweets " + dateFormat.format(new Date()));
    List<ProcessedTweet> train = null;
    String[] ids = id.split("-");
    cloneID = ids[0] + "-" + (Integer.valueOf(ids[1]) + 6);
    if (((Integer.valueOf(ids[1]) / 6) % 2) == 0)
      clonar = true;

    if (new File(id + "-train.ptweets").exists()) {
      train = ProcessedTweetSerialization.fromFile(id + "-train.ptweets");
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      if (lematization) {
        tweetsProcessor.doLematization(train);
      }
      if (stemming) {
        tweetsProcessor.doStemming(train);
      }
    } else {
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      // The training flag was added to tell apart the languages of the URLs in
      // the parallel corpus.
      // tweetsProcessor.setTraining(true);
      train = tweetsProcessor.processTweets(traincorpus);
      // tweetsProcessor.setTraining(false);
      ProcessedTweetSerialization.toFile(id + "-train.ptweets", train);
      /*
      if (clonar) {
        File f = new File(id + "-train.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[] {
          StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES
        };
        Files.copy(p, new File(cloneID + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 12) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 18) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 24) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 30) + "-train.ptweets").toPath(), options);
      }
      */
    }

    // Build the bags of words. As above, reuse them if they already exist.
    System.out.println("2-Fill topics " + dateFormat.format(new Date()));
    out.println("2-Fill topics " + dateFormat.format(new Date()));
    TopicsList topics = null;
    if (new File(id + ".topics").exists()) {
      topics = TopicsSerialization.fromFile(id + ".topics");
      if (tf)
        topics.setSelectionFeature(TopicDesc.TERM_TF);
      else
        topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      topics.prepareTopics();
    } else {
      topics = new TopicsList();
      if (tf)
        topics.setSelectionFeature(TopicDesc.TERM_TF);
      else
        topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      System.out.println("Filling topics " + dateFormat.format(new Date()));
      topics.fillTopics(train);
      System.out.println("Preparing topics " + dateFormat.format(new Date()));
      // Serialize before preparing; otherwise the tf and tf-idf values cannot
      // be computed.
      System.out.println("Serializing topics " + dateFormat.format(new Date()));
      /*
      if (clonar) {
        TopicsSerialization.toFile(cloneID + ".topics", topics);
      }
      */
      topics.prepareTopics();
      TopicsSerialization.toFile(id + ".topics", topics);
    }

    System.out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    // If the arff file does not exist, create it; otherwise reuse the previous work.
    if (!new File(id + "-train.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-train.arff");
      int tope = traincorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, train.get(indTweet));
      }
      bw.flush();
      bw.close();
    }

    // Now process the test data.
    System.out.println("5-build test dataset " + dateFormat.format(new Date()));
    out.println("5-build test dataset " + dateFormat.format(new Date()));
    List<ProcessedTweet> test = null;
    if (new File(id + "-test.ptweets").exists()) {
      test = ProcessedTweetSerialization.fromFile(id + "-test.ptweets");
    } else {
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      test = tweetsProcessor.processTweets(testcorpus);
      ProcessedTweetSerialization.toFile(id + "-test.ptweets", test);
      /*
      if (clonar) {
        File f = new File(id + "-test.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[] {
          StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES
        };
        Files.copy(p, new File(cloneID + "-test.ptweets").toPath(), options);
      }
      */
    }

    // If the arff file does not exist, create it; otherwise reuse the previous work.
    if (!new File(id + "-test.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-test.arff");
      int tope = testcorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, test.get(indTweet));
      }
      bw.flush();
      bw.close();
    }

    int topeTopics = topics.getTopicsList().size();
    topics.getTopicsList().clear();

    // Build the classifier.
    // FJRM 25-08-2013: moved here to free the topics' memory earlier.
    System.out.println("4-Generate classifier " + dateFormat.format(new Date()));
    out.println("4-Generate classifier " + dateFormat.format(new Date()));
    Classifier cls = null;
    DataSource sourceTrain = null;
    Instances dataTrain = null;
    if (new File(id + "-MNB.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-MNB.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      sourceTrain = new DataSource(id + "-train.arff");
      dataTrain = sourceTrain.getDataSet();
      if (dataTrain.classIndex() == -1)
        dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      // Train the classifier.
      cls = new weka.classifiers.bayes.NaiveBayesMultinomial();
      int clase = dataTrain.numAttributes() - 1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id + "-MNB.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      // data.delete(); // not deleted here; kept for the SVM
    }

    // Now evaluate the classifier on the test data.
    System.out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    DataSource sourceTest = new DataSource(id + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    int clase = dataTest.numAttributes() - 1;
    dataTest.setClassIndex(clase);
    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    // Now compute precision, recall and F-measure, and print the confusion matrices.
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");

    /* DO NOT DELETE
    System.out.println("7-Evaluate classifier SVM" + dateFormat.format(new Date()));
    out.println("7-Evaluate classifier SVM" + dateFormat.format(new Date()));
    if (new File(id + "-SVM.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-SVM.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      if (dataTrain == null) {
        sourceTrain = new DataSource(id + "-train.arff");
        dataTrain = sourceTrain.getDataSet();
        if (dataTrain.classIndex() == -1)
          dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      }
      // Train the classifier.
      cls = new weka.classifiers.functions.LibSVM();
      clase = dataTrain.numAttributes() - 1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id + "-SVM.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      dataTrain.delete();
    }
    eval.evaluateModel(cls, dataTest);
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    */

    System.out.println("Done " + dateFormat.format(new Date()));
    out.println("Done " + dateFormat.format(new Date()));
    if (printer == null) {
      out.flush();
      out.close();
    }

    // Attempt to free memory.
    if (dataTrain != null)
      dataTrain.delete();
    if (dataTest != null)
      dataTest.delete();
    if (train != null)
      train.clear();
    if (test != null)
      test.clear();
    if (topics != null) {
      topics.getTopicsList().clear();
      topics = null;
    }
    cls = null;
    tweetsProcessor = null;
    System.gc();
  }
  catch (Exception e) {
    e.printStackTrace();
  }
}
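/*
 * exec() caches the trained model by hand with ObjectOutputStream /
 * ObjectInputStream. A minimal sketch of the same load-or-train caching step
 * using Weka's own weka.core.SerializationHelper; the method name and the
 * "modelPath" parameter are illustrative, not part of the original code.
 */
static Classifier loadOrTrain(String modelPath, Instances dataTrain) throws Exception {
  if (new File(modelPath).exists()) {
    // Restore the previously trained classifier from disk.
    return (Classifier) weka.core.SerializationHelper.read(modelPath);
  }
  Classifier cls = new weka.classifiers.bayes.NaiveBayesMultinomial();
  cls.buildClassifier(dataTrain);
  // Persist the model so later runs can skip training.
  weka.core.SerializationHelper.write(modelPath, cls);
  return cls;
}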
public static void main(String[] args) throws Exception {
  BufferedReader reader = new BufferedReader(new FileReader("spambase.arff"));
  Instances data = new Instances(reader);
  reader.close();
  // Set the class attribute.
  data.setClassIndex(data.numAttributes() - 1);
  int i = data.numInstances();
  int j = data.numAttributes() - 1;

  File file = new File("tablelog.csv");
  Writer output = new BufferedWriter(new FileWriter(file));
  output.write("%missing,auc1,correct1,fmeasure1,auc2,correct2,fmeasure2,auc3,correct3,fmeasure3\n");

  Random randomGenerator = new Random();
  data.randomize(randomGenerator);
  int numBlock = data.numInstances() / 2;
  double num0 = 0, num1 = 0, num2 = 0;
  /*
  mdata.instance(0).setMissing(0);
  mdata.deleteWithMissing(0);
  System.out.println(mdata.numInstances() + "," + data.numInstances());
  */
  // Instances traindata = null;
  // Instances testdata = null;
  // System.out.println(data.instance(3).stringValue(1));

  for (int perc = 10; perc < 101; perc = perc + 10) {
    Instances mdata = new Instances(data);
    int numMissing = perc * numBlock / 100;
    double y11[] = new double[2];
    double y21[] = new double[2];
    double y31[] = new double[2];
    double y12[] = new double[2];
    double y22[] = new double[2];
    double y32[] = new double[2];
    double y13[] = new double[2];
    double y23[] = new double[2];
    double y33[] = new double[2];

    for (int p = 0; p < 2; p++) {
      Instances traindata = mdata.trainCV(2, p);
      Instances testdata = mdata.testCV(2, p);
      num0 = 0;
      num1 = 0;
      num2 = 0;
      for (int t = 0; t < numBlock; t++) {
        if (traindata.instance(t).classValue() == 0) num0++;
        if (traindata.instance(t).classValue() == 1) num1++;
        // if (traindata.instance(t).classValue() == 2) num2++;
      }
      // System.out.println(mdata.instance(0).classValue());

      // Copies of the fold with missing values injected attribute by attribute.
      Instances trainwithmissing = new Instances(traindata);
      Instances testwithmissing = new Instances(testdata);
      for (int q = 0; q < j; q++) {
        int r = randomGenerator.nextInt((int) i / 2);
        for (int k = 0; k < numMissing; k++) {
          // int r = randomGenerator.nextInt((int) i / 2);
          // int c = randomGenerator.nextInt(j);
          trainwithmissing.instance((r + k) % numBlock).setMissing(q);
          testwithmissing.instance((r + k) % numBlock).setMissing(q);
        }
      }
      // trainwithmissing.deleteWithMissing(0);
      // System.out.println(traindata.numInstances() + "," + trainwithmissing.numInstances());

      // Case 1: train with missing values, test on clean data.
      Classifier cModel = (Classifier) new Logistic(); // try different classifiers and datasets
      cModel.buildClassifier(trainwithmissing);
      Evaluation eTest1 = new Evaluation(trainwithmissing);
      eTest1.evaluateModel(cModel, testdata);
      // eTest.crossValidateModel(cModel, mdata, 10, mdata.getRandomNumberGenerator(1));
      y11[p] = num0 / numBlock * eTest1.areaUnderROC(0)
          + num1 / numBlock * eTest1.areaUnderROC(1) /* + num2 / numBlock * eTest1.areaUnderROC(2) */;
      y21[p] = eTest1.correct();
      y31[p] = num0 / numBlock * eTest1.fMeasure(0)
          + num1 / numBlock * eTest1.fMeasure(1) /* + num2 / numBlock * eTest1.fMeasure(2) */;

      // Case 2: train on clean data, test with missing values.
      Classifier cModel2 = (Classifier) new Logistic();
      cModel2.buildClassifier(traindata);
      Evaluation eTest2 = new Evaluation(traindata);
      eTest2.evaluateModel(cModel2, testwithmissing);
      y12[p] = num0 / numBlock * eTest2.areaUnderROC(0)
          + num1 / numBlock * eTest2.areaUnderROC(1) /* + num2 / numBlock * eTest2.areaUnderROC(2) */;
      y22[p] = eTest2.correct();
      y32[p] = num0 / numBlock * eTest2.fMeasure(0)
          + num1 / numBlock * eTest2.fMeasure(1) /* + num2 / numBlock * eTest2.fMeasure(2) */;

      // Case 3: missing values in both train and test.
      Classifier cModel3 = (Classifier) new Logistic();
      cModel3.buildClassifier(trainwithmissing);
      Evaluation eTest3 = new Evaluation(trainwithmissing);
      eTest3.evaluateModel(cModel3, testwithmissing);
      y13[p] = num0 / numBlock * eTest3.areaUnderROC(0)
          + num1 / numBlock * eTest3.areaUnderROC(1) /* + num2 / numBlock * eTest3.areaUnderROC(2) */;
      y23[p] = eTest3.correct();
      y33[p] = num0 / numBlock * eTest3.fMeasure(0)
          + num1 / numBlock * eTest3.fMeasure(1) /* + num2 / numBlock * eTest3.fMeasure(2) */;
      // System.out.println(num0 + "," + num1 + "," + num2 + "\n");
    }

    // Average the two folds and write one CSV row per missing-value percentage.
    double auc1 = (y11[0] + y11[1]) / 2;
    double auc2 = (y12[0] + y12[1]) / 2;
    double auc3 = (y13[0] + y13[1]) / 2;
    double corr1 = (y21[0] + y21[1]) / i;
    double corr2 = (y22[0] + y22[1]) / i;
    double corr3 = (y23[0] + y23[1]) / i;
    double fm1 = (y31[0] + y31[1]) / 2;
    double fm2 = (y32[0] + y32[1]) / 2;
    double fm3 = (y33[0] + y33[1]) / 2;
    output.write(perc + "," + auc1 + "," + corr1 + "," + fm1
        + "," + auc2 + "," + corr2 + "," + fm2
        + "," + auc3 + "," + corr3 + "," + fm3 + "\n");
    // System.out.println(num0);
    // mdata = data;
  }
  output.close();
}
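/*
 * The experiment above lets Logistic handle the injected missing values
 * internally. A minimal alternative sketch: impute missing values with the
 * training fold's per-attribute means/modes using Weka's ReplaceMissingValues
 * filter in batch mode, then evaluate as before. The method name is
 * illustrative, not part of the original experiment.
 */
static Evaluation evaluateWithImputation(Instances trainwithmissing, Instances testwithmissing)
    throws Exception {
  weka.filters.unsupervised.attribute.ReplaceMissingValues rmv =
      new weka.filters.unsupervised.attribute.ReplaceMissingValues();
  rmv.setInputFormat(trainwithmissing); // imputation statistics come from the training fold
  Instances trainImputed = weka.filters.Filter.useFilter(trainwithmissing, rmv);
  Instances testImputed = weka.filters.Filter.useFilter(testwithmissing, rmv); // reuses the same batch statistics

  Classifier model = new Logistic();
  model.buildClassifier(trainImputed);
  Evaluation eval = new Evaluation(trainImputed);
  eval.evaluateModel(model, testImputed);
  return eval;
}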
private double[] classify(String test) {
  String[] lab = {
    "I.2", "I.3", "I.5", "I.6",
    "I.2.1", "I.2.6", "I.2.8",
    "I.3.5", "I.3.6", "I.3.7",
    "I.5.1", "I.5.2", "I.5.4",
    "I.6.3", "I.6.5", "I.6.8",
  };
  int NSel = 1000; // number of attributes to select
  Filter[] filters = new Filter[2];
  double[] x = new double[16];
  double[] prd = new double[16];
  double clsLabel;
  Ranker rank = new Ranker();
  Evaluation eval = null;
  StringToWordVector stwv = new StringToWordVector();
  weka.filters.supervised.attribute.AttributeSelection featSel =
      new weka.filters.supervised.attribute.AttributeSelection();
  WordTokenizer wtok = new WordTokenizer();
  String delim = " \r\n\t.,;:'\"()?!$*-&[]+/|\\";
  InfoGainAttributeEval ig = new InfoGainAttributeEval();
  String[] stwvOpts;
  wtok.setDelimiters(delim);
  Instances[] dataRaw = new Instances[10000];
  DataSource[] source = new DataSource[16];
  String str;

  Instances testset = null;
  DataSource testsrc = null;
  try {
    testsrc = new DataSource(test);
    testset = testsrc.getDataSet();
  } catch (Exception e1) {
    e1.printStackTrace();
  }

  for (int j = 0; j < 16; j++) { // 16 elements, 0-15
    try {
      str = lab[j];
      source[j] = new DataSource(
          "D:/Users/nma1g11/workspace2/WebScraperFlatNew/dataPernode/new/" + str + ".arff");
      dataRaw[j] = source[j].getDataSet();
    } catch (Exception e) {
      e.printStackTrace();
    }
    System.out.println(lab[j]);
    if (dataRaw[j].classIndex() == -1)
      dataRaw[j].setClassIndex(dataRaw[j].numAttributes() - 1);
  }
  if (testset.classIndex() == -1)
    testset.setClassIndex(testset.numAttributes() - 1);

  try {
    stwvOpts = weka.core.Utils.splitOptions(
        "-R first-last -W 1000000 -prune-rate -1.0 -C -T -I -N 1 -L -S -stemmer weka.core.stemmers.LovinsStemmer -M 2 ");
    stwv.setOptions(stwvOpts);
    stwv.setTokenizer(wtok);
    rank.setOptions(weka.core.Utils.splitOptions("-T -1.7976931348623157E308 -N 100"));
    rank.setNumToSelect(NSel);
    featSel.setEvaluator(ig);
    featSel.setSearch(rank);
  } catch (Exception e) {
    e.printStackTrace();
  }
  filters[0] = stwv;
  filters[1] = featSel;
  System.out.println("Loading is Done!");

  MultiFilter mfilter = new MultiFilter();
  mfilter.setFilters(filters);
  FilteredClassifier classify = new FilteredClassifier();
  classify.setClassifier(new NaiveBayesMultinomial()); // the classification algorithm
  classify.setFilter(mfilter);

  String ss2 = "";
  try {
    Classifier[] clsArr = Classifier.makeCopies(classify, 16);
    String strcls = "";
    List<String> clsList = new ArrayList<String>();
    String s = null;
    String newcls = null;
    String lb = "";
    String prev = "";
    boolean flag = false;
    String Ocls = null;
    int q = 0;
    for (int i = 0; i < 16; i++) {
      for (int k = 0; k < testset.numInstances(); k++) {
        flag = false;
        s = testset.instance(k).stringValue(1);
        clsList.add(s);
        if (lab[i].equals(s)) {
          flag = true;
          newcls = s;
        }
      }
      clsArr[i].buildClassifier(dataRaw[i]);
      eval = new Evaluation(dataRaw[i]);
      for (int j = 0; j < testset.numInstances(); j++) {
        Ocls = testset.instance(j).stringValue(1);
        if (flag && s != null)
          testset.instance(j).setClassValue(lab[i]);
        // -----------------------------------------
        strcls = testset.instance(j).stringValue(1);
        if (i < 4) {
          if (strcls.substring(0, 3).equals(lab[i]))
            testset.instance(j).setClassValue(lab[i]);
        } else if (lab[i].substring(0, 3).equals(strcls))
          testset.instance(j).setClassValue(lab[i]);
        // ------------------------------------------------
        System.out.println(dataRaw[i].classAttribute().value(i)
            + " --- > Correct%:" + eval.pctCorrect()
            + " F-measure:" + eval.fMeasure(i));
        if (!prev.equals(testset.instance(j).stringValue(0)) || !lab[i].equals(lb)) {
          clsLabel = clsArr[i].classifyInstance(testset.instance(j));
          x = clsArr[i].distributionForInstance(testset.instance(j));
          prd[i] = x[i];
          System.out.println(" --- > prob: " + clsLabel);
          System.out.println(" --- > x :" + x[i]);
          System.out.println(clsLabel + " --> " + testset.classAttribute().value((int) clsLabel));
        }
        testset.instance(j).setClassValue(Ocls);
        prev = testset.instance(j).stringValue(0);
        lb = lab[i];
      }
      System.out.println("Done with " + lab[i].replace("99", "") + " !!!!!!!!!!!");
    }
    System.out.println(eval.correct());
  } catch (Exception e) {
    e.printStackTrace();
  }
  return prd;
}
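/*
 * classify() wires StringToWordVector and InfoGain attribute selection into a
 * FilteredClassifier around NaiveBayesMultinomial. A minimal self-contained
 * sketch of that same pipeline, evaluated with 10-fold cross-validation on a
 * single ARFF dataset; the method name and the numToSelect parameter are
 * illustrative, and the filter options are left at their defaults.
 */
private static void crossValidateTextPipeline(Instances data, int numToSelect) throws Exception {
  StringToWordVector stwv = new StringToWordVector();
  stwv.setTokenizer(new WordTokenizer()); // split string attributes into word tokens

  Ranker rank = new Ranker();
  rank.setNumToSelect(numToSelect); // keep only the top-ranked attributes
  weka.filters.supervised.attribute.AttributeSelection featSel =
      new weka.filters.supervised.attribute.AttributeSelection();
  featSel.setEvaluator(new InfoGainAttributeEval());
  featSel.setSearch(rank);

  MultiFilter mfilter = new MultiFilter();
  mfilter.setFilters(new Filter[] { stwv, featSel }); // vectorize first, then select

  FilteredClassifier fc = new FilteredClassifier();
  fc.setFilter(mfilter);
  fc.setClassifier(new NaiveBayesMultinomial());

  Evaluation eval = new Evaluation(data);
  eval.crossValidateModel(fc, data, 10, new java.util.Random(1));
  System.out.println(eval.toSummaryString());
}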
public void runFilter() throws Exception {
  System.out.println("filtering attributes...");
  System.out.println("running weka filters and weka-libsvm");
  File svmfile = new File(sentiAnalysis.DIR.concat(sentiAnalysis.outout.concat(".libsvm")));
  LibSVMLoader libl = new LibSVMLoader();
  libl.setFile(svmfile);
  Instances data = libl.getDataSet();

  // Convert the last attribute from numeric to nominal, since it is the class label.
  NumericToNominal nm = new NumericToNominal();
  nm.setAttributeIndices("last");
  nm.setInputFormat(data);
  filteredData = Filter.useFilter(data, nm); // filtered data stored in a new Instances object

  AttrNo = filteredData.numAttributes(); // number of attributes in the given file
  RecordNo = filteredData.numInstances(); // number of records in the given file
  lowerBound = 0;
  upperBound = AttrNo - 1;

  // Attribute selection using the InfoGain evaluator with Ranker search.
  AttributeSelection atsl = new AttributeSelection();
  Ranker search = new Ranker();
  InfoGainAttributeEval infog = new InfoGainAttributeEval();
  atsl.setEvaluator(infog);
  atsl.setSearch(search);
  atsl.SelectAttributes(filteredData);
  InfoGain = atsl.rankedAttributes();
  SelectedAttributes = atsl.selectedAttributes();

  // Count the attributes with non-zero InfoGain.
  int count = 0;
  for (int i = 0; i < InfoGain.length; i++) {
    count = (InfoGain[i][1] > 0) ? count + 1 : count;
  }

  System.out.println("writing attributes with non-zero InfoGain...");
  FileWriter svmout =
      new FileWriter(sentiAnalysis.DIR.concat(sentiAnalysis.outout.concat("_new.libsvm")));
  for (int i = 0; i < RecordNo; i++) {
    int index = 1;
    svmout.write((int) filteredData.instance(i).value(filteredData.classIndex()) + " ");
    for (int j = 0; j < count; j++) {
      svmout.write(index + ":" + (int) filteredData.instance(i).value((int) InfoGain[j][0]) + " ");
      index++;
    }
    svmout.write("\n");
  }
  svmout.close();

  // Reload the filtered data.
  File newsvm = new File(sentiAnalysis.DIR.concat(sentiAnalysis.outout.concat("_new.libsvm")));
  LibSVMLoader liblnew = new LibSVMLoader();
  liblnew.setFile(newsvm);
  Instances newdata = liblnew.getDataSet();
  nm = new NumericToNominal(); // again, the last attribute is the class label
  nm.setAttributeIndices("last");
  nm.setInputFormat(newdata);
  Instances filteredDataNew = Filter.useFilter(newdata, nm);

  // Test file.
  File newsvmtest =
      new File(sentiAnalysis.DIR.concat(sentiAnalysis.outout.concat("_test.libsvm")));
  LibSVMLoader libltest = new LibSVMLoader();
  libltest.setFile(newsvmtest);
  Instances newdatatest = libltest.getDataSet();
  nm = new NumericToNominal();
  nm.setAttributeIndices("last");
  nm.setInputFormat(newdatatest);
  Instances filteredDataTest = Filter.useFilter(newdatatest, nm);

  // weka.classifiers.functions.LibSVM -S 0 -K 2 -D 3 -G 0.0 -R 0.0 -N 0.5
  // -M 40.0 -C 1.0 -E 0.001 -P 0.1 -seed 1
  String[] options = weka.core.Utils.splitOptions(
      "-S 0 -K 2 -D 3 -G 0.1 -R 0.0 -N 0.5 -M 40.0 -C 1.0 -E 0.001 -P 0.1 -seed 1 -h 0");
  System.out.println("building classifier...");
  LibSVM svm_model = new LibSVM();
  svm_model.setOptions(options); // set the options
  svm_model.buildClassifier(filteredData); // build classifier
  DecimalFormat df = new DecimalFormat("0.00");
  System.out.println("running cross validation...");
validation..."); Evaluation eval = new Evaluation(filteredData); // eval.crossValidateModel(svm_model, filteredDataNew, 10, new // Random(1)); eval.evaluateModel(svm_model, filteredDataTest); FileWriter results = new FileWriter(sentiAnalysis.DIR.concat(sentiAnalysis.outout.concat("_results.txt"))); results.write("Classifier 1: Support Vector Machines\n"); results.write("Positive class precision: " + df.format(eval.precision(0)) + "\n"); results.write("Positive class recall: " + df.format(eval.recall(0)) + "\n"); results.write("Positive class f-score: " + df.format(eval.fMeasure(0)) + "\n"); results.write("Negative class precision: " + df.format(eval.precision(0)) + "\n"); results.write("Negative class recall: " + df.format(eval.precision(0)) + "\n"); results.write("Negative class f-score: " + df.format(eval.fMeasure(0)) + "\n"); System.out.println("generating results..."); System.out.println("*" + sentiAnalysis.outout + "*\t" + "\tPositive\tNegative\tNeutral"); System.out.println( "Precision\t" + df.format(eval.precision(0)) + "\t" + df.format(eval.precision(2)) + "\t" + df.format(eval.precision(1))); System.out.println( "Recall\t" + df.format(eval.recall(0)) + "\t" + df.format(eval.recall(2)) + "\t" + df.format(eval.recall(1))); System.out.println( "F-score\t" + df.format(eval.fMeasure(0)) + "\t" + df.format(eval.fMeasure(2)) + "\t" + df.format(eval.fMeasure(1))); results.close(); }