/**
 * Creates an evaluation overview of the built classifier.
 *
 * @param data the instances to evaluate the classifier on
 * @return the panel to be displayed as result evaluation view for the current decision point
 */
protected JPanel createEvaluationVisualization(Instances data) {
  // build text field to display evaluation statistics
  JTextPane statistic = new JTextPane();

  try {
    // build evaluation statistics (note: this evaluates on the training data itself)
    Evaluation evaluation = new Evaluation(data);
    evaluation.evaluateModel(myClassifier, data);
    statistic.setText(
        evaluation.toSummaryString()
            + "\n\n"
            + evaluation.toClassDetailsString()
            + "\n\n"
            + evaluation.toMatrixString());
  } catch (Exception ex) {
    ex.printStackTrace();
    return createMessagePanel("Error while creating the decision tree evaluation view");
  }

  statistic.setFont(new Font("Courier", Font.PLAIN, 14));
  statistic.setEditable(false);
  statistic.setCaretPosition(0);

  JPanel resultViewPanel = new JPanel();
  resultViewPanel.setLayout(new BoxLayout(resultViewPanel, BoxLayout.PAGE_AXIS));
  resultViewPanel.add(new JScrollPane(statistic));

  return resultViewPanel;
}
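The snippet above evaluates the classifier on the very data it was trained on, so the displayed statistics are optimistic. A minimal sketch of a held-out evaluation using the fold helpers on Instances, assuming myClassifier has not been trained yet:

// Sketch: evaluate on a held-out fold instead of the training data itself.
data.randomize(new java.util.Random(42)); // shuffle before splitting
Instances train = data.trainCV(4, 0);     // roughly 75% of the data
Instances test = data.testCV(4, 0);       // the remaining 25%

myClassifier.buildClassifier(train);
Evaluation evaluation = new Evaluation(train); // priors from the training split
evaluation.evaluateModel(myClassifier, test);  // statistics now reflect unseen data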
/**
 * Tests the classifier on a held-out file.
 *
 * @param trainFileName name of the training corpus file
 * @param testFileName name of the test corpus file
 */
public static void classify(String trainFileName, String testFileName) {
  try {
    File inputFile = new File(fileName + trainFileName); // training corpus file
    ArffLoader atf = new ArffLoader();
    atf.setFile(inputFile);
    Instances instancesTrain = atf.getDataSet(); // load the training set

    // set the class attribute
    inputFile = new File(fileName + testFileName); // test corpus file
    atf.setFile(inputFile);
    Instances instancesTest = atf.getDataSet(); // load the test set
    instancesTest.setClassIndex(instancesTest.numAttributes() - 1);
    instancesTrain.setClassIndex(instancesTrain.numAttributes() - 1);

    classifier = (Classifier) Class.forName(CLASSIFIERNAME).newInstance();
    classifier.buildClassifier(instancesTrain);

    // first argument is a trained classifier, second is the dataset to evaluate on
    Evaluation eval = new Evaluation(instancesTrain);
    eval.evaluateModel(classifier, instancesTest);
    System.out.println(eval.toClassDetailsString());
    System.out.println(eval.toSummaryString());
    System.out.println(eval.toMatrixString());
    // note: 1 - errorRate() is the accuracy, not the precision
    System.out.println("accuracy is: " + (1 - eval.errorRate()));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
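Since 1 - errorRate() is the accuracy rather than the precision, the distinct metrics can be read directly off the Evaluation object (a sketch assuming a reasonably recent Weka 3 release):

// Accuracy vs. precision on a finished Evaluation eval:
double accuracy = eval.pctCorrect() / 100.0;         // same value as 1 - eval.errorRate()
double precisionClass0 = eval.precision(0);          // precision of the class with index 0
double weightedPrecision = eval.weightedPrecision(); // precision averaged over all classes,
                                                     // weighted by class frequency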
/** Evaluates the classifier. */
@Override
public void evaluate() throws Exception {
  // evaluate classifier and print some statistics
  if (_test.classIndex() == -1)
    _test.setClassIndex(_test.numAttributes() - 1);

  Evaluation eval = new Evaluation(_train);
  eval.evaluateModel(_cl, _test);
  System.out.println(eval.toSummaryString("\nResults\n======\n", false));
  System.out.println(eval.toMatrixString());
}
@Override
public void crossValidation(String traindata) throws Exception {
  DataSource ds = new DataSource(traindata);
  Instances instances = ds.getDataSet();

  StringToWordVector stv = new StringToWordVector();
  stv.setOptions(
      weka.core.Utils.splitOptions(
          "-R first-last -W 1000 "
              + "-prune-rate -1.0 -N 0 "
              + "-stemmer weka.core.stemmers.NullStemmer -M 1 "
              + "-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\r\\n\\t.,;:\\\'\\\"()?!\""));
  stv.setInputFormat(instances);
  instances = Filter.useFilter(instances, stv);
  instances.setClassIndex(0);

  Evaluation eval = new Evaluation(instances);
  eval.crossValidateModel(this.classifier, instances, 10, new Random(1));
  System.out.println(eval.toSummaryString());
  System.out.println(eval.toMatrixString());
}
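One caveat with the method above: StringToWordVector is fitted on the full dataset before crossValidateModel runs, so each fold's vocabulary and document frequencies have already seen that fold's test instances. A sketch of the leak-free variant with weka.classifiers.meta.FilteredClassifier, which refits the filter inside each training fold (reusing the stv and classifier fields from above):

// Sketch: cross-validate with the filter re-fitted on each training fold.
FilteredClassifier fc = new FilteredClassifier();
fc.setFilter(stv);                 // the StringToWordVector configured above
fc.setClassifier(this.classifier); // the base learner

Instances raw = new DataSource(traindata).getDataSet();
raw.setClassIndex(0); // class attribute position as in the original snippet

Evaluation eval = new Evaluation(raw);
eval.crossValidateModel(fc, raw, 10, new Random(1));
System.out.println(eval.toSummaryString());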
/** Outputs some data about the classifier. */
public String toString() {
  StringBuffer result = new StringBuffer();
  result.append("Weka - Demo\n===========\n\n");
  result.append(
      "Classifier...: "
          + m_Classifier.getClass().getName()
          + " "
          + Utils.joinOptions(m_Classifier.getOptions())
          + "\n");
  if (m_Filter instanceof OptionHandler)
    result.append(
        "Filter.......: "
            + m_Filter.getClass().getName()
            + " "
            + Utils.joinOptions(((OptionHandler) m_Filter).getOptions())
            + "\n");
  else
    result.append("Filter.......: " + m_Filter.getClass().getName() + "\n");
  result.append("Training file: " + m_TrainingFile + "\n");
  result.append("\n");

  result.append(m_Classifier.toString() + "\n");
  result.append(m_Evaluation.toSummaryString() + "\n");
  try {
    result.append(m_Evaluation.toMatrixString() + "\n");
  } catch (Exception e) {
    e.printStackTrace();
  }
  try {
    result.append(m_Evaluation.toClassDetailsString() + "\n");
  } catch (Exception e) {
    e.printStackTrace();
  }

  return result.toString();
}
public static void execSVM(String expName) {
  try {
    FileWriter outFile = new FileWriter(expName + "-SVM.results");
    PrintWriter out = new PrintWriter(outFile);
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;

    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + expName + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t" + expName + "***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("4-Generate classifier " + dateFormat.format(new Date()));

    Classifier cls = null;
    DataSource sourceTrain = new DataSource(expName + "-train.arff");
    Instances dataTrain = sourceTrain.getDataSet();
    if (dataTrain.classIndex() == -1)
      dataTrain.setClassIndex(dataTrain.numAttributes() - 1);

    // Train the classifier
    // cls = new weka.classifiers.functions.LibSVM();
    int clase = dataTrain.numAttributes() - 1;
    cls = new weka.classifiers.bayes.ComplementNaiveBayes();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);

    ObjectOutputStream oos =
        new ObjectOutputStream(new FileOutputStream(expName + "-SVM.classifier"));
    oos.writeObject(cls);
    oos.flush();
    oos.close();

    DataSource sourceTest = new DataSource(expName + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    dataTest.setClassIndex(clase);

    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);

    // Compute macro-averaged precision, recall and F-measure, and print the
    // confusion matrices
    float precision = 0;
    float recall = 0;
    float fmeasure = 0;
    int topeTopics = 8;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;

    System.out.println("++++++++++++++ CNB ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("++++++++++++++ CNB ++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");

    // Another classifier: ZeroR
    cls = new weka.classifiers.rules.ZeroR();
    dataTrain.setClassIndex(clase);
    cls.buildClassifier(dataTrain);
    eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;

    System.out.println("++++++++++++++ ZEROR ++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
out.println("++++++++++++++ ZEROR ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); // OTRO CLASIFICADOR J48 /* cls = new weka.classifiers.trees.J48(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ J48 ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ J48 ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); //OTRO SMO cls = new weka.classifiers.functions.SMO(); dataTrain.setClassIndex(clase); cls.buildClassifier(dataTrain); eval = new Evaluation(dataTest); eval.evaluateModel(cls, dataTest); precision=0; recall=0; fmeasure=0; for(int ind=0; ind<topeTopics; ind++) { precision += eval.precision(ind); recall += eval.recall(ind); fmeasure += eval.fMeasure(ind); } precision = precision / topeTopics; recall = recall / topeTopics; fmeasure = fmeasure / topeTopics; System.out.println("++++++++++++++ SMO ++++++++++++++++++++"); System.out.println(eval.toMatrixString()); System.out.println("+++++++++++++++++++++++++++++++++++++++"); System.out.printf("Precision: %.3f\n", precision); System.out.printf("Recall: %.3f\n", recall); System.out.printf("F-measure: %.3f\n", fmeasure); System.out.println("***************************************"); out.println("++++++++++++++ SMO ++++++++++++++++++++"); out.println(eval.toMatrixString()); out.println("+++++++++++++++++++++++++++++++++++++++"); out.printf("Precision: %.3f\n", precision); out.printf("Recall: %.3f\n", recall); out.printf("F-measure: %.3f\n", fmeasure); out.println("***************************************"); */ out.flush(); out.close(); dataTest.delete(); dataTrain.delete(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void exec(PrintWriter printer) {
  try {
    FileWriter outFile = null;
    PrintWriter out = null;
    if (printer == null) {
      outFile = new FileWriter(id + ".results");
      out = new PrintWriter(outFile);
    } else
      out = printer;

    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    ProcessTweets tweetsProcessor = null;

    System.out.println("***************************************");
    System.out.println("***\tEXECUTING TEST\t" + id + "***");
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println("Train size:" + traincorpus.size());
    System.out.println("Test size:" + testcorpus.size());
    out.println("***************************************");
    out.println("***\tEXECUTING TEST\t***");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println("Train size:" + traincorpus.size());
    out.println("Test size:" + testcorpus.size());

    String cloneID = "";
    boolean clonar = false;

    if (baseline) {
      System.out.println("***************************************");
      System.out.println("***\tEXECUTING TEST BASELINE\t***");
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.println("Train size:" + traincorpus.size());
      System.out.println("Test size:" + testcorpus.size());
      out.println("***************************************");
      out.println("***\tEXECUTING TEST BASELINE\t***");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.println("Train size:" + traincorpus.size());
      out.println("Test size:" + testcorpus.size());

      BaselineClassifier base = new BaselineClassifier(testcorpus, 8);
      precision = base.getPrecision();
      recall = base.getRecall();
      fmeasure = base.getFmeasure();
      System.out.println("+++++++++++++++++++++++++++++++++++++++");
      System.out.printf("Precision: %.3f\n", precision);
      System.out.printf("Recall: %.3f\n", recall);
      System.out.printf("F-measure: %.3f\n", fmeasure);
      System.out.println("***************************************");
      out.println("+++++++++++++++++++++++++++++++++++++++");
      out.printf("Precision: %.3f\n", precision);
      out.printf("Recall: %.3f\n", recall);
      out.printf("F-measure: %.3f\n", fmeasure);
      out.println("***************************************");
      out.flush();
      out.close();
      return;
    } else {
      System.out.println("Stemming: " + stemming);
      System.out.println("Lematization:" + lematization);
      System.out.println("URLs:" + urls);
      System.out.println("Hashtags:" + hashtags);
      System.out.println("Mentions:" + mentions);
      System.out.println("Unigrams:" + unigrams);
      System.out.println("Bigrams:" + bigrams);
      System.out.println("TF:" + tf);
      System.out.println("TF-IDF:" + tfidf);
      out.println("Stemming: " + stemming);
      out.println("Lematization:" + lematization);
      out.println("URLs:" + urls);
      out.println("Hashtags:" + hashtags);
      out.println("Mentions:" + mentions);
      out.println("Unigrams:" + unigrams);
      out.println("Bigrams:" + bigrams);
      out.println("TF:" + tf);
      out.println("TF-IDF:" + tfidf);
    }

    // If the tweets have already been processed, skip reprocessing them
    System.out.println("1-Process tweets " + dateFormat.format(new Date()));
    out.println("1-Process tweets " + dateFormat.format(new Date()));
    List<ProcessedTweet> train = null;
    String[] ids = id.split("-");
    cloneID = ids[0] + "-" + (Integer.valueOf(ids[1]) + 6);
    if (((Integer.valueOf(ids[1]) / 6) % 2) == 0)
      clonar = true;

    if (new File(id + "-train.ptweets").exists()) {
      train = ProcessedTweetSerialization.fromFile(id + "-train.ptweets");
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      if (lematization) {
        tweetsProcessor.doLematization(train);
      }
      if (stemming) {
        tweetsProcessor.doStemming(train);
      }
    } else {
      tweetsProcessor =
          new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      // The training flag was added to tell apart the languages of the URLs
      // in the parallel corpus
      // tweetsProcessor.setTraining(true);
      train = tweetsProcessor.processTweets(traincorpus);
      // tweetsProcessor.setTraining(false);
      ProcessedTweetSerialization.toFile(id + "-train.ptweets", train);
      /*
      if (clonar) {
        File f = new File(id + "-train.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[] {
          StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES
        };
        Files.copy(p, new File(cloneID + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 12) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 18) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 24) + "-train.ptweets").toPath(), options);
        Files.copy(p, new File(ids[0] + "-" + (Integer.valueOf(ids[1]) + 30) + "-train.ptweets").toPath(), options);
      }
      */
    }

    // Build the bags of words. As before, don't re-create them if they already exist.
    System.out.println("2-Fill topics " + dateFormat.format(new Date()));
    out.println("2-Fill topics " + dateFormat.format(new Date()));
    TopicsList topics = null;
    if (new File(id + ".topics").exists()) {
      topics = TopicsSerialization.fromFile(id + ".topics");
      if (tf)
        topics.setSelectionFeature(TopicDesc.TERM_TF);
      else
        topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      topics.prepareTopics();
    } else {
      topics = new TopicsList();
      if (tf)
        topics.setSelectionFeature(TopicDesc.TERM_TF);
      else
        topics.setSelectionFeature(TopicDesc.TERM_TF_IDF);
      System.out.println("Filling topics " + dateFormat.format(new Date()));
      topics.fillTopics(train);
      System.out.println("Preparing topics " + dateFormat.format(new Date()));
      // Serialization has to happen before preparation; otherwise the tf and
      // tf-idf values cannot be computed
      System.out.println("Serializing topics " + dateFormat.format(new Date()));
      /*
      if (clonar) {
        TopicsSerialization.toFile(cloneID + ".topics", topics);
      }
      */
      topics.prepareTopics();
      TopicsSerialization.toFile(id + ".topics", topics);
    }

    System.out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    out.println("3-Generate arff train file " + dateFormat.format(new Date()));
    // If the arff file does not exist, create it;
    // otherwise reuse the work done so far
    if (!new File(id + "-train.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-train.arff");
      int tope = traincorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, train.get(indTweet));
      }
      bw.flush();
      bw.close();
    }

    // Now process the test data
    System.out.println("5-build test dataset " + dateFormat.format(new Date()));
    out.println("5-build test dataset " + dateFormat.format(new Date()));
    List<ProcessedTweet> test = null;
    if (new File(id + "-test.ptweets").exists())
      test = ProcessedTweetSerialization.fromFile(id + "-test.ptweets");
    else {
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      test = tweetsProcessor.processTweets(testcorpus);
      ProcessedTweetSerialization.toFile(id + "-test.ptweets", test);
      /*
      if (clonar) {
        File f = new File(id + "-test.ptweets");
        Path p = f.toPath();
        CopyOption[] options = new CopyOption[] {
          StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES
        };
        Files.copy(p, new File(cloneID + "-test.ptweets").toPath(), options);
      }
      */
    }

    // If the arff file does not exist, create it; otherwise reuse the work done so far
    if (!new File(id + "-test.arff").exists()) {
      BufferedWriter bw = topics.generateArffHeader(id + "-test.arff");
      int tope = testcorpus.size();
      if (tweetsProcessor == null)
        tweetsProcessor =
            new ProcessTweets(stemming, lematization, urls, hashtags, mentions, unigrams, bigrams);
      for (int indTweet = 0; indTweet < tope; indTweet++) {
        topics.generateArffVector(bw, test.get(indTweet));
      }
      bw.flush();
      bw.close();
    }

    int topeTopics = topics.getTopicsList().size();
    topics.getTopicsList().clear();

    // Build the classifier.
    // FJRM 25-08-2013: moved here to try to free the topics' memory earlier
    System.out.println("4-Generate classifier " + dateFormat.format(new Date()));
    out.println("4-Generate classifier " + dateFormat.format(new Date()));
    Classifier cls = null;
    DataSource sourceTrain = null;
    Instances dataTrain = null;
    if (new File(id + "-MNB.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-MNB.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      sourceTrain = new DataSource(id + "-train.arff");
      dataTrain = sourceTrain.getDataSet();
      if (dataTrain.classIndex() == -1)
        dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      // Train the classifier
      cls = new weka.classifiers.bayes.NaiveBayesMultinomial();
      int clase = dataTrain.numAttributes() - 1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);

      ObjectOutputStream oos =
          new ObjectOutputStream(new FileOutputStream(id + "-MNB.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      // data.delete(); // not deleted, so the SVM can reuse it
    }

    // Now evaluate the classifier on the test data
    System.out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    out.println("6-Evaluate classifier MNB " + dateFormat.format(new Date()));
    DataSource sourceTest = new DataSource(id + "-test.arff");
    Instances dataTest = sourceTest.getDataSet();
    int clase = dataTest.numAttributes() - 1;
    dataTest.setClassIndex(clase);
    Evaluation eval = new Evaluation(dataTest);
    eval.evaluateModel(cls, dataTest);
    // Compute precision, recall and F-measure, and print the confusion matrices
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;

    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");

    /* DO NOT DELETE
    System.out.println("7-Evaluate classifier SVM" + dateFormat.format(new Date()));
    out.println("7-Evaluate classifier SVM" + dateFormat.format(new Date()));
    if (new File(id + "-SVM.classifier").exists()) {
      ObjectInputStream ois = new ObjectInputStream(new FileInputStream(id + "-SVM.classifier"));
      cls = (Classifier) ois.readObject();
      ois.close();
    } else {
      if (dataTrain == null) {
        sourceTrain = new DataSource(id + "-train.arff");
        dataTrain = sourceTrain.getDataSet();
        if (dataTrain.classIndex() == -1)
          dataTrain.setClassIndex(dataTrain.numAttributes() - 1);
      }
      // Train the classifier
      cls = new weka.classifiers.functions.LibSVM();
      clase = dataTrain.numAttributes() - 1;
      dataTrain.setClassIndex(clase);
      cls.buildClassifier(dataTrain);
      ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(id + "-SVM.classifier"));
      oos.writeObject(cls);
      oos.flush();
      oos.close();
      dataTrain.delete();
    }
    eval.evaluateModel(cls, dataTest);
    precision = 0;
    recall = 0;
    fmeasure = 0;
    for (int ind = 0; ind < topeTopics; ind++) {
      precision += eval.precision(ind);
      recall += eval.recall(ind);
      fmeasure += eval.fMeasure(ind);
    }
    precision = precision / topeTopics;
    recall = recall / topeTopics;
    fmeasure = fmeasure / topeTopics;
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.println(eval.toMatrixString());
    System.out.println("+++++++++++++++++++++++++++++++++++++++");
    System.out.printf("Precision: %.3f\n", precision);
    System.out.printf("Recall: %.3f\n", recall);
    System.out.printf("F-measure: %.3f\n", fmeasure);
    System.out.println("***************************************");
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.println(eval.toMatrixString());
    out.println("+++++++++++++++++++++++++++++++++++++++");
    out.printf("Precision: %.3f\n", precision);
    out.printf("Recall: %.3f\n", recall);
    out.printf("F-measure: %.3f\n", fmeasure);
    out.println("***************************************");
    */

    System.out.println("Done " + dateFormat.format(new Date()));
    out.println("Done " + dateFormat.format(new Date()));
    if (printer == null) {
      out.flush();
      out.close();
    }

    // Attempt to free memory
    if (dataTrain != null)
      dataTrain.delete();
    if (dataTest != null)
      dataTest.delete();
    if (train != null)
      train.clear();
    if (test != null)
      test.clear();
    if (topics != null) {
      topics.getTopicsList().clear();
      topics = null;
    }
    if (dataTest != null)
      dataTest.delete();
    if (cls != null)
      cls = null;
    if (tweetsProcessor != null)
      tweetsProcessor = null;
    System.gc();
  }
  catch (Exception e) {
    e.printStackTrace();
  }
}
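The per-class averaging loop above appears verbatim for every classifier in this section. A hypothetical helper (the name macroAverages is illustrative, not part of the original code) that computes the same macro-averaged triple once:

// Hypothetical helper: macro-averaged precision, recall and F-measure over
// the first numClasses class indices of a finished Evaluation.
static double[] macroAverages(Evaluation eval, int numClasses) {
  double precision = 0, recall = 0, fmeasure = 0;
  for (int ind = 0; ind < numClasses; ind++) {
    precision += eval.precision(ind);
    recall += eval.recall(ind);
    fmeasure += eval.fMeasure(ind);
  }
  return new double[] {
    precision / numClasses, recall / numClasses, fmeasure / numClasses
  };
}

// Usage: double[] prf = macroAverages(eval, topeTopics);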
public static void main(String[] args) {
  if (args.length < 1) {
    System.out.println("usage: C4_5TweetTopicCategorization <root_path>");
    System.exit(-1);
  }

  String rootPath = args[0];
  File dataFolder = new File(rootPath + "/data");
  String resultFolderPath = rootPath + "/results/C4_5/";

  CrisisMailer crisisMailer = CrisisMailer.getCrisisMailer();
  Logger logger = Logger.getLogger(C4_5TweetTopicCategorization.class);
  PropertyConfigurator.configure(Constants.LOG4J_PROPERTIES_FILE_PATH);

  File resultFolder = new File(resultFolderPath);
  if (!resultFolder.exists())
    resultFolder.mkdir();

  CSVLoader csvLoader = new CSVLoader();

  try {
    for (File dataSetName : dataFolder.listFiles()) {
      Instances data = null;
      try {
        csvLoader.setSource(dataSetName);
        csvLoader.setStringAttributes("2");
        data = csvLoader.getDataSet();
      } catch (IOException ioe) {
        logger.error(ioe);
        crisisMailer.sendEmailAlert(ioe);
        System.exit(-1);
      }

      data.setClassIndex(data.numAttributes() - 1);
      data.deleteWithMissingClass();

      Instances vectorizedData = null;
      StringToWordVector stringToWordVectorFilter = new StringToWordVector();
      try {
        stringToWordVectorFilter.setInputFormat(data);
        stringToWordVectorFilter.setAttributeIndices("2");
        stringToWordVectorFilter.setIDFTransform(true);
        stringToWordVectorFilter.setLowerCaseTokens(true);
        stringToWordVectorFilter.setOutputWordCounts(false);
        stringToWordVectorFilter.setUseStoplist(true);
        vectorizedData = Filter.useFilter(data, stringToWordVectorFilter);
        vectorizedData.deleteAttributeAt(0);
        // System.out.println(vectorizedData);
      } catch (Exception exception) {
        logger.error(exception);
        crisisMailer.sendEmailAlert(exception);
        System.exit(-1);
      }

      J48 j48Classifier = new J48();
      /*
      FilteredClassifier filteredClassifier = new FilteredClassifier();
      filteredClassifier.setFilter(stringToWordVectorFilter);
      filteredClassifier.setClassifier(j48Classifier);
      */

      try {
        Evaluation eval = new Evaluation(vectorizedData);
        eval.crossValidateModel(
            j48Classifier, vectorizedData, 5, new Random(System.currentTimeMillis()));

        FileOutputStream resultOutputStream =
            new FileOutputStream(new File(resultFolderPath + dataSetName.getName()));
        resultOutputStream.write(eval.toSummaryString("=== Summary ===", false).getBytes());
        resultOutputStream.write(eval.toMatrixString().getBytes());
        resultOutputStream.write(eval.toClassDetailsString().getBytes());
        resultOutputStream.close();
      } catch (Exception exception) {
        logger.error(exception);
        crisisMailer.sendEmailAlert(exception);
        System.exit(-1);
      }
    }
  } catch (Exception exception) {
    logger.error(exception);
    crisisMailer.sendEmailAlert(exception);
    System.exit(-1);
  }
}
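Seeding the cross-validation with System.currentTimeMillis() makes every run shuffle the folds differently. If reproducible results are wanted, a fixed seed is the usual substitution:

// Fixed seed: fold assignment, and therefore every statistic, is reproducible.
Evaluation eval = new Evaluation(vectorizedData);
eval.crossValidateModel(j48Classifier, vectorizedData, 5, new Random(1));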
public double getLiblinear(String path, String train, String test) {
  // accuracy of this run
  double accuracy = 0.0;
  try {
    LibLINEAR c1 = new LibLINEAR();
    // Optionally configure the learner, e.g.:
    // String[] options = weka.core.Utils.splitOptions("-S 1 -C 1.0 -E 0.001 -B 0");
    // c1.setOptions(options);

    ArffLoader atf = new ArffLoader();
    File TraininputFile = new File(train);
    atf.setFile(TraininputFile); // training corpus file
    Instances instancesTrain = atf.getDataSet(); // load the training set
    instancesTrain.setClassIndex(instancesTrain.numAttributes() - 1);

    File TestinputFile = new File(test);
    atf.setFile(TestinputFile); // test corpus file
    Instances instancesTest = atf.getDataSet(); // load the test set
    // set the index of the class attribute (the first attribute has index 0);
    // instancesTest.numAttributes() gives the total number of attributes
    instancesTest.setClassIndex(instancesTest.numAttributes() - 1);

    c1.buildClassifier(instancesTrain); // train
    Evaluation eval = new Evaluation(instancesTrain);
    eval.evaluateModel(c1, instancesTest);
    // eval.crossValidateModel(c1, instancesTrain, 10, new Random(1));

    File newfile = new File(path + "OutLiblinear_temp" + ".txt");
    BufferedWriter bufferedWriter =
        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(newfile), "utf-8"));
    bufferedWriter.write(eval.toSummaryString() + "\r\n");
    bufferedWriter.write(eval.toClassDetailsString() + "\r\n");
    bufferedWriter.write(eval.toMatrixString() + "\r\n");
    bufferedWriter.flush();
    bufferedWriter.close();

    BufferedReader bufferedReader = new BufferedReader(new FileReader(newfile));
    String[] splitLineString = new String[5];
    while (bufferedReader.ready()) {
      bufferedReader.readLine();
      String lineString = bufferedReader.readLine();
      splitLineString = lineString.split(" ");
      System.out.println(splitLineString[4]);
      break;
    }
    bufferedReader.close();

    // recover the classification accuracy from the summary text
    String tempLine;
    BufferedReader tempBF = new BufferedReader(new FileReader(newfile));
    while (tempBF.ready()) {
      tempLine = tempBF.readLine();
      if (tempLine.contains("Correctly Classified Instances")) {
        tempLine = tempLine.substring(tempLine.lastIndexOf(".") - 2, tempLine.lastIndexOf(" "));
        accuracy = Double.parseDouble(tempLine);
        break;
      }
    }
    tempBF.close();
  } catch (Exception e) {
    System.out.println("Can't run LibLINEAR from Weka.");
  }
  return accuracy;
}
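Writing the summary to a temporary file and re-parsing the "Correctly Classified Instances" line is fragile; the same figure is available directly on the Evaluation object:

// Equivalent to the value parsed back out of the summary text, without file I/O.
double accuracy = eval.pctCorrect(); // percentage of correctly classified instances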
/**
 * Accepts and processes a classifier encapsulated in an incremental classifier event.
 *
 * @param ce an <code>IncrementalClassifierEvent</code> value
 */
@Override
public void acceptClassifier(final IncrementalClassifierEvent ce) {
  try {
    if (ce.getStatus() == IncrementalClassifierEvent.NEW_BATCH) {
      m_throughput = new StreamThroughput(statusMessagePrefix());
      m_throughput.setSamplePeriod(m_statusFrequency);

      // m_eval = new Evaluation(ce.getCurrentInstance().dataset());
      m_eval = new Evaluation(ce.getStructure());
      m_eval.useNoPriors();
      m_dataLegend = new Vector();
      m_reset = true;
      m_dataPoint = new double[0];
      Instances inst = ce.getStructure();
      System.err.println("NEW BATCH");
      m_instanceCount = 0;

      if (m_windowSize > 0) {
        m_window = new LinkedList<Instance>();
        m_windowEval = new Evaluation(ce.getStructure());
        m_windowEval.useNoPriors();
        m_windowedPreds = new LinkedList<double[]>();

        if (m_logger != null) {
          m_logger.logMessage(
              statusMessagePrefix()
                  + "[IncrementalClassifierEvaluator] Chart output using windowed "
                  + "evaluation over " + m_windowSize + " instances");
        }
      }

      /*
       * if (m_logger != null) { m_logger.statusMessage(statusMessagePrefix()
       * + "IncrementalClassifierEvaluator: started processing...");
       * m_logger.logMessage(statusMessagePrefix() +
       * " [IncrementalClassifierEvaluator]" + statusMessagePrefix() +
       * " started processing..."); }
       */
    } else {
      Instance inst = ce.getCurrentInstance();
      if (inst != null) {
        m_throughput.updateStart();
        m_instanceCount++;
        // if (inst.attribute(inst.classIndex()).isNominal()) {
        double[] dist = ce.getClassifier().distributionForInstance(inst);
        double pred = 0;
        if (!inst.isMissing(inst.classIndex())) {
          if (m_outputInfoRetrievalStats) {
            // store predictions so AUC etc. can be output
            m_eval.evaluateModelOnceAndRecordPrediction(dist, inst);
          } else {
            m_eval.evaluateModelOnce(dist, inst);
          }

          if (m_windowSize > 0) {
            m_windowEval.evaluateModelOnce(dist, inst);
            m_window.addFirst(inst);
            m_windowedPreds.addFirst(dist);

            if (m_instanceCount > m_windowSize) {
              // "forget" the oldest prediction
              Instance oldest = m_window.removeLast();
              double[] oldDist = m_windowedPreds.removeLast();
              oldest.setWeight(-oldest.weight());
              m_windowEval.evaluateModelOnce(oldDist, oldest);
              oldest.setWeight(-oldest.weight());
            }
          }
        } else {
          pred = ce.getClassifier().classifyInstance(inst);
        }

        if (inst.classIndex() >= 0) {
          // need to check that the class is not missing
          if (inst.attribute(inst.classIndex()).isNominal()) {
            if (!inst.isMissing(inst.classIndex())) {
              if (m_dataPoint.length < 2) {
                m_dataPoint = new double[3];
                m_dataLegend.addElement("Accuracy");
                m_dataLegend.addElement("RMSE (prob)");
                m_dataLegend.addElement("Kappa");
              }
              // int classV = (int) inst.value(inst.classIndex());

              if (m_windowSize > 0) {
                m_dataPoint[1] = m_windowEval.rootMeanSquaredError();
                m_dataPoint[2] = m_windowEval.kappa();
              } else {
                m_dataPoint[1] = m_eval.rootMeanSquaredError();
                m_dataPoint[2] = m_eval.kappa();
              }
              // int maxO = Utils.maxIndex(dist);
              // if (maxO == classV) {
              //   dist[classV] = -1;
              //   maxO = Utils.maxIndex(dist);
              // }
              // m_dataPoint[1] -= dist[maxO];
            } else {
              if (m_dataPoint.length < 1) {
                m_dataPoint = new double[1];
                m_dataLegend.addElement("Confidence");
              }
            }

            double primaryMeasure = 0;
            if (!inst.isMissing(inst.classIndex())) {
              if (m_windowSize > 0) {
                primaryMeasure = 1.0 - m_windowEval.errorRate();
              } else {
                primaryMeasure = 1.0 - m_eval.errorRate();
              }
            } else {
              // record confidence as the primary measure
              // (another possibility would be entropy of
              // the distribution, or perhaps average
              // confidence)
              primaryMeasure =
                  dist[Utils.maxIndex(dist)];
            }
            // double[] dataPoint = new double[1];
            m_dataPoint[0] = primaryMeasure;
            // double min = 0; double max = 100;
            /*
             * ChartEvent e = new
             * ChartEvent(IncrementalClassifierEvaluator.this, m_dataLegend,
             * min, max, dataPoint);
             */
            m_ce.setLegendText(m_dataLegend);
            m_ce.setMin(0);
            m_ce.setMax(1);
            m_ce.setDataPoint(m_dataPoint);
            m_ce.setReset(m_reset);
            m_reset = false;
          } else {
            // numeric class
            if (m_dataPoint.length < 1) {
              m_dataPoint = new double[1];
              if (inst.isMissing(inst.classIndex())) {
                m_dataLegend.addElement("Prediction");
              } else {
                m_dataLegend.addElement("RMSE");
              }
            }
            if (!inst.isMissing(inst.classIndex())) {
              double update;
              if (!inst.isMissing(inst.classIndex())) {
                if (m_windowSize > 0) {
                  update = m_windowEval.rootMeanSquaredError();
                } else {
                  update = m_eval.rootMeanSquaredError();
                }
              } else {
                update = pred;
              }
              m_dataPoint[0] = update;
              if (update > m_max) {
                m_max = update;
              }
              if (update < m_min) {
                m_min = update;
              }
            }

            m_ce.setLegendText(m_dataLegend);
            m_ce.setMin((inst.isMissing(inst.classIndex()) ? m_min : 0));
            m_ce.setMax(m_max);
            m_ce.setDataPoint(m_dataPoint);
            m_ce.setReset(m_reset);
            m_reset = false;
          }
          notifyChartListeners(m_ce);
        }
        m_throughput.updateEnd(m_logger);
      }

      if (ce.getStatus() == IncrementalClassifierEvent.BATCH_FINISHED || inst == null) {
        if (m_logger != null) {
          m_logger.logMessage(
              "[IncrementalClassifierEvaluator]" + statusMessagePrefix() + " Finished processing.");
        }
        m_throughput.finished(m_logger);

        // save memory if using windowed evaluation for charting
        m_windowEval = null;
        m_window = null;
        m_windowedPreds = null;

        if (m_textListeners.size() > 0) {
          String textTitle = ce.getClassifier().getClass().getName();
          textTitle = textTitle.substring(textTitle.lastIndexOf('.') + 1, textTitle.length());
          String results =
              "=== Performance information ===\n\n"
                  + "Scheme: " + textTitle + "\n"
                  + "Relation: " + m_eval.getHeader().relationName() + "\n\n"
                  + m_eval.toSummaryString();
          if (m_eval.getHeader().classIndex() >= 0
              && m_eval.getHeader().classAttribute().isNominal()
              && (m_outputInfoRetrievalStats)) {
            results += "\n" + m_eval.toClassDetailsString();
          }
          if (m_eval.getHeader().classIndex() >= 0
              && m_eval.getHeader().classAttribute().isNominal()) {
            results += "\n" + m_eval.toMatrixString();
          }
          textTitle = "Results: " + textTitle;
          TextEvent te = new TextEvent(this, results, textTitle);
          notifyTextListeners(te);
        }
      }
    }
  } catch (Exception ex) {
    if (m_logger != null) {
      m_logger.logMessage(
          "[IncrementalClassifierEvaluator]"
              + statusMessagePrefix()
              + " Error processing prediction "
              + ex.getMessage());
      m_logger.statusMessage(
          statusMessagePrefix() + "ERROR: problem processing prediction (see log for details)");
    }
    ex.printStackTrace();
    stop();
  }
}
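The evaluator above drives Weka's per-instance evaluation API from bean events. For reference, a minimal standalone sketch of the same test-then-train (prequential) pattern with an updateable learner; the ARFF path is a placeholder:

import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class PrequentialDemo {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("stream.arff").getDataSet(); // placeholder path
    data.setClassIndex(data.numAttributes() - 1);
    data.randomize(new Random(1));

    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(new Instances(data, 0)); // initialise from the header only

    Evaluation eval = new Evaluation(new Instances(data, 0));
    eval.useNoPriors(); // as in the evaluator above: no priors from an empty header

    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      // test first: score the current model on the incoming instance...
      eval.evaluateModelOnce(nb.distributionForInstance(inst), inst);
      // ...then train: fold the instance into the model
      nb.updateClassifier(inst);
    }
    System.out.println(eval.toSummaryString());
  }
}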